In [40]:
from pymongo import MongoClient
import pandas as pd
import os
import json
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans

In [41]:
pip install sklearn

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\15125\anaconda3\python.exe -m pip install --upgrade pip' command.


In [42]:
client = MongoClient(port=27017)
client = MongoClient("mongodb://localhost:27017/")
mydb = client["MemeStock"]

In [43]:
def CreateDataframeFromMongoDB(database,collection):
    mycollection = database[collection]
    allrecord = mycollection.find()
    df = pd.DataFrame(list(allrecord))
    return(df)

In [44]:
SPY_Hourly_df = CreateDataframeFromMongoDB(mydb,"table_spystock")
TslaTweet_df = CreateDataframeFromMongoDB(mydb,"table_TSLA")
TSLA_Hourly_df = CreateDataframeFromMongoDB(mydb,"table_tslastock")

In [45]:
TslaTweet_df= pd.read_csv(Path("resources","#TSLA_tweet_7days_prior_hist_2021_11_7_perhour.csv"))
SPY_Hourly_df= pd.read_csv(Path("resources","SPY_stock_hist_hourly_2021_11_7.csv"))
TSLA_Hourly_df= pd.read_csv(Path("resources","TSLA_stock_hist_hourly_2021_11_7.csv"))

In [46]:
# condense DFs to only columns that will be require 
TSLA_Hourly_df=TSLA_Hourly_df[["date_UTC","Percent_Day_Change","Volume"]]
TslaTweet_df=TslaTweet_df[["date_UTC","tweet_counts"]]
SPY_Hourly_df=SPY_Hourly_df[["date_UTC","Percent_Day_Change","Volume"]]

In [47]:
#rename Columns
TSLA_Hourly_df=TSLA_Hourly_df.rename(columns={'Percent_Day_Change': 'Percent_Day_Change_TSLA', 'Volume': 'Volume_TSLA'})
SPY_Hourly_df=SPY_Hourly_df.rename(columns={'Percent_Day_Change': 'Percent_Day_Change_SPY', 'Volume': 'Volume_SPY'})
#make all DFs match d type for date_UTC
TSLA_Hourly_df['date_UTC']=TSLA_Hourly_df['date_UTC'].astype({'date_UTC': 'datetime64[ns]'})
SPY_Hourly_df['date_UTC']=SPY_Hourly_df['date_UTC'].astype({'date_UTC': 'datetime64[ns]'})
TslaTweet_df['date_UTC']=TslaTweet_df['date_UTC'].astype({'date_UTC': 'datetime64[ns]'})

In [48]:

#create the 3 combine df to one DF, encoded our Y value
combined_all_df=TSLA_Hourly_df.merge(SPY_Hourly_df, how='left', on="date_UTC")
combined_all_df=combined_all_df.merge(TslaTweet_df, how='left', on="date_UTC")
combined_all_df=combined_all_df.dropna(axis=0, how='any')
combined_all_df["Percent_Day_Change_TSLA_encode"]=combined_all_df["Percent_Day_Change_TSLA"].apply(lambda row: 1 if row>0 else 0)
combined_all_df_encode=combined_all_df.drop(["Percent_Day_Change_TSLA"], axis = 1)
combined_all_df_encode=combined_all_df_encode.drop(["date_UTC"], axis = 1)

In [49]:
#create 2 new DFs to for more analysis
combined_TSLA_SPY_encode=combined_all_df_encode.drop(["tweet_counts"], axis = 1)
combined_TSLA_tweets_encode=combined_all_df_encode.drop(['Volume_SPY','Percent_Day_Change_SPY',"Volume_TSLA"], axis = 1)

In [50]:
combined_all_df.to_csv("resources/combined_all_df.csv")
combined_all_df_encode.to_csv("resources/combined_all_df_encode.csv")

In [51]:
#All data is in float, it can now be scaled
def logistical_model_report(dfencoded):
    data_scaler = StandardScaler()
    df = data_scaler.fit_transform(dfencoded)

    y = dfencoded["Percent_Day_Change_TSLA_encode"]
    X = dfencoded.drop(columns="Percent_Day_Change_TSLA_encode")


    #split data into train
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    #create logistical reg. model
    classifier = LogisticRegression(solver='lbfgs', random_state=1)

    # Train the data
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    #results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
    #matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return report

In [52]:
#All data is in float, it can now be scaled
def logistical_model_results(dfencoded):
    data_scaler = StandardScaler()
    df = data_scaler.fit_transform(dfencoded)

    y = dfencoded["Percent_Day_Change_TSLA_encode"]
    X = dfencoded.drop(columns="Percent_Day_Change_TSLA_encode")


    #split data into train
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    #create logistical reg. model
    classifier = LogisticRegression(solver='lbfgs', random_state=1)

    # Train the data
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
    #matrix = confusion_matrix(y_test, y_pred)
    #report = classification_report(y_test, y_pred)
    return results

In [53]:
combined_all_log_mod_results=logistical_model_results(combined_all_df_encode)
combined_TSL_SPY_log_mod_results=logistical_model_results(combined_TSLA_SPY_encode)
combined_tweet_log_mod_results=logistical_model_results(combined_TSLA_tweets_encode)


In [54]:
combined_all_log_mod_report=logistical_model_report(combined_all_df_encode)
combined_TSL_SPY_log_mod_report=logistical_model_report(combined_TSLA_SPY_encode)
combined_tweet_log_mod_report=logistical_model_report(combined_TSLA_tweets_encode)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
combined_all_log_mod_results

Unnamed: 0,Prediction,Actual
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
5,1,0
6,1,1
7,1,1
8,1,0


In [56]:
print(combined_all_log_mod_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.22      1.00      0.36         2

    accuracy                           0.22         9
   macro avg       0.11      0.50      0.18         9
weighted avg       0.05      0.22      0.08         9



In [57]:
print(combined_TSL_SPY_log_mod_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.22      1.00      0.36         2

    accuracy                           0.22         9
   macro avg       0.11      0.50      0.18         9
weighted avg       0.05      0.22      0.08         9



In [58]:
print(combined_tweet_log_mod_report)

              precision    recall  f1-score   support

           0       0.50      0.14      0.22         7
           1       0.14      0.50      0.22         2

    accuracy                           0.22         9
   macro avg       0.32      0.32      0.22         9
weighted avg       0.42      0.22      0.22         9



In [59]:
combined_all_df_encode

Unnamed: 0,Volume_TSLA,Percent_Day_Change_SPY,Volume_SPY,tweet_counts,Percent_Day_Change_TSLA_encode
14,14824885,-0.247726,13992445,22.0,0
15,7175951,0.043573,5841979,9.0,1
16,6567429,0.096896,3451053,12.0,1
17,7126538,-0.093542,3957074,27.0,1
18,5269389,-0.137178,3724726,31.0,0
19,5371085,0.170086,3669787,52.0,1
20,6931589,0.154558,8267663,83.0,1
21,20277111,0.217287,9882061,131.0,1
22,5034709,0.123587,5962493,99.0,0
23,3998159,-0.01191,5261373,237.0,0


In [60]:
combined_TSLA_tweets_encode

Unnamed: 0,tweet_counts,Percent_Day_Change_TSLA_encode
14,22.0,0
15,9.0,1
16,12.0,1
17,27.0,1
18,31.0,0
19,52.0,1
20,83.0,1
21,131.0,1
22,99.0,0
23,237.0,0
