In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
#from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [2]:
SPY_hourly_df= pd.read_csv(Path('resources/SPY_stock_hist_hourly_2021_11_7.csv'))
Tweet_TSLA_hourly_df = pd.read_csv(Path('resources/#TSLA_tweet_7days_prior_hist_2021_11_7_perhour.csv'))
TSLA_hourly_df = pd.read_csv(Path('resources/TSLA_stock_hist_hourly_2021_11_7.csv'))

In [3]:
# condense DFs to only columns that will be require 
TSLA_hourly_df=TSLA_hourly_df[["date_UTC","Percent_Day_Change","Volume"]]
Tweet_TSLA_hourly_df=Tweet_TSLA_hourly_df[["date_UTC","tweet_counts"]]
SPY_hourly_df=SPY_hourly_df[["date_UTC","Percent_Day_Change","Volume"]]

In [4]:
#rename Columns
TSLA_hourly_df= TSLA_hourly_df.rename(columns={'Percent_Day_Change': 'Percent_Day_Change_TSLA', 'Volume': 'Volume_TSLA'})
SPY_hourly_df=SPY_hourly_df.rename(columns={'Percent_Day_Change': 'Percent_Day_Change_SPY', 'Volume': 'Volume_SPY'})
#make all DFs match d type for date_UTC
TSLA_hourly_df['date_UTC']=TSLA_hourly_df['date_UTC'].astype({'date_UTC': 'datetime64[ns]'})
SPY_hourly_df['date_UTC']=SPY_hourly_df['date_UTC'].astype({'date_UTC': 'datetime64[ns]'})
Tweet_TSLA_hourly_df['date_UTC']=Tweet_TSLA_hourly_df['date_UTC'].astype({'date_UTC': 'datetime64[ns]'})

In [5]:
#create the 3 combine df to one DF, encoded our Y value
combined_all_df=TSLA_hourly_df.merge(SPY_hourly_df, how='left', on="date_UTC")
combined_all_df=combined_all_df.merge(Tweet_TSLA_hourly_df, how='left', on="date_UTC")
combined_all_df=combined_all_df.dropna(axis=0, how='any')
combined_all_df["Percent_Day_Change_TSLA_encode"]=combined_all_df["Percent_Day_Change_TSLA"].apply(lambda row: 1 if row>0 else 0)
combined_all_df_encode=combined_all_df.drop(["Percent_Day_Change_TSLA"], axis = 1)
combined_all_df_encode=combined_all_df_encode.drop(["date_UTC"], axis = 1)

In [6]:
#create 2 new DFs to for more analysis
combined_TSLA_SPY_encode=combined_all_df_encode.drop(["tweet_counts"], axis = 1)
combined_TSLA_tweets_encode=combined_all_df_encode.drop(['Volume_SPY','Percent_Day_Change_SPY',"Volume_TSLA"], axis = 1)

In [9]:
combined_all_df.to_csv("resources/combined_all_df.csv")
combined_all_df_encode.to_csv("resources/combined_all_df_encode.csv")

In [10]:
#All data is in float, it can now be scaled
def logistical_model_report(dfencoded):
    data_scaler = StandardScaler()
    df = data_scaler.fit_transform(dfencoded)

    y = dfencoded["Percent_Day_Change_TSLA_encode"]
    X = dfencoded.drop(columns="Percent_Day_Change_TSLA_encode")


    #split data into train
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    #create logistical reg. model
    classifier = LogisticRegression(solver='lbfgs', random_state=1)

    # Train the data
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    #results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
    #matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return report

In [11]:
#All data is in float, it can now be scaled
def logistical_model_results(dfencoded):
    data_scaler = StandardScaler()
    df = data_scaler.fit_transform(dfencoded)

    y = dfencoded["Percent_Day_Change_TSLA_encode"]
    X = dfencoded.drop(columns="Percent_Day_Change_TSLA_encode")


    #split data into train
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    #create logistical reg. model
    classifier = LogisticRegression(solver='lbfgs', random_state=1)

    # Train the data
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
    #matrix = confusion_matrix(y_test, y_pred)
    #report = classification_report(y_test, y_pred)
    return results

In [16]:
combined_all_log_mod_results=logistical_model_results(combined_all_df_encode)
combined_TSL_SPY_log_mod_results=logistical_model_results(combined_TSLA_SPY_encode)
combined_tweet_log_mod_results=logistical_model_results(combined_TSLA_tweets_encode)

In [18]:
combined_all_log_mod_report=logistical_model_report(combined_all_df_encode)
combined_TSL_SPY_log_mod_report=logistical_model_report(combined_TSLA_SPY_encode)
combined_tweet_log_mod_report=logistical_model_report(combined_TSLA_tweets_encode)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
combined_all_log_mod_results

Unnamed: 0,Prediction,Actual
0,0,1
1,0,1
2,0,0
3,0,0
4,0,0
5,0,1
6,0,0
7,0,0
8,0,1
9,0,0


In [15]:
print(combined_all_log_mod_report)

              precision    recall  f1-score   support

           0       0.56      1.00      0.72         9
           1       0.00      0.00      0.00         7

    accuracy                           0.56        16
   macro avg       0.28      0.50      0.36        16
weighted avg       0.32      0.56      0.40        16



In [19]:
print(combined_TSL_SPY_log_mod_report)

              precision    recall  f1-score   support

           0       0.56      1.00      0.72         9
           1       0.00      0.00      0.00         7

    accuracy                           0.56        16
   macro avg       0.28      0.50      0.36        16
weighted avg       0.32      0.56      0.40        16



In [20]:
print(combined_tweet_log_mod_report)

              precision    recall  f1-score   support

           0       0.56      1.00      0.72         9
           1       0.00      0.00      0.00         7

    accuracy                           0.56        16
   macro avg       0.28      0.50      0.36        16
weighted avg       0.32      0.56      0.40        16



In [21]:
combined_all_df_encode

Unnamed: 0,Volume_TSLA,Percent_Day_Change_SPY,Volume_SPY,tweet_counts,Percent_Day_Change_TSLA_encode
14,8598134.0,0.279358,9861088.0,90.0,0
15,3846857.0,-0.289279,7274648.0,49.0,0
16,3879996.0,0.132167,5452042.0,36.0,1
17,1960785.0,0.025752,3494109.0,27.0,0
18,2038975.0,-0.004289,6397705.0,27.0,0
...,...,...,...,...,...
93,5078236.0,0.075191,5078236.0,45.0,1
94,3597144.0,-0.109161,3597144.0,54.0,0
95,3937415.0,-0.031075,3937415.0,49.0,0
96,4898959.0,0.158616,4898959.0,35.0,1


In [22]:
combined_TSLA_tweets_encode

Unnamed: 0,tweet_counts,Percent_Day_Change_TSLA_encode
14,90.0,0
15,49.0,0
16,36.0,1
17,27.0,0
18,27.0,0
...,...,...
93,45.0,1
94,54.0,0
95,49.0,0
96,35.0,1
