In [4]:
!pip install yfinance



In [5]:
import warnings
warnings.filterwarnings('ignore')

import yfinance as yf
import pandas as pd
import numpy as np
import altair as alt
import project_functions2 as pf

from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# These top two lines are needed to produce altair plots on google colab
# Comment these two lines out if you are running locally
!pip install altair_data_server
alt.data_transformers.enable('data_server')

# These bottem two lines are needed tto produce altair plots on local machine
# Comment these two line out to run on google colab
#alt.renderers.enable('default')
#alt.data_transformers.enable('json')



DataTransformerRegistry.enable('data_server')

In [8]:
stock_list = ['AMZN', 'AAPL', 'FB','GOOG', 'MSFT', 'TSLA']
#stock_list = ['AMZN', 'AAPL', 'FB','GOOG', 'TSLA']
stock_objects = {}
for stock in stock_list:
    stock_objects[stock] = yf.Ticker(stock)

In [9]:
stock_dfs = {}
for key in stock_objects:
    stock_dfs[key] = stock_objects[key].history(period='max')

In [10]:
stock_investing = {}
for key in stock_objects:
    stock_investing[key] = pd.read_csv('/content/drive/MyDrive/SENG474_Project/data/sentiment/investing_'+key+'_sentiment.csv')
    stock_investing[key].set_index('date', inplace=True)
stock_stocks = {}
for key in stock_objects:
    stock_stocks[key] = pd.read_csv('/content/drive/MyDrive/SENG474_Project/data/sentiment/stocks_'+key+'_sentiment.csv')
    stock_stocks[key].set_index('date', inplace=True)

In [None]:
#adf = 'AAPL'
#aapl_sentiment = pd.read_csv('/content/drive/MyDrive/SENG474_Project/data/sentiment/investing_'+adf+'_sentiment.csv')
#aapl_sentiment.set_index('date', inplace=True)
#aapl_sentiment

In [11]:
def combiner(stock_dfs):
    combine_df = None

    for key in stock_dfs:
        if combine_df is not None:
            combine_df = pd.concat([combine_df, stock_dfs[key]])
        else:
            combine_df = stock_dfs[key]

    combine_df.sort_values(by=['Date'], inplace=True)
    return combine_df


In [21]:
def combo_multi_linear_regressor(combine_df, split_time, stock_dfs):
    
    X = combine_df.iloc[:,:-1]
    y = combine_df.iloc[:,-1:]
    #X = stock_df['Days From IPO'].values.reshape(-1, 1)
    #y = stock_df['Close'].values.reshape(-1, 1)
    
    # Does train/Test Split on chosen time
    # Change the -50 to a differnt value to change split point
    split_mark = int(len(combine_df)-(split_time*len(stock_dfs)))
    X_train = X.head(split_mark)
    X_test = X.tail(len(combine_df) - split_mark)
    y_train = y.head(split_mark)
    y_test = y.tail(len(combine_df) - split_mark)
    
    stock_model = LinearRegression().fit(X_train, y_train)
    stock_close_pred = stock_model.predict(X_test)
    stock_train_pred = stock_model.predict(X_train)
    
    columns = []
    for key in stock_dfs:
      columns.append(key + ' Price')
      columns.append(key + ' Prediction')
    #columns = ['Apple Price', 'Apple Prediction', 
    #           'Amazon Price', 'Amazon Prediction',
    #           'Facebook Price', 'Facebook Prediction',
    #           'Google Price', 'Google Prediction',
    #           'Microsoft Price', 'Microsoft Prediction',
    #           'Tesla Price', 'Tesla Prediction']
    
    single_split_mark = int(len(stock_dfs[list(stock_dfs.keys())[0]])-split_time)
    results_df = pd.DataFrame(columns=columns, 
                              index=stock_dfs[list(stock_dfs.keys())[0]].tail(len(stock_dfs[list(stock_dfs.keys())[0]])-single_split_mark).index)

    i = 0
    for key in stock_dfs:
        single_split_mark = int(len(stock_dfs[key])-split_time)
        results_df[columns[i]] = stock_dfs[key].iloc[:,-1:].tail(len(stock_dfs[key]) - single_split_mark)
        results_df[columns[i+1]] = stock_model.predict(stock_dfs[key].iloc[:,:-1].tail(len(stock_dfs[key]) - single_split_mark))
        i += 2
        
    results_df.reset_index(inplace=True)
    results_df = results_df.melt('Date', var_name='Company', value_name='Price')
    # Plots Results
    line_plot = alt.Chart(results_df).mark_line().encode(
        x = 'Date',
        y = 'Price',
        color = 'Company'
    )
    
    
    
    train_score = r2_score(y_train, stock_train_pred)
    print("Training R2 Score: " + str(train_score))
    print()

    
    
    model_score = r2_score(y_test.dropna(), stock_close_pred[:len(y_test.dropna())])
    print()
    print("R2 Score: " + str(model_score))
    
    return line_plot

In [9]:
def neural_net(combine_df, split_time, stock_dfs):
    scaler = MinMaxScaler()
    X = combine_df.iloc[:,:-1]
    y = combine_df.iloc[:,-1:]
    #X = stock_df['Days From IPO'].values.reshape(-1, 1)
    #y = stock_df['Close'].values.reshape(-1, 1)
    
    # Does train/Test Split on last year
    # Change the -50 to a differnt value to change split point
    split_mark = int(len(combine_df)-(split_time*len(stock_dfs)))
    X_train = X.head(split_mark)
    X_test = X.tail(len(combine_df) - split_mark)
    y_train = y.head(split_mark)
    y_test = y.tail(len(combine_df) - split_mark)

    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    stock_nn = MLPRegressor(hidden_layer_sizes = [500, 500, 500], random_state=45).fit(X_train_scaled, y_train)
    
    stock_nn_pred = stock_nn.predict(X_test_scaled)
    stock_nn_train_pred = stock_nn.predict(X_train_scaled)
    
    sell_line = stock_nn_pred*1.1
    buy_line = stock_nn_pred*0.9
    
    # Plots Results

    train_score = r2_score(y_train, stock_nn_train_pred)
    print("Training R2 Score: " + str(train_score))
    print()

    columns = []
    for key in stock_dfs:
      columns.append(key + ' Price')
      columns.append(key + ' Prediction')
    
    single_split_mark = int(len(stock_dfs[list(stock_dfs.keys())[0]])-split_time)
    results_df = pd.DataFrame(columns=columns, 
                              index=stock_dfs[list(stock_dfs.keys())[0]].tail(len(stock_dfs[list(stock_dfs.keys())[0]])-single_split_mark).index)

    i = 0
    for key in stock_dfs:
        single_split_mark = int(len(stock_dfs[key])-split_time)
        results_df[columns[i]] = stock_dfs[key].iloc[:,-1:].tail(len(stock_dfs[key]) - single_split_mark)
        curr_X_train = scaler.transform(stock_dfs[key].iloc[:,:-1].tail(len(stock_dfs[key]) - single_split_mark))
        results_df[columns[i+1]] = stock_nn.predict(curr_X_train)
        i += 2
        
    results_df.reset_index(inplace=True)
    results_df = results_df.melt('Date', var_name='Company', value_name='Price')
    # Plots Results
    line_plot = alt.Chart(results_df).mark_line().encode(
        x = 'Date',
        y = 'Price',
        color = 'Company'
    )
    
    model_score = r2_score(y_test.dropna(), stock_nn_pred[:len(y_test.dropna())])
    print("R2 Score: " + str(model_score))
    
    return line_plot

In [22]:
drop_list = [ 'Volume', 'Dividends', 'Stock Splits',
       '5 Day Open Mean', '5 Day High Mean', '5 Day Low Mean',
       '5 Day Close Mean', '5 Day Volume Mean', '5 Day Open Var',
       '5 Day High Var', '5 Day Low Var', '5 Day Close Var',
       '5 Day Volume Var', '5 Day Dt', 
       '10 Day Open Mean', '10 Day High Mean', '10 Day Low Mean',
       '10 Day Close Mean', '10 Day Volume Mean', '10 Day Open Var',
       '10 Day High Var', '10 Day Low Var', '10 Day Close Var',
       '10 Day Volume Var', '10 Day High', '10 Day Low', '10 Day Dt', 
             '20 Day Open Mean', '20 Day High Mean', '20 Day Low Mean',
       '20 Day Close Mean', '20 Day Volume Mean', '20 Day Open Var',
       '20 Day High Var', '20 Day Low Var', '20 Day Close Var',
       '20 Day Volume Var', '20 Day Dt']  

stock_dfs = {}
for key in stock_objects:
    stock_dfs[key] = stock_objects[key].history(period='max')
    
for key in stock_dfs:
    #stock_dfs[key] = pf.percent_model_setup(stock_dfs[key])
    #stock_dfs[key].fillna(0, inplace=True)
    #stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
    stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
    stock_dfs[key].drop(drop_list, axis=1, inplace=True)
    stock_dfs[key] = stock_dfs[key].merge(stock_investing[key], how='left', left_index=True, right_index=True)
    stock_dfs[key] = stock_dfs[key].merge(stock_stocks[key], how='left', left_index=True, right_index=True)
    stock_dfs[key].replace([np.inf, -np.inf], np.nan, inplace=True)
    stock_dfs[key].fillna(0, inplace=True)
    stock_dfs[key] = pf.future_percent_change_setup(stock_dfs[key], 5)

combine_df = combiner(stock_dfs)
combo_multi_linear_regressor(combine_df, 365, stock_dfs)

Training R2 Score: 0.008067557467393849


R2 Score: -0.11972854514989884


# Linear Regression With Sentiment

In [23]:
stock_dfs = {}
for key in stock_objects:
    stock_dfs[key] = stock_objects[key].history(period='max')
    
for key in stock_dfs:
    #stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
    stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
    stock_dfs[key] = stock_dfs[key].merge(stock_investing[key], how='left', left_index=True, right_index=True)
    stock_dfs[key] = stock_dfs[key].merge(stock_stocks[key], how='left', left_index=True, right_index=True)
    stock_dfs[key].fillna(0, inplace=True)
    stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)

combine_df = combiner(stock_dfs)
combo_multi_linear_regressor(combine_df, 365, stock_dfs)

Training R2 Score: 0.9978943545774106


R2 Score: 0.9936019908985843


In [24]:
drop_list = [ 'Volume', 'Dividends', 'Stock Splits',
       '5 Day Open Mean', '5 Day High Mean', '5 Day Low Mean',
       '5 Day Close Mean', '5 Day Volume Mean', '5 Day Open Var',
       '5 Day High Var', '5 Day Low Var', '5 Day Close Var',
       '5 Day Volume Var', '5 Day Dt', 
       '10 Day Open Mean', '10 Day High Mean', '10 Day Low Mean',
       '10 Day Close Mean', '10 Day Volume Mean', '10 Day Open Var',
       '10 Day High Var', '10 Day Low Var', '10 Day Close Var',
       '10 Day Volume Var', '10 Day High', '10 Day Low', '10 Day Dt', 
             '20 Day Open Mean', '20 Day High Mean', '20 Day Low Mean',
       '20 Day Close Mean', '20 Day Volume Mean', '20 Day Open Var',
       '20 Day High Var', '20 Day Low Var', '20 Day Close Var',
       '20 Day Volume Var', '20 Day Dt']  

stock_dfs = {}
for key in stock_objects:
    stock_dfs[key] = stock_objects[key].history(period='max')
    
for key in stock_dfs:
    #stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
    stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
    stock_dfs[key].drop(drop_list, axis=1, inplace=True)
    stock_dfs[key] = stock_dfs[key].merge(stock_investing[key], how='left', left_index=True, right_index=True)
    stock_dfs[key] = stock_dfs[key].merge(stock_stocks[key], how='left', left_index=True, right_index=True)
    stock_dfs[key].fillna(0, inplace=True)
    stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)

combine_df = combiner(stock_dfs)
combo_multi_linear_regressor(combine_df, 365, stock_dfs)

Training R2 Score: 0.9984209469624274


R2 Score: 0.9964042745851137


# Linear Regression Without Sentiment

In [None]:
stock_dfs = {}
for key in stock_objects:
    stock_dfs[key] = stock_objects[key].history(period='max')
    
for key in stock_dfs:
    stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
    stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
    #stock_dfs[key] = stock_dfs[key].merge(stock_sentiments[key], how='left', left_index=True, right_index=True)
    #stock_dfs[key].fillna(0, inplace=True)
    stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)

combine_df = combiner(stock_dfs)
combo_multi_linear_regressor(combine_df, 365, stock_dfs)

Training R2 Score: 0.9978704769839474


R2 Score: 0.9938578250597505


In [26]:
drop_list = [ 'Volume', 'Dividends', 'Stock Splits',
       '5 Day Open Mean', '5 Day High Mean', '5 Day Low Mean',
       '5 Day Close Mean', '5 Day Volume Mean', '5 Day Open Var',
       '5 Day High Var', '5 Day Low Var', '5 Day Close Var',
       '5 Day Volume Var', '5 Day Dt', 
       '10 Day Open Mean', '10 Day High Mean', '10 Day Low Mean',
       '10 Day Close Mean', '10 Day Volume Mean', '10 Day Open Var',
       '10 Day High Var', '10 Day Low Var', '10 Day Close Var',
       '10 Day Volume Var', '10 Day High', '10 Day Low', '10 Day Dt', 
             '20 Day Open Mean', '20 Day High Mean', '20 Day Low Mean',
       '20 Day Close Mean', '20 Day Volume Mean', '20 Day Open Var',
       '20 Day High Var', '20 Day Low Var', '20 Day Close Var',
       '20 Day Volume Var', '20 Day Dt', 'Golden Cross']  

stock_dfs = {}
for key in stock_objects:
    stock_dfs[key] = stock_objects[key].history(period='max')
    
for key in stock_dfs:
    #stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
    stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
    stock_dfs[key].drop(drop_list, axis=1, inplace=True)
    #stock_dfs[key] = stock_dfs[key].merge(stock_investing[key], how='left', left_index=True, right_index=True)
    #stock_dfs[key] = stock_dfs[key].merge(stock_stocks[key], how='left', left_index=True, right_index=True)
    #stock_dfs[key].fillna(0, inplace=True)
    stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)

combine_df = combiner(stock_dfs)
combo_multi_linear_regressor(combine_df, 365, stock_dfs) # 0.9965168918076025

Training R2 Score: 0.9984078818377284


R2 Score: 0.9965236146443556


# Neural Net With Sentiment

In [None]:
stock_dfs = {}
for key in stock_objects:
    stock_dfs[key] = stock_objects[key].history(period='max')
    
for key in stock_dfs:
    stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
    stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
    stock_dfs[key] = stock_dfs[key].merge(stock_sentiments[key], how='left', left_index=True, right_index=True)
    stock_dfs[key].fillna(0, inplace=True)
    stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)

combine_df = combiner(stock_dfs)
neural_net(combine_df, 365, stock_dfs)

Training R2 Score: 0.9983987586980567

R2 Score: 0.9952883298407202


# Neural Net Without Sentiment

In [None]:
stock_dfs = {}
for key in stock_objects:
    stock_dfs[key] = stock_objects[key].history(period='max')
    
for key in stock_dfs:
    stock_dfs[key] = pf.date_time_prep(stock_dfs[key]) 
    stock_dfs[key] = pf.rolling_aves(stock_dfs[key])
    #stock_dfs[key] = stock_dfs[key].merge(stock_sentiments[key], how='left', left_index=True, right_index=True)
    #stock_dfs[key].fillna(0, inplace=True)
    stock_dfs[key] = pf.future_close_setup(stock_dfs[key], 5)

combine_df = combiner(stock_dfs)
neural_net(combine_df, 365, stock_dfs)

Training R2 Score: 0.9963698746978116

R2 Score: 0.9922120382046735
