In [None]:
import requests
import time
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from scipy import linalg
from sklearn.linear_model import Ridge
from pytrends.request import TrendReq
import os.path
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [None]:
# From https://stackoverflow.com/questions/50786266/writing-dictionary-of-dataframes-to-file
import csv
import pandas as pd

def save_dict_df(dictex, keys_filename="stock_keys.txt", data_folder="hourly_stock_data"):
    """Write a dict of DataFrames to disk: one CSV per key plus a keys file.

    Args:
        dictex: Mapping of key -> pandas DataFrame.
        keys_filename: File that receives ``str(list(dictex.keys()))``.
        data_folder: Folder (relative to the working directory) that receives
            one ``data_<key>.csv`` file per entry. Created if it is missing.
    """
    folder_path = os.path.join(".", data_folder)
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(folder_path, exist_ok=True)

    for key, val in dictex.items():
        val.to_csv(os.path.join(folder_path, "data_{}.csv".format(str(key))))

    with open(keys_filename, "w") as f:  # saving keys to file
        f.write(str(list(dictex.keys())))

def load_dict_df(keys_filename="stock_keys.txt", data_folder="hourly_stock_data"):
    """Load a dict of DataFrames from a keys file and per-key CSV files.

    Args:
        keys_filename: File containing the ``str()`` of a list of keys.
        data_folder: Folder holding one ``data_<key>.csv`` file per key.

    Returns:
        dict mapping each key to the DataFrame read from its CSV, using the
        first CSV column as the index.

    Raises:
        ValueError / SyntaxError: if the keys file is not a plain Python literal.
    """
    import ast  # local import: only this function needs it

    with open(keys_filename, "r") as f:
        # literal_eval only accepts Python literals, so a tampered keys file
        # cannot execute arbitrary code the way eval() would.
        keys = ast.literal_eval(f.read())

    dictex = {}
    for key in keys:
        dictex[key] = pd.read_csv("./"+data_folder+"/data_{}.csv".format(str(key)), index_col=0)

    return dictex

In [None]:
# Search names used to build the Google Trends keywords. getHourlyStockInfoDict
# pairs companies[i] with company_symbol[i], so the two lists must stay in the same order.
companies = ['tesla', 'facebook', 'microsoft', 'amazon', 'google', 'uber', 'lyft', 'apple', 'snap']
# Terms appended to each company name to form one search keyword each
key_terms = ['report', 'good', 'bad', 'up', 'down', 'stock']
# Ticker symbols sent to the stock API, in the same order as `companies`
company_symbol = ['TSLA', 'FB', 'MSFT', 'AMZN', 'GOOGL', 'UBER', 'LYFT', 'AAPL', 'SNAP']
# Stock columns used as predictors by the stock-only / combined Keras runs
stock_columns = ['open', 'high', 'low', 'close', 'volume']

In [None]:
#create kw_list 
# One "<company> <term>" keyword per (company, key term) pair, grouped by company.
kw_list = []
for c_name in companies:
    for k in key_terms:
        kw_list += [" ".join((c_name, k))]

In [None]:
# Get 3-month ago trend data. May need to run a couple times to make sure it doesn't time out. 
def getTrendDataDF(kw_list, pull_data=False, dates='today 3-m', pytrends_df_filename="hourly_pytrends.csv"):
    """Return the Google Trends interest frame for ``kw_list`` with rows reversed.

    When ``pull_data`` is False and ``pytrends_df_filename`` exists, the cached
    CSV is read (indexed by its ``date`` column). Otherwise each keyword is
    queried through pytrends for the hard-coded window 2019-11-25 00:00 to
    2019-11-29 23:00, the per-keyword frames are joined on their index, and
    the result is written to ``pytrends_df_filename``.

    Args:
        kw_list: Search keywords, one pytrends request per keyword.
        pull_data: Force a fresh download even if the cache file exists.
        dates: Currently unused; kept so existing callers keep working.
        pytrends_df_filename: Path of the CSV cache.

    Returns:
        DataFrame with one column per keyword, rows in reverse order of the
        cached / downloaded frame.
    """
    # Retrieve the cached trend data
    if not pull_data and os.path.isfile(pytrends_df_filename):
        df = pd.read_csv(pytrends_df_filename, index_col="date")

    else:  # Construct the dataframe by making pytrends calls
        data = {}
        pytrends = TrendReq(hl='en-US', tz=360)
        sleep = True
        for kw in kw_list:
            # Pause once before the first 'apple' keyword — presumably to stay
            # under the service's rate limit (TODO confirm).
            if 'apple' in kw and sleep:
                time.sleep(60)
                sleep = False
            df_temp = pytrends.get_historical_interest([kw], year_start=2019, month_start=11, day_start=25, hour_start=0, year_end=2019, month_end=11, day_end=29, hour_end=23)
            # Only drop the flag column when it is present, so a response
            # without it does not raise KeyError.
            if 'isPartial' in df_temp.columns:
                df_temp = df_temp.drop(['isPartial'], axis=1)
            data[kw] = df_temp

        df = pd.DataFrame()
        for kw in kw_list:
            if df.empty:
                df = data[kw]
            else:
                df = df.join(data[kw])
        df.to_csv(pytrends_df_filename)

    # reverse df rows
    return df.iloc[::-1]

In [None]:
# Get Stock Hourly info for past 7 days
def getHourlyStockInfoDict(company_symbol, pull_data=False, stock_df_filename="stock_keys.txt"):
    """Return a dict of 30-minute intraday stock DataFrames keyed by company name.

    When ``pull_data`` is False and ``stock_df_filename`` exists, the frames are
    loaded through ``load_dict_df``. Otherwise every symbol is requested from
    the Alpha Vantage ``TIME_SERIES_INTRADAY`` endpoint and the result is saved
    through ``save_dict_df``.

    Args:
        company_symbol: Ticker symbols. The i-th symbol is stored under the
            module-level ``companies[i]``, so both lists must share one order.
        pull_data: Force a fresh download even if the keys file exists.
        stock_df_filename: Keys file used by ``save_dict_df`` / ``load_dict_df``.

    Returns:
        dict mapping company name to a DataFrame with columns
        ``times, open, high, low, close, volume``.

    Raises:
        ValueError: if the API answers with an ``"Error Message"`` payload,
            which retrying cannot fix.
    """
    if not pull_data and os.path.isfile(stock_df_filename):
        return load_dict_df(keys_filename=stock_df_filename)

    series_key = "Time Series (30min)"
    # Read the key from the environment so it is not committed with the
    # notebook; an unset variable keeps the previous empty-key behaviour.
    api_key = os.environ.get("ALPHAVANTAGE_API_KEY", "")
    link_template = 'https://www.alphavantage.co/query?function={}&symbol={}&interval={}&apikey={}&outputsize={}'

    dict_stocks = {}
    for i, symbol in enumerate(company_symbol):
        link = link_template.format('TIME_SERIES_INTRADAY', symbol, '30min', api_key, 'full')
        data = json.loads(requests.get(link).text)
        # The API throttles requests, so wait and retry until the series shows up.
        while series_key not in data:
            # A permanent failure (bad symbol / key) would otherwise loop forever.
            if "Error Message" in data:
                raise ValueError("Alpha Vantage rejected the request for {}: {}".format(symbol, data["Error Message"]))
            print('...sleeping...')
            time.sleep(70)
            data = json.loads(requests.get(link).text)
        stock_data_per_hour = json.dumps(data[series_key])
        df_temp = pd.read_json(stock_data_per_hour).transpose()
        df_temp.reset_index(level=0, inplace=True)
        df_temp.columns = ['times', 'open', 'high', 'low', 'close', 'volume']
        dict_stocks[companies[i]] = df_temp
    save_dict_df(dict_stocks, keys_filename=stock_df_filename)
    return dict_stocks

In [None]:
def getTrends(company_name, df_trends):
    """Return a copy of the trend columns whose name contains ``company_name``."""
    matching_columns = []
    for column in df_trends.columns.values:
        if company_name in column:
            matching_columns.append(column)
    return df_trends[matching_columns].copy()
    

In [None]:
def getJointTimes(df_stock, df_trends):
    """Return the timestamps present both in ``df_stock['times']`` and in the index of ``df_trends``.

    The result is a list built from a set intersection, so its order is unspecified.
    """
    trend_time_set = set(list(df_trends.index))
    return list(set(df_stock['times']).intersection(trend_time_set))

In [None]:
def getOverlapDateData(company_name, dict_stocks, df_trends):
    """Combine one company's stock rows with its trend rows for shared timestamps.

    Args:
        company_name: Key into ``dict_stocks`` and substring used to select
            the company's trend columns.
        dict_stocks: Mapping company name -> stock DataFrame with a ``times`` column.
        df_trends: Trend DataFrame indexed by timestamp.

    Returns:
        DataFrame with the stock columns (including the old index as ``index``)
        followed by the trend columns (the former trend index first), one row
        per shared timestamp in stock-row order. Whitespace in trend column
        names is replaced by underscores.
    """
    df_stock = dict_stocks[company_name]
    new_df_trends = getTrends(company_name, df_trends)

    joint_times = getJointTimes(df_stock, new_df_trends)
    new_df_stocks = df_stock.loc[df_stock['times'].isin(joint_times)].reset_index()

    new_df_trends = new_df_trends.loc[new_df_trends.index.isin(joint_times)].reset_index()
    new_df_trends = new_df_trends.rename(columns=lambda name: '_'.join(name.split()))

    # reset_index puts the former timestamp index in the first column.
    trend_time_col = new_df_trends.columns[0]
    # Pair rows by timestamp instead of by position, so the two frames do not
    # have to be sorted the same way to line up.
    return new_df_stocks.merge(new_df_trends, how='inner', left_on='times', right_on=trend_time_col)


In [None]:
def trainMultipleRegression(y_col, parameter_cols, training_data):
    """Fit an OLS model ``y_col ~ 1 + <each parameter column>`` on ``training_data``.

    Returns the fitted statsmodels result object.
    """
    formula_terms = [y_col + ' ~ 1']
    formula_terms.extend(parameter_cols)
    formula = " + ".join(formula_terms)

    model = sm.formula.ols(formula=formula, data=training_data)
    return model.fit()

In [None]:
def plotMultipleRegressionAndGetMSE(company, result, y_col, cols, all_data, training_data, testing_data):
    """Plot true vs. predicted ``y_col`` and return the normalized test MSE.

    Args:
        company: Name used in the plot title.
        result: Fitted model exposing ``predict(frame)``.
        y_col: Target column name.
        cols: Predictor column names passed to ``result.predict``.
        all_data: Full frame whose ``y_col`` is drawn as the true series.
        training_data: Rows the model was trained on (drawn first).
        testing_data: Remaining rows (drawn after the training rows).

    Returns:
        Test MSE divided by ``mean(y_test) * std(y_test)``.
    """
    # Derive the plot ranges from the arguments rather than from a hard-coded
    # split size and a notebook-level frame.
    n_train = len(training_data)
    n_total = len(all_data[y_col])
    train_pred = result.predict(training_data[cols])
    test_pred = result.predict(testing_data[cols])

    plt.plot(range(n_total), all_data[y_col])
    plt.plot(range(n_train), train_pred, '-')
    plt.plot(range(n_train, n_total), test_pred, '-')
    plt.xlabel('Day')
    plt.ylabel('Output $ amount')
    plt.title(company + ' Multiple Regression: ' + str(y_col))
    plt.legend(["True Data","Training Data - Price change prediction", "Testing Data - Price change prediction"])
    plt.show()

    y_test = testing_data[y_col]
    MSE = mean_squared_error(test_pred, y_test) / (np.mean(y_test) * np.std(y_test))
    return MSE

In [None]:
def getBestAlphaRidgeRegression(y_col, parameter_cols, df_all_data_train):
    """Grid-search the Ridge ``alpha`` on a hold-out split of the training frame.

    The first 15 rows are used for fitting and the remaining rows for
    validation. Alphas 90, 100, ..., 1990 are tried; the first alpha with the
    lowest validation MSE wins.

    Args:
        y_col: Target column name.
        parameter_cols: Predictor column names.
        df_all_data_train: Training frame to split.

    Returns:
        ``(bestAlpha, clf)`` where ``clf`` is the Ridge model fitted with
        ``bestAlpha`` on the first 15 rows.
    """
    df_fit = df_all_data_train[:15]
    df_val = df_all_data_train[15:]

    X_fit, y_fit = df_fit[parameter_cols], df_fit[y_col]
    X_val, y_val = df_val[parameter_cols], df_val[y_col]

    bestAlpha = None
    best_clf = None
    best_train_mse = None
    best_val_mse = None
    for alpha in range(90, 2000, 10):
        clf = Ridge(alpha=alpha)
        clf.fit(X_fit, y_fit)
        train_mse = mean_squared_error(clf.predict(X_fit), y_fit)
        val_mse = mean_squared_error(clf.predict(X_val), y_val)
        # Strict '<' keeps the first alpha on ties.
        if best_val_mse is None or val_mse < best_val_mse:
            bestAlpha, best_clf = alpha, clf
            best_train_mse, best_val_mse = train_mse, val_mse

    # Report the errors of the selected model, not of the last one tried.
    print("alpha: "+str(bestAlpha))
    print("Training error = "+str(best_train_mse))
    print("Testing error = "+str(best_val_mse))
    print()
    return bestAlpha, best_clf

In [None]:
#ridge regression

def trainAndPlotRidgeRegressionAndReturnMSE(company, y_col, parameter_cols, df_all_data_train, df_all_data_test) :
    """Fit a Ridge model with a grid-searched alpha, plot it, and return the test MSE.

    Args:
        company: Name used in the plot title.
        y_col: Target column name.
        parameter_cols: Predictor column names.
        df_all_data_train: Training rows (drawn first in the plot).
        df_all_data_test: Test rows (drawn after the training rows).

    Returns:
        Test MSE divided by ``mean(y_test) * std(y_test)``.
    """
    bestAlpha, _ = getBestAlphaRidgeRegression(y_col, parameter_cols, df_all_data_train)

    X_train = df_all_data_train[parameter_cols]
    y_train = df_all_data_train[y_col]
    X_test = df_all_data_test[parameter_cols]
    y_test = df_all_data_test[y_col]

    # Refit on the complete training frame with the selected alpha
    clf = Ridge(alpha=bestAlpha)
    clf.fit(X_train, y_train)

    # Build the true series and the plot ranges from the arguments so the
    # function does not depend on notebook-level variables or a fixed split size.
    y_true = pd.concat([y_train, y_test])
    n_train = len(df_all_data_train)
    n_total = len(y_true)
    test_pred = clf.predict(X_test)

    plt.plot(range(n_total), y_true)
    plt.plot(range(n_train), clf.predict(X_train), '-')
    plt.plot(range(n_train, n_total), test_pred, '-')
    plt.xlabel('Day')
    plt.ylabel('Output $ amount')
    plt.title(company + ' Ridge Regression: ' + str(y_col))
    plt.legend(["True Data","Training Data - Price change prediction", "Testing Data - Price change prediction"])
    plt.show()

    print(clf.get_params())
    MSE = mean_squared_error(test_pred, y_test) / (np.mean(y_test) * np.std(y_test))
    return MSE

In [None]:
def getMovingAvgAndStdDev(col_name, num_days, df_all_data):
    """Compute, for every row, the mean and std of the following rows of one column.

    For row ``i`` the window holds the rows ``i+1 .. i+k`` where
    ``k = min(num_days, rows remaining including i) - 1``. Rows are addressed
    as ``series[i + j]``, i.e. through the frame's integer index labels, so the
    frame is expected to carry the default ``0..n-1`` index.

    Args:
        col_name: Column to summarize.
        num_days: Window size including the current row.
        df_all_data: Source DataFrame.

    Returns:
        ``(moving_averages, standard_deviations)`` — two lists with one entry
        per row; both entries are 0 where the window is empty.
    """
    series = df_all_data[col_name]
    n_rows = len(df_all_data)

    col_movingAvg = []
    col_stdev = []
    for i in range(n_rows):
        num_to_avg = min(num_days, n_rows - i) - 1
        # Include all num_to_avg following rows so the sum and the divisor
        # cover the same number of values.
        nums = [series[i + j] for j in range(1, num_to_avg + 1)]
        if nums:
            avg = sum(nums) / len(nums)
            stdev = np.std(nums)
        else:
            avg = 0.0
            stdev = np.std([0])

        col_movingAvg.append(avg)
        col_stdev.append(stdev)
    return col_movingAvg, col_stdev

In [None]:
def getAllParamCols(df_all_data):
    """Return the frame's column names minus the raw stock / bookkeeping columns.

    Raises ValueError if any of the excluded columns is missing from the frame.
    """
    excluded = ('profit', 'open', 'close', 'high', 'low',
                'times', 'date', 'index', 'volume', 'mid')
    remaining = list(df_all_data.columns)
    for name in excluded:
        remaining.remove(name)
    return remaining

## Get data

In [None]:
# Get trend data (pull_data defaults to False, so an existing cache CSV is reused)
df_trends = getTrendDataDF(kw_list)

In [None]:
# Get stock data as a dict keyed by company name
# (pull_data defaults to False, so an existing keys file is reused)
dict_stocks = getHourlyStockInfoDict(company_symbol)

## Predicting with all predictors, and with only stock-derived predictors (moving average, standard deviation, previous value)

In [None]:
# Layout: MSE_dict[company][model_name][parameter_type][y_col] = normalized test MSE
MSE_dict = {} # {"company":{"multipleRegressionProfit":value, "multipleRegressionCost":value, etc}}
# Join data together, train various models
# Only the first company is processed (companies[:1]).
for company in companies[:1]: 
    print(company)
    MSE_dict[company] = {}
    # Get specific company data
    df_all_data = getOverlapDateData(company, dict_stocks, df_trends)  
    
    
    # Add more data columns
    # profit = open - close; mid = midpoint of high and low
    df_all_data['profit'] = df_all_data['open']-df_all_data['close']
    df_all_data['mid'] = (df_all_data['high']+df_all_data['low'])/2
    
    # Trends cols (everything except these cols that are removed)
    cols = getAllParamCols(df_all_data)
    
    # Include moving average, stdev, and prev. 10 days. 
    # NOTE(review): `col` is a module-level name here; the helper may read it as
    # a global instead of using its first argument — verify before renaming it.
    num_days_to_average = 10
    col = 'profit'
    col_movingAvg, col_stdev = getMovingAvgAndStdDev(col, num_days_to_average, df_all_data)
    df_all_data[col+'_movingAvg'] = col_movingAvg
    df_all_data[col+'_stdev'] = col_stdev
    # "_prev" holds the value of the following row (i+1); the last row gets 0
    col_prev = [df_all_data[col][i+1] for i in range(len(df_all_data) - 1)]
    col_prev.append(0) # Append this so we can have 0 padding
    df_all_data[col+'_prev'] = col_prev
    
    col = 'mid'
    col_movingAvg, col_stdev = getMovingAvgAndStdDev(col, num_days_to_average, df_all_data)
    df_all_data[col+'_movingAvg'] = col_movingAvg
    df_all_data[col+'_stdev'] = col_stdev
    col_prev = [df_all_data[col][i+1] for i in range(len(df_all_data) - 1)]
    col_prev.append(0) # Append this so we can have 0 padding
    df_all_data[col+'_prev'] = col_prev
    
    col = 'volume'
    col_movingAvg, col_stdev = getMovingAvgAndStdDev(col, num_days_to_average, df_all_data)
    df_all_data[col+'_movingAvg'] = col_movingAvg
    df_all_data[col+'_stdev'] = col_stdev
    col_prev = [df_all_data[col][i+1] for i in range(len(df_all_data) - 1)]
    col_prev.append(0) # Append this so we can have 0 padding
    df_all_data[col+'_prev'] = col_prev
    
    # Include movingAvg, stdev, and prev for all the Trends cols
    for col in cols: 
        # Prev info
        col_prev = [df_all_data[col][i+1] for i in range(len(df_all_data) - 1)]
        col_prev.append(0) # Append this so we can have 0 padding
        df_all_data[col+'_prev'] = col_prev
        
        col_movingAvg, col_stdev = getMovingAvgAndStdDev(col, num_days_to_average, df_all_data)
        
        df_all_data[col+'_movingAvg'] = col_movingAvg
        df_all_data[col+'_stdev'] = col_stdev

    
    # Recompute the predictor list so it includes the derived columns added above
    cols = getAllParamCols(df_all_data)
        
    # Reverse index order
    df_all_data = df_all_data.iloc[::-1] 
    
    # Split into training and testing data
    # (first 19 rows after the reversal are training rows, the rest are test rows)
    df_all_data_train = df_all_data[:19]
    df_all_data_test = df_all_data[19:]
    
    MSE_dict[company]["MultipleRegression"] = {}
    MSE_dict[company]["RidgeRegression"] = {}
    parameter_type = "allPredictors"
    MSE_dict[company]["MultipleRegression"][parameter_type] = {}
    MSE_dict[company]["RidgeRegression"][parameter_type] = {}
                         
    # Multiple Regression, All Predictors
    y_col='mid'
    close_multiple_regression_res = trainMultipleRegression(y_col, cols, df_all_data_train)
    print(close_multiple_regression_res.summary())
    MSE = plotMultipleRegressionAndGetMSE(company, close_multiple_regression_res, y_col, cols, df_all_data, df_all_data_train, df_all_data_test)
    MSE_dict[company]["MultipleRegression"][parameter_type][y_col] = MSE
    print("MSE:", MSE)
    
    
    # RIDGE REGRESSION REQUIRES STANDARDIZATION
    # Standardize the input parameters
    # (each predictor is divided by its std over all rows, train and test; the mean is not subtracted)
    df_standardized_data = df_all_data.copy()

    for col in cols: 
        df_standardized_data[col] = df_standardized_data[col] / np.std(df_standardized_data[col].values)
    
    # Split into training and testing data
    df_all_data_train = df_standardized_data[:19]
    df_all_data_test = df_standardized_data[19:]
    
    # Ridge Regression, All Predictors
    MSE = trainAndPlotRidgeRegressionAndReturnMSE(company, y_col, cols, df_all_data_train, df_all_data_test)
    MSE_dict[company]["RidgeRegression"][parameter_type][y_col] = MSE
    print("MSE:", MSE)
    
    
    
    
    
    #### Predictions using only stock market data #### 
    # Remove any cols in col that has the company name in it. 
    print("No Trend Data")
    cols_noTrends = [col for col in cols if company not in col]
    # Rebinds the module-level `cols` to the reduced predictor list
    cols = cols_noTrends
    
    # Split into training and testing data
    df_all_data_train = df_all_data[:19]
    df_all_data_test = df_all_data[19:]
    
    parameter_type = "noTrend"
    MSE_dict[company]["MultipleRegression"][parameter_type] = {}
    MSE_dict[company]["RidgeRegression"][parameter_type] = {}
    
    # Multiple Regression
    close_multiple_regression_res = trainMultipleRegression(y_col, cols, df_all_data_train)
    print(close_multiple_regression_res.summary())
    MSE = plotMultipleRegressionAndGetMSE(company, close_multiple_regression_res, y_col, cols, df_all_data, df_all_data_train, df_all_data_test)
    MSE_dict[company]["MultipleRegression"][parameter_type][y_col] = MSE
    print("MSE:", MSE)
    
    # Standardize the input parameters
    df_standardized_data = df_all_data.copy()

    for col in cols: 
        df_standardized_data[col] = df_standardized_data[col] / np.std(df_standardized_data[col].values)
    
    # Split into training and testing data
    df_all_data_train = df_standardized_data[:19]
    df_all_data_test = df_standardized_data[19:]
    
    # Ridge Regression
    MSE = trainAndPlotRidgeRegressionAndReturnMSE(company, y_col, cols, df_all_data_train, df_all_data_test)
    MSE_dict[company]["RidgeRegression"][parameter_type][y_col] = MSE
    print("MSE:", MSE)
    

## Predicting with only Google Trends data

In [None]:
# Join data together, train various models
# Uses only the Google Trends columns (plus their prev / moving-average / stdev
# derivatives) as predictors. MSE_dict[company]["MultipleRegression"] and
# MSE_dict[company]["RidgeRegression"] are indexed here without being created,
# so they must already exist when this cell runs.
for company in companies[:1]: 
    print(company)
    # Get specific company data
    df_all_data = getOverlapDateData(company, dict_stocks, df_trends)  
    
    
    # Add more data columns
    df_all_data['profit'] = df_all_data['open']-df_all_data['close']
    df_all_data['mid'] = (df_all_data['high']+df_all_data['low'])/2
    
    # Trends cols (everything except these cols that are removed)
    cols = getAllParamCols(df_all_data)
    
    num_days_to_average = 10
    for col in cols: 
        # "_prev" holds the value of the following row (i+1); the last row gets 0
        col_prev = [df_all_data[col][i+1] for i in range(len(df_all_data) - 1)]
        col_prev.append(0) # Append this so we can have 0 padding
        df_all_data[col+'_prev'] = col_prev
        
        col_movingAvg, col_stdev = getMovingAvgAndStdDev(col, num_days_to_average, df_all_data)
        
        df_all_data[col+'_movingAvg'] = col_movingAvg
        df_all_data[col+'_stdev'] = col_stdev

    
    # Recompute the predictor list so it includes the derived columns added above
    cols = getAllParamCols(df_all_data)
        
    # Reverse index order
    df_all_data = df_all_data.iloc[::-1] 
    
    # Split into training and testing data
    df_all_data_train = df_all_data[:19]
    df_all_data_test = df_all_data[19:]
    
    
    parameter_type="onlyTrends"
    MSE_dict[company]["MultipleRegression"][parameter_type] = {}
    MSE_dict[company]["RidgeRegression"][parameter_type] = {}
    y_col='mid'
    
    # Multiple regression on the 'mid' target
    close_multiple_regression_res = trainMultipleRegression(y_col, cols, df_all_data_train)
    print(close_multiple_regression_res.summary())
    MSE = plotMultipleRegressionAndGetMSE(company, close_multiple_regression_res, y_col, cols, df_all_data, df_all_data_train, df_all_data_test)
    MSE_dict[company]["MultipleRegression"][parameter_type][y_col] = MSE
    print("MSE:", MSE)
    
    # Standardize the input parameters
    # (each predictor is divided by its std over all rows; the mean is not subtracted)
    df_standardized_data = df_all_data.copy()

    for col in cols: 
        df_standardized_data[col] = df_standardized_data[col] / np.std(df_standardized_data[col].values)
    
    # Split into training and testing data
    df_all_data_train = df_standardized_data[:19]
    df_all_data_test = df_standardized_data[19:]
    
    
    # Ridge Regression
    MSE = trainAndPlotRidgeRegressionAndReturnMSE(company, y_col, cols, df_all_data_train, df_all_data_test)
    MSE_dict[company]["RidgeRegression"][parameter_type][y_col] = MSE
    print("MSE:", MSE)


In [None]:
def plotMSE(y_col, model):
    """Plot the stored MSE of ``model`` for ``y_col`` across the three predictor sets.

    Reads the module-level ``MSE_dict`` and draws one line for the first
    entry of ``companies``.
    """
    x_labels = ["all predictors", "only trends", "no trends"]
    parameter_types = ('allPredictors', 'onlyTrends', 'noTrend')

    for company in companies[:1]:
        mse_values = [MSE_dict[company][model][p_type][y_col] for p_type in parameter_types]
        plt.plot(x_labels, mse_values, label=company)

    plt.xlabel("parameter types")
    plt.ylabel("MSE / mean stock price")
    plt.legend()
    plt.title(model + " " + y_col)

In [None]:
# Plot the stored multiple-regression MSE values for the "mid" target.
# Note: this rebinds the module-level names y_col and model.
y_col = "mid"
model = "MultipleRegression"
plotMSE(y_col, model)

In [None]:
# Plot the stored ridge-regression MSE values for the "mid" target.
# Note: this rebinds the module-level names y_col and model.
y_col = "mid"
model = "RidgeRegression"
plotMSE(y_col, model)

## Tensorflow + Keras time

In [None]:
# scale data to be between 0-1 
def predictCompany(df_temp, train_size, cols):
    """Train a one-layer LSTM to predict the scaled 'mid' column from ``cols``.

    The columns ``cols + ['mid']`` are min-max scaled to [0, 1] (the scaler is
    fitted on all rows, before the train/test split). The first ``train_size``
    rows form the training set, the rest the test set.

    Returns:
        ``(history, model, train_X, train_y, test_X, test_y)`` where the X
        arrays have shape (rows, 1, n_features).
    """
    scaler = MinMaxScaler(feature_range = (0, 1))
    scaled = scaler.fit_transform(df_temp[cols+['mid']])

    def _split_features_target(rows):
        # The last column is the target; the features get a length-1 time axis.
        features, target = rows[:, :-1], rows[:, -1]
        return features.reshape((features.shape[0], 1, features.shape[1])), target

    train_X, train_y = _split_features_target(scaled[:train_size, :])
    test_X, test_y = _split_features_target(scaled[train_size:, :])

    # LSTM with train_size units followed by a single-output dense layer
    timesteps, n_features = train_X.shape[1], train_X.shape[2]
    model = Sequential()
    model.add(LSTM(train_size, input_shape=(timesteps, n_features)))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')

    history = model.fit(
        train_X,
        train_y,
        epochs=500,
        batch_size=72,
        validation_data=(test_X, test_y),
        verbose=0,
        shuffle=False,
    )
    return history, model, train_X, train_y, test_X, test_y

In [None]:
def plotPrediction(company, train_size, model, train_X, train_y, test_X, test_y):
    """Plot the true targets together with the model's train and test predictions.

    ``train_size`` is accepted but not used; the split point is ``len(train_y)``.
    """
    n_train = len(train_y)
    n_total = n_train + len(test_y)
    true_values = list(np.array(train_y)) + list(np.array(test_y))

    # One call draws three series: truth, training predictions, test predictions
    plt.plot(
        range(n_total), true_values,
        range(n_train), model.predict(train_X), '-',
        range(n_train, n_total), model.predict(test_X), '-',
    )
    plt.xlabel('Day')
    plt.ylabel('Profit from previous day')
    plt.title(company + ' - predicting stock market with Keras')
    plt.legend(["True Data","Training Data", "Testing Data"])
    plt.show()

In [None]:
def plotHistory(history):
    """Plot the training and validation loss curves stored in ``history.history``."""
    for history_key, label in (('loss', 'train'), ('val_loss', 'test')):
        plt.plot(history.history[history_key], label=label)
    plt.legend()
    plt.show()

In [None]:
#Trends only!
# For every company: train the Keras model on the Google Trends columns only and
# collect (company, MSE) pairs for the training rows and for the test rows.
MSE_train_trends_only = []
MSE_test_trends_only = []
for company in companies: 
    print(company)
    # Get specific company data
    df_all_data = getOverlapDateData(company, dict_stocks, df_trends)  
    # Add more data columns
    df_all_data['profit'] = df_all_data['open']-df_all_data['close']
    df_all_data['mid'] = (df_all_data['high']+df_all_data['low'])/2
    
    # Trends cols (everything except these cols that are removed)
    cols = getAllParamCols(df_all_data)
    
    # Train with only Google Trends predictors (first 19 rows = training set), then plot
    history, model, train_X, train_y, test_X, test_y = predictCompany(df_all_data, 19, cols)
    plotHistory(history)
    plotPrediction(company, 19, model, train_X, train_y, test_X, test_y)
    MSE_train_trends_only.append((company, mean_squared_error(model.predict(train_X), train_y)))
    MSE_test_trends_only.append((company, mean_squared_error(model.predict(test_X), test_y)))


In [None]:
#Stock only!
# For every company: train the Keras model on the raw stock columns only and
# collect (company, MSE) pairs for the training rows and for the test rows.
MSE_train_stock_only = []
MSE_test_stock_only = []
for company in companies: 
    print(company)
    # Get specific company data
    df_all_data = getOverlapDateData(company, dict_stocks, df_trends)  
    # Add more data columns
    df_all_data['profit'] = df_all_data['open']-df_all_data['close']
    df_all_data['mid'] = (df_all_data['high']+df_all_data['low'])/2
    
    # Stock cols only. NOTE(review): these include 'high' and 'low', from which
    # the 'mid' target is computed above.
    cols = stock_columns
    
    # Train with only stock predictors (first 19 rows = training set), then plot
    history, model, train_X, train_y, test_X, test_y = predictCompany(df_all_data, 19, cols)
    plotHistory(history)
    plotPrediction(company, 19, model, train_X, train_y, test_X, test_y)
    MSE_train_stock_only.append((company, mean_squared_error(model.predict(train_X), train_y)))
    MSE_test_stock_only.append((company, mean_squared_error(model.predict(test_X), test_y)))


In [None]:
#both trends and stock!
# For every company: train the Keras model on the Google Trends columns plus the
# raw stock columns and collect (company, MSE) pairs for training and test rows.
MSE_train_both = []
MSE_test_both = []
for company in companies: 
    print(company)
    # Get specific company data
    df_all_data = getOverlapDateData(company, dict_stocks, df_trends)  
    # Add more data columns
    df_all_data['profit'] = df_all_data['open']-df_all_data['close']
    df_all_data['mid'] = (df_all_data['high']+df_all_data['low'])/2
    
    # Trends cols followed by the raw stock cols. NOTE(review): the stock cols
    # include 'high' and 'low', from which the 'mid' target is computed above.
    cols = getAllParamCols(df_all_data) + stock_columns
    
    # Train with trends and stock predictors (first 19 rows = training set), then plot
    history, model, train_X, train_y, test_X, test_y = predictCompany(df_all_data, 19, cols)
    plotHistory(history)
    plotPrediction(company, 19, model, train_X, train_y, test_X, test_y)
    MSE_train_both.append((company, mean_squared_error(model.predict(train_X), train_y)))
    MSE_test_both.append((company, mean_squared_error(model.predict(test_X), test_y)))


In [None]:
#plot MSE_train data points as scatter plots
# Each MSE_train_* list holds (company, mse) tuples; the company names of the
# "both" run are used as the x axis for all three series.
X = [x[0] for x in MSE_train_both]
trends_only = [x[1] for x in MSE_train_trends_only]
stock_only = [x[1] for x in MSE_train_stock_only]
both = [x[1] for x in MSE_train_both]
fig, ax = plt.subplots()
# Marker-only format strings: dots for trends-only and both, triangles for stock-only
plt.plot(X,trends_only, '.', X, stock_only, '^', X, both, '.')
plt.xlabel('Companies')
plt.ylabel('MSE Train')
plt.title('MSE Train per company training on a Keras model')
ax.legend(['trends only', 'stock_only', 'both'])
plt.show()

In [None]:
#plot MSE_test data points as scatter plots
# Each MSE_test_* list holds (company, mse) tuples; the company names of the
# "both" run are used as the x axis for all three series.
X = [x[0] for x in MSE_test_both]
trends_only = [x[1] for x in MSE_test_trends_only]
stock_only = [x[1] for x in MSE_test_stock_only]
both = [x[1] for x in MSE_test_both]
fig, ax = plt.subplots()
# Marker-only format strings: dots for trends-only and both, triangles for stock-only
plt.plot(X,trends_only, '.', X, stock_only, '^', X, both, '.')
plt.xlabel('Companies')
plt.ylabel('MSE Test')
plt.title('MSE Test per company training on a Keras model')
ax.legend(['trends only', 'stock_only', 'both'])
plt.show()

# Lasso Regression with Hourly Data

In [None]:
# Imports
import requests
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from scipy import linalg
from sklearn import linear_model
from sklearn.linear_model import Ridge
from pytrends.request import TrendReq
import os.path


**Helper Functions**

In [None]:
def save_dict_df(dictex, keys_filename="keys.txt", data_folder="hourly_stock_data"):
    """Write a dict of DataFrames to disk: one CSV per key plus a keys file.

    Args:
        dictex: Mapping of key -> pandas DataFrame.
        keys_filename: File that receives ``str(list(dictex.keys()))``.
        data_folder: Folder (relative to the working directory) that receives
            one ``data_<key>.csv`` file per entry. Created if it is missing.
    """
    target_dir = os.path.join(".", data_folder)
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(target_dir, exist_ok=True)

    for key, val in dictex.items():
        val.to_csv(os.path.join(target_dir, "data_{}.csv".format(str(key))))

    with open(keys_filename, "w") as f:  # saving keys to file
        f.write(str(list(dictex.keys())))

def load_dict_df(keys_filename="keys.txt", data_folder="hourly_stock_data"):
    """Load a dict of DataFrames from a keys file and per-key CSV files.

    Args:
        keys_filename: File containing the ``str()`` of a list of keys.
        data_folder: Folder holding one ``data_<key>.csv`` file per key.

    Returns:
        dict mapping each key to the DataFrame read from its CSV, using the
        first CSV column as the index.

    Raises:
        ValueError / SyntaxError: if the keys file is not a plain Python literal.
    """
    import ast  # local import: only this function needs it

    with open(keys_filename, "r") as f:
        # literal_eval only accepts Python literals, so a tampered keys file
        # cannot execute arbitrary code the way eval() would.
        keys = ast.literal_eval(f.read())

    dictex = {}
    for key in keys:
        dictex[key] = pd.read_csv("./"+data_folder+"/data_{}.csv".format(str(key)), index_col=0)

    return dictex

In [None]:
def getTrends(company_name, df_trends):
    """Select the trend columns whose name contains ``company_name`` and return them as a copy."""
    all_columns = list(df_trends.columns.values)
    company_keywords = list(filter(lambda name: company_name in name, all_columns))
    selected = df_trends[company_keywords]
    return selected.copy()
    

In [None]:
def getJointTimes(df_stock, df_trends):
    """List the timestamps found both in ``df_stock['times']`` and in ``df_trends``' index.

    The list comes from a set intersection, so its order is unspecified.
    """
    stock_time_set = set(df_stock['times'])
    trend_time_set = set(list(df_trends.index))
    shared = stock_time_set & trend_time_set
    return list(shared)

In [None]:
# Define Company Keyterms and Symbols
# companies[i] is stored as the dict key for ticker company_symbol[i] by
# getHourlyStockInfoDict, so the two lists must stay in the same order.
companies = ['tesla', 'facebook', 'microsoft', 'amazon', 'google', 'uber', 'lyft','apple','snap']
# Terms appended to each company name to form one search keyword each
key_terms = ['report', 'good', 'bad', 'up', 'down', 'stock']
company_symbol = ['TSLA', 'FB', 'MSFT', 'AMZN', 'GOOGL', 'UBER', 'LYFT','AAPL','SNAP']

In [None]:
#Create Key Word List
# Build every "<company> <term>" search keyword, company by company.
kw_list = []
for c_name in companies:
    for k in key_terms:
        kw_list.append(" ".join([c_name, k]))

**Gather Google trends data**

In [None]:
# Get Hourly trends data from pytrends
# If there is a server 500 error, try changing the dates to this past week! 
def getHourlyTrends(company_symbol, pull_data=False, trends_df_filename="hourly_pytrends.csv"):
    """Return hourly Google Trends interest for the module-level ``kw_list``.

    Reads the cached CSV (indexed by its ``date`` column) when ``pull_data`` is
    False and the file exists. Otherwise queries pytrends once per keyword for
    2019-11-27 00:00 to 2019-12-03 23:00, joins the per-keyword frames on
    their index, writes the result to ``trends_df_filename`` and returns it.

    ``company_symbol`` is accepted but not used.
    """
    cache_available = os.path.isfile(trends_df_filename)
    if not pull_data and cache_available:
        return pd.read_csv(trends_df_filename, index_col="date")

    pytrends = TrendReq(hl='en-US', tz=360)
    data = {}
    for kw in kw_list:
        print(kw)
        interest = pytrends.get_historical_interest(
            [kw],
            year_start=2019, month_start=11, day_start=27, hour_start=0,
            year_end=2019, month_end=12, day_end=3, hour_end=23,
            sleep=30,
        )
        # The flag column is only dropped when the response contains it
        has_partial_flag = 'isPartial' in interest.columns
        data[kw] = interest.drop(['isPartial'], axis=1) if has_partial_flag else interest

    # Start from the first non-empty frame and join the remaining ones onto it
    joined = pd.DataFrame()
    for kw in kw_list:
        joined = data[kw] if joined.empty else joined.join(data[kw])

    print("finished kw forloop")
    joined.to_csv(trends_df_filename)
    return joined
        

In [None]:
# pull_data=True: always re-download the trends and overwrite the cache CSV
df_trends = getHourlyTrends(company_symbol,True)

In [None]:
# Inspect the first rows of the trends frame
df_trends.head()

**Gather Stock Data: Alpha Vantage**

In [None]:
# Get Stock Hourly info for past 7 days
def getHourlyStockInfoDict(company_symbol, pull_data=False, stock_df_filename="stock_keys.txt"):
    """Return a dict of 30-minute intraday stock DataFrames keyed by company name.

    When ``pull_data`` is False and ``stock_df_filename`` exists, the frames are
    loaded through ``load_dict_df``. Otherwise every symbol is requested from
    the Alpha Vantage ``TIME_SERIES_INTRADAY`` endpoint and the result is saved
    through ``save_dict_df``.

    Args:
        company_symbol: Ticker symbols. The i-th symbol is stored under the
            module-level ``companies[i]``, so both lists must share one order.
        pull_data: Force a fresh download even if the keys file exists.
        stock_df_filename: Keys file used by ``save_dict_df`` / ``load_dict_df``.

    Returns:
        dict mapping company name to a DataFrame with columns
        ``times, open, high, low, close, volume``.

    Raises:
        ValueError: if the API answers with an ``"Error Message"`` payload,
            which retrying cannot fix.
    """
    if not pull_data and os.path.isfile(stock_df_filename):
        return load_dict_df(keys_filename=stock_df_filename)

    series_key = "Time Series (30min)"
    # Read the key from the environment so it is not committed with the
    # notebook; an unset variable keeps the previous empty-key behaviour.
    api_key = os.environ.get("ALPHAVANTAGE_API_KEY", "")
    link_template = 'https://www.alphavantage.co/query?function={}&symbol={}&interval={}&apikey={}&outputsize={}'

    dict_stocks = {}
    for i, symbol in enumerate(company_symbol):
        print(symbol)
        link = link_template.format('TIME_SERIES_INTRADAY', symbol, '30min', api_key, 'full')
        data = json.loads(requests.get(link).text)
        # The API throttles requests, so wait and retry until the series shows up.
        while series_key not in data:
            # A permanent failure (bad symbol / key) would otherwise loop forever.
            if "Error Message" in data:
                raise ValueError("Alpha Vantage rejected the request for {}: {}".format(symbol, data["Error Message"]))
            print('...sleeping...')
            time.sleep(70)
            data = json.loads(requests.get(link).text)
        stock_data_per_hour = json.dumps(data[series_key])
        df_temp = pd.read_json(stock_data_per_hour).transpose()
        df_temp.reset_index(level=0, inplace=True)
        df_temp.columns = ['times', 'open', 'high', 'low', 'close', 'volume']
        dict_stocks[companies[i]] = df_temp
    save_dict_df(dict_stocks, keys_filename=stock_df_filename)
    return dict_stocks

In [None]:
# pull_data=True: always re-download the stock data and overwrite the saved files.
# Despite the df_ prefix this is a dict of DataFrames keyed by company name.
df_stocks = getHourlyStockInfoDict(company_symbol,True)

In [None]:
# Gather stock data for each individual company
#df_stocks = {}
#for s in company_symbol:
#    print(s)
#    res = getIntraday1minDF(s)
    # data limit reached
#    while res.empty:
#        time.sleep(10)
#        res = getIntraday1minDF(s)
    # add stock information to dictionary
#    df_stocks[s] = res

In [None]:
# Preview the last rows of the Tesla stock frame
df_stocks['tesla'].tail()

In [None]:
#reverse df rows
# Not idempotent: df_trends is rebound to its own reversal, so running this
# cell a second time restores the original row order.
df_trends = df_trends.iloc[::-1]
df_trends.head()

**Merge Trend and Stock Data**

In [None]:
# Filled by cleanAndJoinData: company name -> joined stock + trends DataFrame
df_trends_stocks = {}
def cleanAndJoinData():
    """Populate the module-level ``df_trends_stocks`` with one joined frame per company.

    Reads the module-level ``company_symbol``, ``companies``, ``df_trends`` and
    ``df_stocks``. For each company, the stock rows and the trend rows are
    restricted to their shared timestamps and then joined on the integer index
    produced by ``reset_index``.
    """
    # `s` (the ticker symbol) is not used inside the loop
    for s,c in zip(company_symbol, companies):
        print(c)
        # Trend columns whose name contains the company name
        company_names = [x for x in list(df_trends.columns.values) if c in x]
        df_temp_trends = df_trends[company_names]
        
        # line up indexes 
        stock_times = list(df_stocks[c].times)
        trends_times = list(df_temp_trends.index)
        joint_times = list(set(stock_times) & set(trends_times)) 
        
        print(joint_times)
        df_temp_stocks = df_stocks[c].loc[df_stocks[c]['times'].isin(joint_times)]
        df_temp_stocks = df_temp_stocks.reset_index()
        # Reversing after reset_index keeps the 0..n-1 labels attached to their
        # rows; the join below matches on those labels, so this only reverses
        # the row order of the joined result.
        df_temp_stocks = df_temp_stocks.iloc[::-1]
        df_temp_trends = df_temp_trends.loc[df_temp_trends.index.isin(joint_times)]
        df_temp_trends = df_temp_trends.reset_index()
        # Replace whitespace in the column names with underscores
        df_temp_trends.columns = ['_'.join(x.split()) for x in list(df_temp_trends.columns) if len(x) > 1]
        # Rows are paired by index label, not by timestamp value
        df_trends_stocks[c] = df_temp_stocks.join(df_temp_trends,lsuffix='_left', rsuffix='_right')
cleanAndJoinData()

**Function for calculating new predictors**

In [None]:
def getMovingAvgAndStdDev(col_name, num_days, df_all_data):
    """Compute, for every row, the mean and std of the following rows of one column.

    For row ``i`` the window holds the rows ``i+1 .. i+k`` where
    ``k = min(num_days, rows remaining including i) - 1``. Rows are addressed
    as ``series[i + j]``, i.e. through the frame's integer index labels (not
    by position), so the labels ``0..n-1`` must exist in the frame's index.

    Args:
        col_name: Column to summarize.
        num_days: Window size including the current row.
        df_all_data: Source DataFrame.

    Returns:
        ``(moving_averages, standard_deviations)`` — two lists with one entry
        per row; both entries are 0 where the window is empty.
    """
    series = df_all_data[col_name]
    n_rows = len(df_all_data)

    col_movingAvg = []
    col_stdev = []
    for i in range(n_rows):
        num_to_avg = min(num_days, n_rows - i) - 1
        # Include all num_to_avg following rows so the sum and the divisor
        # cover the same number of values.
        nums = [series[i + j] for j in range(1, num_to_avg + 1)]
        if nums:
            avg = sum(nums) / len(nums)
            stdev = np.std(nums)
        else:
            avg = 0.0
            stdev = np.std([0])

        col_movingAvg.append(avg)
        col_stdev.append(stdev)
    return col_movingAvg, col_stdev

**Add Additional Predictors to model**

In [None]:
def getAllParamCols(df_all_data):
    """Return the frame's column names without the raw and derived stock columns.

    Every name in the exclusion list must be present in ``df_all_data``;
    ``list.remove`` raises ``ValueError`` for the first one that is missing.

    Args:
        df_all_data: DataFrame whose columns are filtered.

    Returns:
        List of the remaining column names, in their original order.
    """
    excluded = (
        'open', 'close', 'high', 'low',
        'times', 'date', 'index',
        'volume', 'profit', 'mid',
        'volume_movingAvg', 'volume_stdev',
        'profit_movingAvg', 'profit_stdev',
        'mid_movingAvg', 'mid_stdev',
    )
    remaining = list(df_all_data.columns)
    for name in excluded:
        remaining.remove(name)
    return remaining

In [None]:
# Add predictors
# For every company frame in df_trends_stocks this cell adds, in place:
#   * 'profit' and 'mid',
#   * '<col>_movingAvg', '<col>_stdev' and '<col>_prev' for 'profit', 'mid',
#     'volume' and for every column returned by getAllParamCols,
# and finally stores the frame back with its row order reversed.
# NOTE(review): the cell mutates the frames and flips their row order on every
# run, so re-running it does not reproduce the result of the first run.
for company in df_trends_stocks: 
    # Calculate Profit For Each Company in Dict (open minus close)
    (df_trends_stocks[company])['profit'] = (df_trends_stocks[company])['open']-(df_trends_stocks[company])['close']
    # 'mid' is the average of the high and low columns
    (df_trends_stocks[company])['mid'] = ((df_trends_stocks[company])['high']+(df_trends_stocks[company])['low'])/2
    # Calculate Moving Averages and Standard Deviation for Stock Data
    # For Profit
    num_days_to_average = 10
    col = 'profit'
    col_movingAvg, col_stdev = getMovingAvgAndStdDev(col, num_days_to_average, df_trends_stocks[company])
    (df_trends_stocks[company])[col+'_movingAvg'] = col_movingAvg
    (df_trends_stocks[company])[col+'_stdev'] = col_stdev
    # col_prev[i] is the value found with [col][i+1]; the last row has no
    # successor and gets 0.
    # NOTE(review): [col][i+1] is resolved by index label for an integer index,
    # while the list is assigned back by position. The two agree only if the
    # index labels run 0..n-1 in row order -- confirm for these frames.
    col_prev = [(df_trends_stocks[company])[col][i+1] for i in range(len((df_trends_stocks[company])) - 1)]
    col_prev.append(0) # Append this so we can have 0 padding
    df_trends_stocks[company][col+'_prev'] = col_prev
    # For Mid (same three derived columns as for profit)
    col = 'mid'
    col_movingAvg, col_stdev = getMovingAvgAndStdDev(col, num_days_to_average, df_trends_stocks[company])
    (df_trends_stocks[company])[col+'_movingAvg'] = col_movingAvg
    (df_trends_stocks[company])[col+'_stdev'] = col_stdev
    col_prev = [(df_trends_stocks[company])[col][i+1] for i in range(len((df_trends_stocks[company])) - 1)]
    col_prev.append(0) # Append this so we can have 0 padding
    df_trends_stocks[company][col+'_prev'] = col_prev
    # For Volume
    col = 'volume'
    col_movingAvg, col_stdev = getMovingAvgAndStdDev(col, num_days_to_average, df_trends_stocks[company])
    (df_trends_stocks[company])[col+'_movingAvg'] = col_movingAvg
    (df_trends_stocks[company])[col+'_stdev'] = col_stdev
    # cols is captured here: 'profit_prev' and 'mid_prev' already exist and are
    # not in getAllParamCols' exclusion list, so the loop below also derives
    # columns from them; 'volume_prev' is added afterwards and is not included.
    cols = getAllParamCols(df_trends_stocks[company])
    col_prev = [(df_trends_stocks[company])[col][i+1] for i in range(len((df_trends_stocks[company])) - 1)]
    col_prev.append(0) # Append this so we can have 0 padding
    df_trends_stocks[company][col+'_prev'] = col_prev
    
    # Calculate Increment Over time: the same three derived columns for every
    # remaining predictor column
    for col in cols: 
        # Prev 
        col_prev = [(df_trends_stocks[company])[col][i+1] for i in range(len((df_trends_stocks[company])) - 1)]
        col_prev.append(0) # Append this so we can have 0 padding
        (df_trends_stocks[company])[col+'_prev'] = col_prev
        col_movingAvg, col_stdev = getMovingAvgAndStdDev(col, num_days_to_average, df_trends_stocks[company])
        
        (df_trends_stocks[company])[col+'_movingAvg'] = col_movingAvg
        (df_trends_stocks[company])[col+'_stdev'] = col_stdev
    # Reverse index order (rows are flipped; each row keeps its index label)
    df_trends_stocks[company] = df_trends_stocks[company].iloc[::-1] 

**LASSO Regression Helper Functions**

In [None]:
# Methods for LASSO Regression
def getBestAlphaLASSORegression(y_col, parameter_cols, df_all_data_train, alphas=None):
    """Pick the LASSO penalty with the lowest held-out MSE.

    The frame is split by row position into a first half, used to fit the
    model, and a second half, used to score it. One ``Lasso`` model is fitted
    per candidate alpha and the alpha with the smallest MSE on the second half
    is returned. The chosen alpha and its two errors are printed.

    Args:
        y_col: Name of the target column.
        parameter_cols: List of predictor column names.
        df_all_data_train: DataFrame holding the predictors and the target.
        alphas: Optional iterable of candidate alpha values. Defaults to
            ``range(90, 10000, 10)``.

    Returns:
        Tuple ``(bestAlpha, bestMSE)``: the selected alpha and its MSE on the
        second half of the frame.

    Raises:
        ValueError: If ``alphas`` is empty.
    """
    # Local import so the function does not rely on a module-level
    # ``linear_model`` name being defined elsewhere in the notebook.
    from sklearn import linear_model

    if alphas is None:
        alphas = range(90, 10000, 10)

    split = int(df_all_data_train.shape[0]/2)
    df_fit = df_all_data_train[:split]
    df_holdout = df_all_data_train[split:]

    # Always use the columns passed in, for fitting and for scoring.
    X_fit, y_fit = df_fit[parameter_cols], df_fit[y_col]
    X_holdout, y_holdout = df_holdout[parameter_cols], df_holdout[y_col]

    alpha = []
    MSE_train = []
    MSE_test = []
    for candidate in alphas:
        clf = linear_model.Lasso(alpha=candidate)
        clf.fit(X_fit, y_fit)
        alpha.append(candidate)
        MSE_train.append(mean_squared_error(y_fit, clf.predict(X_fit)))
        MSE_test.append(mean_squared_error(y_holdout, clf.predict(X_holdout)))

    if not MSE_test:
        raise ValueError("alphas must contain at least one value")

    best_idx = MSE_test.index(min(MSE_test))
    bestAlpha = alpha[best_idx]
    bestMSE = MSE_test[best_idx]
    print("alpha: "+str(bestAlpha))
    # Report the errors recorded for the selected alpha, not those of whichever
    # model happened to be fitted last.
    print("Training error = "+str(MSE_train[best_idx]))
    print("Testing error = "+str(bestMSE))
    print()
    return bestAlpha, bestMSE

In [None]:
# Define Functions For Plotting LASSO Regression
def plotLASSO(company, alpha, label, predictors, df): 
    """Fit a LASSO model on the first half of ``df`` and plot its predictions.

    The frame is split by row position into a first half (fit) and a second
    half (held out). The fitted coefficients are printed as a
    ``{predictor name: coefficient}`` mapping, and a figure is drawn with the
    first-half predictions (red), the second-half predictions (blue) and the
    actual values of ``label`` over the whole frame (green).

    Args:
        company: Company name, used in the figure title.
        alpha: LASSO penalty passed to ``Lasso(alpha=...)``.
        label: Name of the target column.
        predictors: List of predictor column names.
        df: DataFrame holding the predictors and the target.

    Returns:
        None.
    """
    # Local import so the function does not rely on a module-level
    # ``linear_model`` name being defined elsewhere in the notebook.
    from sklearn import linear_model

    # Separate Data
    split = int(df.shape[0]/2)
    df_train = df[:split]
    df_test = df[split:]
    X_train = df_train[predictors]
    Y_train = df_train[label]
    X_test = df_test[predictors]
    Y_labels = df[label]

    # Initialize Model w/ the given alpha
    clf = linear_model.Lasso(alpha=alpha)
    clf.fit(X_train, Y_train) 

    # Key by predictor name: LASSO sets many coefficients to exactly 0, and
    # keying by coefficient would collapse all of them into a single entry.
    param_dict = dict(zip(predictors, clf.coef_))
    print("Parameter Estimates w/ LASSO : ",param_dict)

    # Make Predictions for both halves
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)

    # Plot both sets of predictions against the true labels; the x axis is the
    # row position within df.
    fig1, ax1 = plt.subplots()
    ax1.plot(range(split), y_pred_train, color='r', label='train prediction')
    ax1.plot(range(split, len(Y_labels)), y_pred_test, color='b', label='test prediction')
    ax1.plot(range(len(Y_labels)), Y_labels, color='g', label='actual')
    ax1.set_xlabel('Time in Hours')
    ax1.set_ylabel(label)
    ax1.set_title(label + ' Predictions with LASSO Regression for '+ company)
    ax1.legend()

**Run regression and view dropped features for each company**

In [None]:
def columnsAll(company, label): 
    """Return every column of the company's frame except bookkeeping and target.

    Removes 'times', 'index', 'date' and ``label`` (in that order) from the
    column list of ``df_trends_stocks[company]``; ``list.remove`` raises
    ``ValueError`` if one of them is missing.

    NOTE(review): 'high' and 'low' stay in the returned list. When ``label`` is
    'mid', which the predictor cell computes as their average, they determine
    the target exactly -- confirm this is intended.
    """
    remaining = list(df_trends_stocks[company].columns)
    for unwanted in ('times', 'index', 'date', label):
        remaining.remove(unwanted)
    return remaining

def columnsTrends(company, label): 
    """Return the columns of the company's frame whose name contains ``company``.

    ``label`` is accepted for signature parity with the other column selectors
    and is not used.
    """
    trend_cols = []
    for name in df_trends_stocks[company].columns:
        if company in name:
            trend_cols.append(name)
    return trend_cols

def columnsAverage(company, label): 
    """Return the derived stock columns of the company's frame.

    Starts from the columns whose name does not contain ``company`` and then
    removes, in order, ``label``, 'times', 'index', 'date', 'open', 'volume',
    'high', 'low', 'close' and 'profit'. ``list.remove`` raises ``ValueError``
    if one of them is missing.
    """
    all_columns = df_trends_stocks[company].columns
    stock_cols = list(filter(lambda name: company not in name, all_columns))
    dropped = (label, 'times', 'index', 'date', 'open',
               'volume', 'high', 'low', 'close', 'profit')
    for name in dropped:
        stock_cols.remove(name)
    return stock_cols

# Plot Predictions w/ All Predictors

In [None]:
# Result table skeleton: one entry per predictor set, filled in by the
# regression cells below. The key strings must match the ones those cells
# write to, so they carry no trailing spaces.
keys = ['All Predictors', 'Just Trends', 'Just Stock Data']
MSE = {key: None for key in keys}
# Each company gets its own copy of the template; sharing a single dict would
# make every company show the values written for the last one.
MSE_companies = {company: dict(MSE) for company in companies}

In [None]:
for company in companies: 
    # Prepare column list
    cols = columnsAll(company, 'mid') 
    # Find best alpha for LASSO Regression; keep the returned error under its
    # own name instead of rebinding the module-level MSE template dict.
    alpha, best_mse = getBestAlphaLASSORegression('mid',cols,df_trends_stocks[company])
    # Plot Predictions by True Labels
    plotLASSO(company, alpha, 'mid', cols, df_trends_stocks[company])
    # Record the error returned for this predictor set
    MSE_companies[company]['All Predictors']  = best_mse

# Plot Predictions for Just Trends Data

In [None]:
for company in companies: 
    # Prepare column list
    cols = columnsTrends(company, 'mid') 
    # Find best alpha for LASSO Regression. The helper returns (alpha, mse);
    # unpack it so that plotLASSO receives a number rather than a tuple and
    # the error stored below belongs to this run.
    alpha, best_mse = getBestAlphaLASSORegression('mid',cols,df_trends_stocks[company])
    # Plot Predictions by True Labels
    plotLASSO(company, alpha, 'mid', cols, df_trends_stocks[company])
    # Record the error returned for this predictor set
    MSE_companies[company]['Just Trends']  = best_mse

# Plot Predictions for Just Stock Data

In [None]:
for company in companies: 
    # Prepare column list
    cols = columnsAverage(company, 'mid') 
    # Find best alpha for LASSO Regression. The helper returns (alpha, mse);
    # unpack it so that plotLASSO receives a number rather than a tuple and
    # the error stored below belongs to this run.
    alpha, best_mse = getBestAlphaLASSORegression('mid',cols,df_trends_stocks[company])
    # Plot Predictions by True Labels
    plotLASSO(company, alpha, 'mid', cols, df_trends_stocks[company])
    # Record the error returned for this predictor set
    MSE_companies[company]['Just Stock Data']  = best_mse

In [None]:
# Show MSE_companies as a table (one row per company, one column per
# predictor-set key) instead of printing the raw nested dict.
pd.DataFrame.from_dict(MSE_companies, orient='index')