In [None]:
import requests
import time
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
from scipy import linalg
from sklearn.linear_model import Ridge
from pytrends.request import TrendReq
from pytrends import dailydata
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

In [None]:
# Define company variables
# Company search name -> ticker symbol. Both lists below are derived from this
# single mapping so their order can never drift apart.
_SYMBOL_BY_COMPANY = {
    'tesla': 'TSLA',
    'facebook': 'FB',
    'microsoft': 'MSFT',
    'amazon': 'AMZN',
    'google': 'GOOGL',
    'uber': 'UBER',
    'lyft': 'LYFT',
    'apple': 'AAPL',
    'snap': 'SNAP',
}
companies = list(_SYMBOL_BY_COMPANY.keys())
# Search-term suffixes that are combined with each company name.
key_terms = 'report good bad up down stock'.split()
company_symbol = list(_SYMBOL_BY_COMPANY.values())
# Price/volume columns kept from the stock data.
stock_columns = 'open high low close volume'.split()

In [None]:
# Create Key Words List for Pytrends
# One "<company> <term>" phrase per (company, term) pair, grouped by company.
kw_list = [
    " ".join((company, term))
    for company in companies
    for term in key_terms
]

In [None]:
# Get Hourly trends data from pytrends
# If there is a server 500 error, try changing the dates to this past week! 
pytrends = TrendReq(hl='en-US', tz=360)

# Query window shared by every keyword request.
trend_window = dict(
    year_start=2019, month_start=11, day_start=25, hour_start=0,
    year_end=2019, month_end=11, day_end=29, hour_end=23,
)

# Stage 1: download one frame per keyword, without the 'isPartial' flag column.
data = {}
for kw in kw_list:
    print(kw)
    hourly = pytrends.get_historical_interest([kw], **trend_window)
    data[kw] = hourly.drop(columns=['isPartial'], errors='ignore')

# Stage 2: combine the per-keyword frames column-wise on their shared index.
# The first non-empty frame seeds the result; later frames are left-joined.
df = pd.DataFrame()
for kw in kw_list:
    df = data[kw] if df.empty else df.join(data[kw])

In [None]:
#Sanity check on the data 
# Preview rows, dimensions, index and column labels, one per line.
print(df.head(), df.shape, df.index, df.columns, sep="\n")

In [None]:
# worldTradingData_APIKey = ''
# def getWorldTradingData_Intraday1min(symbol, days=1, interval=1) :
#     # limits on the inputs https://www.worldtradingdata.com/documentation#stock-and-index-intraday
#     link = "https://intraday.worldtradingdata.com/api/v1/intraday?symbol={}&range={}&interval={}&api_token={}"\
#         .format(symbol, days, interval, worldTradingData_APIKey)
#     request = requests.get(link)
#     data = json.loads(request.text)
#     if 'intraday' not in data:
#         return pd.DataFrame()
#     stock_data = json.dumps(data["intraday"])
#     df = pd.read_json(stock_data).transpose()
#     cols = ['open', 'high', 'low', 'close', 'volume']
#     df = df[cols]
#     df.reset_index(level=0, inplace=True)
#     df.columns = ['times', 'open', 'high', 'low', 'close', 'volume']
#     return df

In [None]:
# # getting data from World Trading Data
# df_stocks = {}
# for s in company_symbol:
#     print(s)
#     res = getWorldTradingData_Intraday1min(s)
#     while res.empty:
#         time.sleep(10)
#         res = getWorldTradingData_Intraday1min(s)
#     df_stocks[s] = getWorldTradingData_Intraday1min(s)

In [None]:
# Gather Stock Data for each company and return dataframe 
ts = 'TIME_SERIES_INTRADAY'#'TIME_SERIES_DAILY'
interval = '30min'
api_key = '' 
outputsize = 'full' # compact= 100 results, full= all data (5 days?)
def getIntraday1minDF(symbol): 
    """Download intraday bars for ``symbol`` from Alpha Vantage.

    The request is built from the module-level ``ts``, ``interval``,
    ``api_key`` and ``outputsize`` settings. Despite the function name, the
    bar size is whatever ``interval`` is set to.

    Parameters
    ----------
    symbol : str
        Ticker symbol to query.

    Returns
    -------
    pandas.DataFrame
        Columns ``times, open, high, low, close, volume``, one row per bar in
        the order delivered by the API. An empty frame is returned when the
        response does not contain the expected time-series entry.
    """
    from io import StringIO  # local import keeps this cell self-contained

    # Let requests encode the query string, and never wait indefinitely.
    response = requests.get(
        'https://www.alphavantage.co/query',
        params={
            'function': ts,
            'symbol': symbol,
            'interval': interval,
            'apikey': api_key,
            'outputsize': outputsize,
        },
        timeout=30,
    )
    data = json.loads(response.text)

    # The payload key follows the requested interval; deriving it keeps the
    # function working when `interval` is changed from '30min'.
    series_key = 'Time Series ({})'.format(interval)
    if series_key not in data:
        # Show what the API actually answered so permanent failures are not
        # mistaken for a temporary limit.
        print("data limit reached or request rejected for {}: {}".format(symbol, data))
        return pd.DataFrame()

    stock_data = json.dumps(data[series_key])
    # Wrap the JSON text in a buffer: passing a literal JSON string to
    # read_json is deprecated in recent pandas releases.
    df = pd.read_json(StringIO(stock_data)).transpose()
    cols = ['1. open', '2. high', '3. low', '4. close', '5. volume']
    df = df[cols]
    # Move the timestamp index into a regular column (first position).
    df = df.reset_index(level=0)
    df.columns = ['times', 'open', 'high', 'low', 'close', 'volume']
    return df

In [None]:
# Gather stock data for each individual company
# Retry policy: an empty frame from getIntraday1minDF is treated as a
# temporary failure, but retries are bounded so that a permanent failure
# stops the notebook with an error instead of looping forever.
MAX_FETCH_ATTEMPTS = 30
RETRY_DELAY_SECONDS = 10

df_stocks = {}
for s in company_symbol:
    print(s)
    for attempt in range(MAX_FETCH_ATTEMPTS):
        if attempt:
            # wait before every retry (not before the first request)
            time.sleep(RETRY_DELAY_SECONDS)
        res = getIntraday1minDF(s)
        if not res.empty:
            break
    else:
        raise RuntimeError(
            "no stock data for {} after {} attempts".format(s, MAX_FETCH_ATTEMPTS)
        )
    # add stock information to dictionary
    df_stocks[s] = res

In [None]:
# Preview the first five rows of the downloaded TSLA frame.
df_stocks['TSLA'].head(n=5)

In [None]:
# join Google Trends Data with Stock Market Data
df_trends_stocks = {}
def cleanAndJoinData():
    """Fill ``df_trends_stocks`` with one time-aligned frame per company.

    For every (symbol, company) pair, the stock rows from ``df_stocks`` and
    the trend columns from ``df`` whose name contains the company name are
    restricted to the timestamps present in both sources. Both sides are then
    sorted chronologically before being joined row by row, so each stock bar
    is paired with the trend values of the same timestamp regardless of the
    order in which either source delivered its rows.

    The resulting frame keeps the stock columns (including the original row
    position as ``index`` and the timestamp as ``times``), followed by the
    trends timestamp column and the trend columns, with whitespace in the
    trend column names replaced by underscores.

    Returns
    -------
    None
        Results are stored in the module-level ``df_trends_stocks`` dict,
        keyed by company name.
    """
    for s, c in zip(company_symbol, companies):
        print(c)
        trend_columns = [x for x in df.columns if c in x]
        df_temp_trends = df[trend_columns]
        stock_frame = df_stocks[s]

        # timestamps available in both data sets
        joint_times = list(set(stock_frame['times']) & set(df_temp_trends.index))
        # report the overlap size rather than dumping every timestamp
        print('{} shared timestamps'.format(len(joint_times)))

        # Sort BEFORE assigning the fresh 0..n-1 index: the join below matches
        # on index labels, so both frames must be numbered in the same
        # (chronological) order.
        df_temp_stocks = (
            stock_frame.loc[stock_frame['times'].isin(joint_times)]
            .reset_index()
            .sort_values('times')
            .reset_index(drop=True)
        )
        df_temp_trends = (
            df_temp_trends.loc[df_temp_trends.index.isin(joint_times)]
            .sort_index()
            .reset_index()
        )
        # "tesla report" -> "tesla_report"; every column is renamed so the
        # new label list always matches the number of columns.
        df_temp_trends.columns = ['_'.join(str(x).split()) for x in df_temp_trends.columns]
        df_trends_stocks[c] = df_temp_stocks.join(df_temp_trends)
cleanAndJoinData()

In [None]:
#sanity check that data is merged correctly 
# First five rows of the joined stock + trends frame for tesla.
df_trends_stocks['tesla'].head(n=5)

In [None]:
# Split Training and Testing Partitions
# 80% of the row count of the first joined company frame (rounded down).
first_company = list(df_trends_stocks)[0]
train_size = int(0.8 * len(df_trends_stocks[first_company]))

In [None]:
# scale features and target, then fit an LSTM for one company
def predictCompany(company_name, train_size, cols):
    """Fit an LSTM that predicts the mid price of one company.

    Parameters
    ----------
    company_name : str
        Key into the module-level ``df_trends_stocks`` dict.
    train_size : int
        Number of leading rows used for training; the remaining rows form the
        test partition. Also used as the number of LSTM units.
    cols : list of str
        Feature columns taken from the company's frame.

    Returns
    -------
    tuple
        ``(history, model, train_X, train_y, test_X, test_y)`` where the X
        arrays are shaped ``(rows, 1, len(cols))`` and the y arrays hold the
        scaled ``mid`` target.

    Notes
    -----
    A ``mid`` column is written to the company's frame in
    ``df_trends_stocks``. ``mid`` is derived from ``high`` and ``low``, so
    when those columns are part of ``cols`` the features contain the target's
    inputs.
    """
    df_temp = df_trends_stocks[company_name]
    # mid price of each bar: the average of its high and low
    df_temp['mid'] = (df_temp['high'] + df_temp['low']) / 2

    # features first, target ('mid') in the last column
    values = df_temp[list(cols) + ['mid']].values
    train_raw, test_raw = values[:train_size, :], values[train_size:, :]

    # Fit the 0-1 scaler on the training rows only so that no information
    # from the test rows leaks into training; test rows are transformed with
    # the training min/max and may therefore fall slightly outside [0, 1].
    sc = MinMaxScaler(feature_range = (0, 1))
    train = sc.fit_transform(train_raw)
    test = sc.transform(test_raw) if len(test_raw) else train[:0]

    #split features and target
    train_X, train_y = train[:, :-1], train[:, -1]
    test_X, test_y = test[:, :-1], test[:, -1]
    # Keras LSTM layers expect (samples, timesteps, features); one timestep here
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
    
    #create model
    model = Sequential()
    model.add(LSTM(train_size, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    
    #train model
    history = model.fit(train_X, train_y, epochs=500, batch_size=72, validation_data=(test_X, test_y), verbose=0, shuffle=False)
    return history, model, train_X, train_y, test_X, test_y

In [None]:
# Plot training / validation loss per epoch
def plotHistory(history):
    """Draw the recorded loss curves of a fitted model.

    ``history`` must expose a ``history`` dict with ``'loss'`` and
    ``'val_loss'`` entries; they are plotted as 'train' and 'test'.
    """
    curves = (('loss', 'train'), ('val_loss', 'test'))
    for metric, legend_label in curves:
        plt.plot(history.history[metric], label=legend_label)
    plt.legend()
    plt.show()

In [None]:
# Plot test and training set predictions over true data
def plotPrediction(company, train_size, model, train_X, train_y, test_X, test_y):
    """Overlay model predictions on the true target values.

    Three lines are drawn against the row position: the true values
    (train followed by test), the predictions for the training rows, and the
    predictions for the test rows placed after the training range.
    ``train_size`` is accepted but not used here.
    """
    n_train = len(train_y)
    n_total = n_train + len(test_y)

    true_values = [*np.array(train_y), *np.array(test_y)]
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)

    plt.plot(range(n_total), true_values)
    plt.plot(range(n_train), train_pred, '-')
    plt.plot(range(n_train, n_total), test_pred, '-')

    plt.xlabel('Day')
    plt.ylabel('Profit from previous day')
    plt.title(company + ' - predicting stock market with Keras')
    plt.legend(["True Data","Training Data", "Testing Data"])
    plt.show()

In [None]:
# Plot Predictions for Each Company
def plotCompanies(companies, train_size, cols): 
    """Train one model per company and show its prediction and loss plots."""
    for name in companies:
        # fit the model; the first result is the training history, the rest
        # are the model and the train/test arrays needed for the plot
        history, *fitted = predictCompany(name, train_size, cols)
        # predictions against the true values
        plotPrediction(name, train_size, *fitted)
        # loss curves
        plotHistory(history)

In [None]:
# Train and plot every company using only the stock price/volume columns.
plotCompanies(companies=companies, train_size=train_size, cols=stock_columns)

In [None]:
# calculate mean squared error for each company 
MSE_train_stock_only = []
MSE_test_stock_only = []
def calculateMSE(companies, train_size, MSE_train, MSE_test, cols): 
    """Train a model per company and record its train/test mean squared error.

    Parameters
    ----------
    companies : iterable of str
        Company names passed to ``predictCompany``.
    train_size : int
        Number of leading rows used for training.
    MSE_train, MSE_test : list
        Output lists; a ``(company, mse)`` tuple is appended to each for every
        company. The caller's lists are modified in place.
    cols : list of str
        Feature columns passed to ``predictCompany``.
    """
    for company in companies: 
        history, model, train_X, train_y, test_X, test_y = predictCompany(company, train_size, cols)
        # Write to the lists that were passed in (not to fixed module-level
        # lists) so the function can be reused for other feature sets.
        MSE_train.append((company, mean_squared_error(train_y, model.predict(train_X))))
        MSE_test.append((company, mean_squared_error(test_y, model.predict(test_X))))
calculateMSE(companies, train_size, MSE_train_stock_only, MSE_test_stock_only, stock_columns)

In [None]:
# Train errors on the first line, test errors on the second.
print(MSE_train_stock_only, MSE_test_stock_only, sep="\n")

In [None]:
# calculate mean squared error for each company 
# Features here are only the company's Google Trends columns.
MSE_train_trends_only, MSE_test_trends_only = [], []
for company in companies:
    print(company)
    # keyword phrases of this company, in their underscore-joined column form
    cols = [phrase.replace(" ", "_") for phrase in kw_list if company in phrase]
    history, model, train_X, train_y, test_X, test_y = predictCompany(company, train_size, cols)
    train_error = mean_squared_error(model.predict(train_X), train_y)
    MSE_train_trends_only += [(company, train_error)]
    test_error = mean_squared_error(model.predict(test_X), test_y)
    MSE_test_trends_only += [(company, test_error)]

In [None]:
# Train errors on the first line, test errors on the second.
print(MSE_train_trends_only, MSE_test_trends_only, sep="\n")

In [None]:
# calculate mean squared error for each company 
# Features here are the company's Google Trends columns plus the stock columns.
MSE_train_both, MSE_test_both = [], []
for company in companies:
    print(company)
    trend_cols = [phrase.replace(" ", "_") for phrase in kw_list if company in phrase]
    cols = trend_cols + stock_columns
    history, model, train_X, train_y, test_X, test_y = predictCompany(company, train_size, cols)
    train_error = mean_squared_error(model.predict(train_X), train_y)
    MSE_train_both += [(company, train_error)]
    test_error = mean_squared_error(model.predict(test_X), test_y)
    MSE_test_both += [(company, test_error)]

In [None]:
# Train errors on the first line, test errors on the second.
print(MSE_train_both, MSE_test_both, sep="\n")