In [31]:
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.metrics import accuracy_score

In [749]:
# read in interest rate csv file
interest = pd.read_csv("Clean_Data/clean_fedfunds.csv")
# change data types from int to string
interest["Month"] = interest["Month"].astype(str)
interest["Year"] = interest["Year"].astype(str)
# filter from 2002
interest = interest.loc[interest["Year"] >= "2002"]
interest.head()

Unnamed: 0,Year,Month,FEDFUNDS
570,2002,1,1.73
571,2002,2,1.74
572,2002,3,1.73
573,2002,4,1.75
574,2002,5,1.75


In [750]:
# read in industrial output csv file
indus = pd.read_csv("Clean_Data/Industrial.csv")
#change data types from int to string
indus["Month"] = indus["Month"].astype(str)
indus["Year"] = indus["Year"].astype(str)
indus.head()

Unnamed: 0,DATE,INDPRO,Month,Year
0,2002-01-01,88.6706,1,2002
1,2002-02-01,88.6723,2,2002
2,2002-03-01,89.3624,3,2002
3,2002-04-01,89.7933,4,2002
4,2002-05-01,90.1629,5,2002


In [751]:
# read in Retail Sales data 
sales = pd.read_csv("Clean_Data/Retail_Sales.csv")
# change out int to string
sales["Month"] = sales["Month"].astype(str)
sales["Year"] = sales["Year"].astype(str)
sales.head()

Unnamed: 0,DATE,RSXFS,Year,Month
0,2002-01-01,256307,2002,1
1,2002-02-01,257670,2002,2
2,2002-03-01,257059,2002,3
3,2002-04-01,261333,2002,4
4,2002-05-01,257573,2002,5


In [752]:
# read in inflation data
inflation = pd.read_csv("Clean_Data/inflation.csv")
#change to string
inflation["Year"] = inflation["Year"].astype(str)
inflation.head()

Unnamed: 0,Year,1,2,3,4,5,6,7,8,9,10,11,12
0,2002,2.6,2.6,2.4,2.5,2.5,2.3,2.2,2.4,2.2,2.2,2.0,1.9
1,2003,1.9,1.7,1.7,1.5,1.6,1.5,1.5,1.3,1.2,1.3,1.1,1.1
2,2004,1.1,1.2,1.6,1.8,1.7,1.9,1.8,1.7,2.0,2.0,2.2,2.2
3,2005,2.3,2.4,2.3,2.2,2.2,2.0,2.1,2.1,2.0,2.1,2.1,2.2
4,2006,2.1,2.1,2.1,2.3,2.4,2.6,2.7,2.8,2.9,2.7,2.6,2.6


In [753]:
# read in unemployment data
unemployment = pd.read_csv("Clean_Data/Unemployment.csv")
#change int to string
unemployment["Year"] = unemployment["Year"].astype(str)
unemployment.head()

Unnamed: 0,Year,1,2,3,4,5,6,7,8,9,10,11,12
0,2002,5.7,5.7,5.7,5.9,5.8,5.8,5.8,5.7,5.7,5.7,5.9,6.0
1,2003,5.8,5.9,5.9,6.0,6.1,6.3,6.2,6.1,6.1,6.0,5.8,5.7
2,2004,5.7,5.6,5.8,5.6,5.6,5.6,5.5,5.4,5.4,5.5,5.4,5.4
3,2005,5.3,5.4,5.2,5.2,5.1,5.0,5.0,4.9,5.0,5.0,5.0,4.9
4,2006,4.7,4.8,4.7,4.7,4.6,4.6,4.7,4.7,4.5,4.4,4.5,4.4


In [963]:
# create empty dataframe
data = pd.DataFrame()
ticker_list = ["AAPL", "NVDA", "AMZN", "TSLA", "MSFT", "GOOG", "META"]
#read in tickers
for i in ticker_list:
    temp = pd.read_csv(f"Extracting_Data/{i}/{i}_monthly_1_False.csv")
    # add ticker name to each row
    temp["Ticker"] = i
    # combines each ticker csv on top of eachother
    data = pd.concat([data, temp])
data.head()

Unnamed: 0,Datetime,Open,Close,High,Low,Volume,Gain%,Ticker
0,2002-06-30 22:00:00,0.31625,0.2725,0.335536,0.246429,8118415200,-13.833992,AAPL
1,2002-07-31 22:00:00,0.269821,0.263393,0.290179,0.249464,5151686400,-2.382529,AAPL
2,2002-08-31 22:00:00,0.25875,0.258929,0.27125,0.250893,5291837600,0.069014,AAPL
3,2002-09-30 22:00:00,0.260536,0.286964,0.293571,0.238571,6611432800,10.143926,AAPL
4,2002-10-31 22:00:00,0.284643,0.276786,0.310357,0.268036,4531721600,-2.760345,AAPL


In [973]:
data["Datetime"].value_counts()

2012-06-30 22:00:00    7
2014-04-30 22:00:00    7
2014-06-30 22:00:00    7
2014-07-31 22:00:00    7
2014-08-31 22:00:00    7
                      ..
2004-02-29 22:00:00    4
2004-03-31 22:00:00    4
2004-04-30 22:00:00    4
2004-05-31 22:00:00    4
2002-06-30 22:00:00    3
Name: Datetime, Length: 239, dtype: int64

In [964]:
#add a future column where shows the gain and add column to see where tickers overlap
data['future'] = data["Gain%"].shift(-1)
data["Mov_Ticker"] = data["Ticker"].shift(-1)
data = data.dropna()

#drop rows where ticker info overlaps
data = data.loc[data["Mov_Ticker"] == data["Ticker"]]
data.head()

Unnamed: 0,Datetime,Open,Close,High,Low,Volume,Gain%,Ticker,future,Mov_Ticker
0,2002-06-30 22:00:00,0.31625,0.2725,0.335536,0.246429,8118415200,-13.833992,AAPL,-2.382529,AAPL
1,2002-07-31 22:00:00,0.269821,0.263393,0.290179,0.249464,5151686400,-2.382529,AAPL,0.069014,AAPL
2,2002-08-31 22:00:00,0.25875,0.258929,0.27125,0.250893,5291837600,0.069014,AAPL,10.143926,AAPL
3,2002-09-30 22:00:00,0.260536,0.286964,0.293571,0.238571,6611432800,10.143926,AAPL,-2.760345,AAPL
4,2002-10-31 22:00:00,0.284643,0.276786,0.310357,0.268036,4531721600,-2.760345,AAPL,-9.874217,AAPL


In [965]:
def data_transform(data):
    #create arrays
    inflation_stat = []
    unemployment_stat = []
    #iterate through each datetime in the dataset
    for i in data["Datetime"]:
        #calculates if the month is formatted as 0# and assigns month to number #
        if i[5] == "0":
            month = i[6]
        # if formatted as ## then month assigned this number ##
        else:
            month = i[5:7]

        # locates in inflation/unemployment data when the year is the same as the current itteration
        inf_month = inflation.loc[inflation["Year"] == [i][0][:4]]
        unemp_month = unemployment.loc[unemployment["Year"] == [i][0][:4]]
        
        #locates the month of the filtered data and extracts the value
        inflation_stat.append(inf_month[month].values[0])
        unemployment_stat.append(unemp_month[month].values[0])

    # adds arrays to dataset
    data["Inflation"] = inflation_stat
    data["Unemployment"] = unemployment_stat
    # eliminates month from data
    data = data.loc[data["Datetime"] < "2022-06"]
    
    # create arrays
    industrial_stat = []
    interest_stat = []
    sales_stat = []
    #iterate through each datetime in the dataset
    for i in data["Datetime"]:
        #calculates if the month is formatted as 0# and assigns month to number #
        if i[5] == "0":
            month = i[6]
        # if formatted as ## then month assigned this number ##
        else:
            month = i[5:6]
        
        # filters datasets with the year in the iteration and when the month of the dataset is = "month"
        row = interest.loc[(interest["Year"] == [i][0][:4]) & (interest["Month"] == month)]
        ind = indus.loc[(indus["Year"] == [i][0][:4]) & (indus["Month"] == month)]
        sal = sales.loc[(sales["Year"] == [i][0][:4]) & (sales["Month"] == month)]
        
        # append stats to arrays
        interest_stat.append(row["FEDFUNDS"].values[0])
        industrial_stat.append(ind["INDPRO"].values[0])
        sales_stat.append(sal["RSXFS"].values[0])
    # adds arrays to dataset
    data["Industrial"] = industrial_stat
    data["Interest"] = interest_stat
    data["Retail_Sales"] = sales_stat
    
    # add new columns with percent changed
    data["Interest_pct"] = data["Interest"].pct_change()
    data["Inflation_pct"] = data["Inflation"].pct_change()
    data["Unemployment_pct"] = data["Unemployment"].pct_change()
    data["Industrial_pct"] = data["Industrial"].pct_change()
    data["Volume_pct"] = data["Volume"].pct_change()
    data["Retail_Sales_pct"] = data["Retail_Sales"].pct_change() 
    data = data.dropna()

    # calculates if the future gain is positive or negative
    R_g = []
    for x in data["future"]:
        if x <= 0:
            R_g.append(0)
        if x > 0:
            R_g.append(1)
    #add column for calculation
    data["R/G"] = R_g
    return data


In [966]:
# call function
data = data_transform(data)
data.head()

Unnamed: 0,Datetime,Open,Close,High,Low,Volume,Gain%,Ticker,future,Mov_Ticker,...,Industrial,Interest,Retail_Sales,Interest_pct,Inflation_pct,Unemployment_pct,Industrial_pct,Volume_pct,Retail_Sales_pct,R/G
1,2002-07-31 22:00:00,0.269821,0.263393,0.290179,0.249464,5151686400,-2.382529,AAPL,0.069014,AAPL,...,90.8761,1.73,262769,-0.011429,-0.043478,0.0,-0.000349,-0.365432,0.011483,1
2,2002-08-31 22:00:00,0.25875,0.258929,0.27125,0.250893,5291837600,0.069014,AAPL,10.143926,AAPL,...,90.785,1.74,265043,0.00578,0.090909,-0.017241,-0.001002,0.027205,0.008654,1
3,2002-09-30 22:00:00,0.260536,0.286964,0.293571,0.238571,6611432800,10.143926,AAPL,-2.760345,AAPL,...,90.8931,1.75,260626,0.005747,-0.083333,0.0,0.001191,0.249364,-0.016665,0
4,2002-10-31 22:00:00,0.284643,0.276786,0.310357,0.268036,4531721600,-2.760345,AAPL,-9.874217,AAPL,...,88.6706,1.73,256307,-0.011429,0.0,0.0,-0.024452,-0.314563,-0.016572,0
5,2002-11-30 22:00:00,0.283929,0.255893,0.2875,0.246071,4530439200,-9.874217,AAPL,0.0,AAPL,...,88.6706,1.73,256307,0.0,-0.090909,0.035088,0.0,-0.000283,0.0,0


In [967]:
#index to test/predict for
tester = pd.read_csv("Extracting_Data/QQQ/QQQ_monthly_1_False.csv")
tester['future'] = tester["Gain%"].shift(-1)
tester = tester.dropna()
tester.head()

Unnamed: 0,Datetime,Open,Close,High,Low,Volume,Gain%
0,2002-06-30 22:00:00,25.97,23.85,26.55,21.64,2668049100,-8.163265
1,2002-07-31 22:00:00,23.74,23.49,26.21,21.3,2043810216,-1.053075
2,2002-08-31 22:00:00,23.06,20.72,24.35,20.49,1669047392,-10.147441
3,2002-09-30 22:00:00,20.91,24.55,25.04,19.76,2129150120,17.407939
4,2002-10-31 22:00:00,24.37,27.72,28.29,24.14,1669954692,13.74641


In [971]:
def test(tester):
    inflation_stat = []
    unemployment_stat = []
    for i in tester["Datetime"]:
        if i[5] == "0":
            month = i[6]
        else:
            month = i[5:7]
        inf_month = inflation.loc[inflation["Year"] == [i][0][:4]]
        unemp_month = unemployment.loc[unemployment["Year"] == [i][0][:4]]
        inflation_stat.append(inf_month[month].values[0])
        unemployment_stat.append(unemp_month[month].values[0])
    tester["Inflation"] = inflation_stat
    tester["Unemployment"] = unemployment_stat
    industrial_stat = []
    interest_stat = []
    sales_stat = []
    for i in tester["Datetime"]:
        if i[5] == "0":
            month = i[6]
        else:
            month = i[5:6]
        row = interest.loc[(interest["Year"] == [i][0][:4]) & (interest["Month"] == month)]
        ind = indus.loc[(indus["Year"] == [i][0][:4]) & (indus["Month"] == month)]
        sal = sales.loc[(sales["Year"] == [i][0][:4]) & (sales["Month"] == month)]
        interest_stat.append(row["FEDFUNDS"].values[0])
        industrial_stat.append(ind["INDPRO"].values[0])
        sales_stat.append(sal["RSXFS"].values[0])
    tester["Industrial"] = industrial_stat
    tester["Interest"] = interest_stat
    tester["Retail_Sales"] = sales_stat
    tester["Interest_pct"] = tester["Interest"].pct_change()
    tester["Inflation_pct"] = tester["Inflation"].pct_change()
    tester["Unemployment_pct"] = tester["Unemployment"].pct_change()
    tester["Industrial_pct"] = tester["Industrial"].pct_change()
    tester["Volume_pct"] = tester["Volume"].pct_change()
    tester["Retail_Sales_pct"] = tester["Retail_Sales"].pct_change()
    tester['future'] = tester["Gain%"].shift(-1)
    tester = tester.dropna()
    R_g = []
    for x in tester["future"]:
        if x <= 0:
            R_g.append(0)
        if x > 0:
            R_g.append(1)
    tester["R/G"] = R_g

    tester.head()

In [969]:
X = data[[ "Interest_pct", "Volume_pct","Volume","Unemployment_pct", "Industrial_pct", "Retail_Sales_pct", "Inflation", "Interest", "Industrial", "Retail_Sales", "Unemployment"]]
y = data["R/G"]

In [970]:
TestX = tester[[ "Interest_pct", "Volume_pct", "Volume", "Unemployment_pct", "Industrial_pct", "Retail_Sales_pct", "Inflation", "Interest", "Industrial", "Retail_Sales", "Unemployment"]]
Testy = tester["R/G"]

KeyError: "['Interest_pct', 'Volume_pct', 'Unemployment_pct', 'Industrial_pct', 'Retail_Sales_pct', 'Inflation', 'Interest', 'Industrial', 'Retail_Sales', 'Unemployment'] not in index"

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# Standardize data set
scaler = StandardScaler()
# Fit X_train data
scaler.fit(X_train)
#Transform scaled data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)

accuracy_score(y_test, y_pred)

In [None]:
scaled = scaler.transform(TestX)
# Evaluate the model
y_pred = rf_model.predict(scaled)

accuracy_score(Testy, y_pred)

In [787]:

# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.19404201361880488, 'Volume'),
 (0.18626943475114147, 'Volume_pct'),
 (0.07865473579485405, 'Industrial_pct'),
 (0.07736034107012428, 'Unemployment_pct'),
 (0.07712739155476805, 'Industrial'),
 (0.0695790025879238, 'Retail_Sales'),
 (0.06744605135116773, 'Retail_Sales_pct'),
 (0.06581555986877667, 'Unemployment'),
 (0.06271934942912445, 'Inflation'),
 (0.06105833649323741, 'Interest'),
 (0.059927783480077074, 'Interest_pct')]

In [None]:
data.csv("Tech_Stocks.csv", index = F)