In [1]:
# Import and clean data
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

# Magnitude suffixes are rewritten into exponent notation so float() can parse
# the raw strings, e.g. "9.5T" -> "9.5E12" and "2.1%" -> "2.1E-2".
d = {'M':'E6', 'B':'E9', 'T':'E12', '%': 'E-2'}

# Monthly macro series: load all three, then normalise them the same way
monthlyGDP = pd.read_csv("data/processed/monthly_gdp.csv")
monthlyInflation = pd.read_csv("data/processed/monthly_inflation.csv")
monthlyUnemployment = pd.read_csv("data/processed/monthly_unemp.csv")
for macro_frame, value_col in (
    (monthlyGDP, 'GDP'),
    (monthlyInflation, 'Inflation'),
    (monthlyUnemployment, 'Unemployment'),
):
    # Dates become monthly periods so every source shares one merge key
    macro_frame['Date'] = pd.to_datetime(macro_frame['Date']).dt.to_period('M')
    macro_frame[value_col] = macro_frame[value_col].replace(d, regex=True).astype(float)

# Daily S&P 500 quotes -> monthly mean of (AdjClose - Open)
dailySP = pd.read_csv("data/processed/daily_SP500.csv", thousands=",")
price_cols = ["Open", "High", "Low", "Close", "AdjClose", "Volume"]
dailySP[price_cols] = dailySP[price_cols].apply(pd.to_numeric)
dailySP['Returns'] = dailySP['AdjClose'].sub(dailySP['Open'])
sp_months = pd.PeriodIndex(dailySP['Date'], freq="M")
monthlySP = dailySP.groupby(sp_months)['Returns'].mean()

# Daily volatility -> monthly mean
dailyVolatility = pd.read_csv("data/processed/daily_volatility.csv")
dailyVolatility["Volatility"] = pd.to_numeric(dailyVolatility["Volatility"])
vol_months = pd.PeriodIndex(dailyVolatility['Date'], freq="M")
monthlyVol = dailyVolatility.groupby(vol_months)['Volatility'].mean()

# Inner-join everything on the monthly period; reset_index() keeps the
# pre-deduplication row labels in an "index" column.
monthlyDF = (
    monthlyGDP
    .merge(monthlyInflation, how='inner', on='Date')
    .merge(monthlyUnemployment, how='inner', on='Date')
    .merge(monthlySP, how='inner', on='Date')
    .merge(monthlyVol, how='inner', on='Date')
    .drop_duplicates()
    .reset_index()
)

# Derived features; the first rows hold NaN (diff / 5-month rolling window)
# and are removed by the trailing dropna().
monthlyDF["Months Since"] = monthlyDF.index
monthlyDF = monthlyDF.assign(**{
    "GDP Delta": monthlyDF["GDP"].pct_change(periods=1).mul(100),
    "Inflation Delta": monthlyDF["Inflation"].diff().mul(100),
    "Unemployment Delta": monthlyDF["Unemployment"].diff().mul(100),
    "Returns MA": monthlyDF["Returns"].rolling(window=5).mean(),
}).dropna()

# Keeping variable space clean: only monthlyDF survives this cell
del d, macro_frame, value_col, price_cols, sp_months, vol_months
del monthlyGDP, monthlyInflation, monthlyUnemployment
del dailySP, monthlySP
del dailyVolatility, monthlyVol

# Print final result
monthlyDF


Unnamed: 0,index,Date,GDP,Inflation,Unemployment,Returns,Volatility,Months Since,GDP Delta,Inflation Delta,Unemployment Delta,Returns MA
4,4,1999-04,9.495000e+12,0.021,0.043,2.322857,23.478571,4,0.232239,0.7,0.1,1.557900
5,5,1999-05,9.534000e+12,0.018,0.042,-1.667000,26.204500,5,0.410742,-0.3,-0.1,0.644700
6,6,1999-06,9.543000e+12,0.018,0.043,3.221818,23.626364,6,0.094399,0.0,0.1,0.758432
7,7,1999-07,9.654000e+12,0.020,0.043,-2.094286,21.049048,7,1.163156,0.2,0.0,0.774417
8,8,1999-08,9.655000e+12,0.022,0.042,-0.377727,24.323636,8,0.010358,0.2,-0.1,0.281132
...,...,...,...,...,...,...,...,...,...,...,...,...
270,290,2021-06,2.290000e+13,0.064,0.059,1.512727,16.956818,270,0.792254,0.4,0.1,3.546264
271,292,2021-07,2.294000e+13,0.063,0.054,5.711905,17.603333,271,0.174672,-0.1,-0.5,3.760855
272,294,2021-08,2.325000e+13,0.061,0.052,5.131818,17.472727,272,1.351351,-0.2,-0.2,4.022523
273,296,2021-09,2.341000e+13,0.062,0.048,-11.691429,19.824762,273,0.688172,0.1,-0.4,0.236904


In [2]:
# Optimizer

# IO
import os
import pandas as pd
import numpy as np
import pickle
from datetime import date

# Models
from sklearn.linear_model import LinearRegression, SGDRegressor, ElasticNet, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

class Optimizer():
    """
    Class which tries various regressor models to pick optimal one for the dataset
    Outputs performance of each, and optionally saves them to Pickle

    Attributes:
        scores: model name -> value returned by the model's ``score`` on the
            most recent test set (0 until the model has been trained)
        mse: model name -> root mean squared error on the most recent test set
            (the attribute name is historical; the stored value is the RMSE)
    """

    # Folder, relative to the working directory, that holds the pickled models
    _EXPORT_DIR = "exports"

    def __init__(self):
        self._models = {
            "Linear": LinearRegression(),
            "SGD": SGDRegressor(),
            "Elastic": ElasticNet(),
            "Bayesian": BayesianRidge(),
            "Kernel": KernelRidge(),
            "Gradient": GradientBoostingRegressor(),
            "SVR": SVR()
        }

        self._model_types = list(self._models.keys())
        self.scores = dict.fromkeys(self._model_types, 0)
        self.mse = dict.fromkeys(self._model_types, 0)

    def _export_path(self, stem: str) -> str:
        """ Returns the path of ``<stem>.sav`` inside the export folder """
        return os.path.join(self._EXPORT_DIR, "{}.sav".format(stem))

    def train_model(self, model: str, x: pd.DataFrame, y: pd.DataFrame,
    split=True, x_test_in=None, y_test_in=None, print_results=True):
        """
        Fits one model and records its RMSE and score on held-out data

        With split=True, x/y are split 75/25 at random and x_test_in/y_test_in
        are ignored. With split=False, x/y are used for fitting and
        x_test_in/y_test_in for evaluation.

        Raises:
            KeyError: model is not one of the known model names
            ValueError: split=False but no test data was supplied
        """
        model_mixin = self._models[model]

        if split:
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
        else:
            # Fail here with a clear message instead of deep inside predict()
            if x_test_in is None or y_test_in is None:
                raise ValueError("x_test_in and y_test_in are required when split=False")
            x_train, y_train, x_test, y_test = x, y, x_test_in, y_test_in

        model_mixin.fit(x_train, y_train)
        y_pred = model_mixin.predict(x_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        self.mse[model] = rmse
        score = model_mixin.score(x_test, y_test)
        self.scores[model] = score
        if print_results:
            print(f"{model} model had RMSE of {rmse} and R2 score of {score}")

    def train_all_models(self, x: pd.DataFrame, y: pd.DataFrame, 
    split=True, x_test_in=None, y_test_in=None, print_results=True):
        """ Wrapper around train_model, calls on all models """
        for model in self._model_types:
            self.train_model(model, x, y, split, x_test_in, y_test_in, print_results)

    def save_model(self, model: str, filename = "") -> bool:
        """
        Saves the model passed to function using Pickle

        The file is written to exports/<filename>.sav, or to
        exports/<model>_<today>.sav when no filename is given. The export
        folder is created if it does not exist yet.
        Returns Bool indicating success (currently only fails if an invalid
        model name is passed)
        """
        if model not in self._model_types:
            return False

        if not filename:
            filename = "{}_{}".format(model, str(date.today()))
        path = self._export_path(filename)

        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Context manager so the handle is closed even if pickling fails
        with open(path, 'wb') as handle:
            pickle.dump(self._models[model], handle)
        return True

    def save_all_models(self) -> None:
        """ Wrapper around save model, calls on all models """
        for model in self._model_types:
            self.save_model(model)

    def load_model(self, model: str, filename: str) -> bool:
        """
        Replaces the named model with the object pickled at ``filename``

        ``filename`` is the full path to the pickle file, used as given.
        Returns False when the model name is unknown or the file does not
        exist, True once the model has been loaded.
        """
        if model not in self._model_types:
            return False
        if not os.path.isfile(filename):
            return False

        # NOTE: unpickling executes arbitrary code; only load files this
        # project wrote itself.
        with open(filename, 'rb') as handle:
            self._models[model] = pickle.load(handle)
        return True

    def load_all_models(self, date: date) -> bool:
        """ 
        Wrapper around load model, calls on all models

        Looks for exports/<model>_<date>.sav, i.e. the names save_model uses
        by default, where <date> is str(date).
        Returns False if ANY loads fail
        """
        ret = True
        for model in self._model_types:
            path = self._export_path("{}_{}".format(model, str(date)))
            if not self.load_model(model, path):
                ret = False
        return ret


# Single shared Optimizer instance; later cells train, score and save models through it
opt = Optimizer()
print("Trainer object loaded")


Trainer object loaded


In [3]:
# Baseline: fit every model on elapsed months alone to predict the GDP level
# (train_all_models performs its own random train/test split here)
opt.train_all_models(
    x=monthlyDF[['Months Since']],
    y=monthlyDF['GDP'],
)


Linear model had RMSE of 442472597512.37964 and R2 score of 0.9846143001814067
SGD model had RMSE of 22628199363351.59 and R2 score of -36.670692778662435
Elastic model had RMSE of 463640007819.87946 and R2 score of 0.9839089393439351
Bayesian model had RMSE of 3503134282810.1743 and R2 score of -0.0029201998391306105
Kernel model had RMSE of 3733974856665.251 and R2 score of -0.07591635531545449
Gradient model had RMSE of 318407380395.6698 and R2 score of 0.9927481296866789
SVR model had RMSE of 3495756538870.017 and R2 score of -0.0020726728784787873


In [4]:
# Predicting % Changes in GDP, using sampling
# 75% of the months are drawn for training and then resampled with replacement
# up to 2000 rows; the months that were never drawn form the test set.
# Both draws are unseeded, so the scores change from run to run.
train = monthlyDF.sample(frac = 0.75).sample(n = 2000, replace = True)
test = monthlyDF.drop(train.index)

x_vars = [
    'Returns',
    'Inflation',
    'Inflation Delta',
    'Unemployment',
    'Unemployment Delta',
    'Volatility'
]
# split=False: the optimizer must use the hand-made split above as-is
opt.train_all_models(
    train[x_vars],
    train['GDP Delta'],
    split=False,
    x_test_in=test[x_vars],
    y_test_in=test['GDP Delta']
)

# Clean variable space
del train
del test
del x_vars

Linear model had RMSE of 0.6424533920076545 and R2 score of -0.22308811740136947
SGD model had RMSE of 0.9924302789002051 and R2 score of -1.91859951639012
Elastic model had RMSE of 0.565050851575356 and R2 score of 0.05387272702119683
Bayesian model had RMSE of 0.6397348925104708 and R2 score of -0.2127591821394188
Kernel model had RMSE of 0.6364767474149433 and R2 score of -0.20043756766195697
Gradient model had RMSE of 0.676280010352666 and R2 score of -0.3552755245841699
SVR model had RMSE of 0.5981369753611013 and R2 score of -0.060170693010501486


In [5]:
# Creating models for indicators of the 08 recession
mask = (monthlyDF['Months Since'] > 118) & (monthlyDF['Months Since'] < 142)
# .copy() gives an independent frame, so the re-basing below is not a
# chained assignment on a slice of monthlyDF (SettingWithCopyWarning).
housing_recession = monthlyDF.loc[mask].copy()
# Re-base the month counter; 119 is the first value the mask admits
housing_recession['Months Since'] -= 119

# Upper bound on re-drawn splits per indicator. Without it, an indicator the
# Gradient model cannot fit to the threshold would keep the cell running forever.
max_attempts = 1000

for indicator in ['Unemployment', 'Volatility', 'Inflation', 'Returns MA']:
    # NOTE(review): the split is re-drawn until the *test* score clears 0.85,
    # so the reported scores are selected on the test set and are optimistic.
    for attempt in range(max_attempts):
        train = housing_recession.sample(frac = 0.66).sample(n = 500, replace = True)
        test = housing_recession.drop(train.index)

        opt.train_all_models(
            train[['Months Since']],
            train[indicator],
            split=False,
            x_test_in=test[['Months Since']],
            y_test_in=test[indicator],
            print_results=False
        )

        if opt.scores['Gradient'] > 0.85:
            filename = f"Gradient-{indicator}"
            opt.save_model('Gradient', filename)
            break
    else:
        # Only reached when no attempt hit the threshold (loop ended without break)
        raise RuntimeError(
            f"Gradient model for {indicator} did not exceed a score of 0.85 "
            f"in {max_attempts} resampled splits"
        )
    
    print(f"\n===== {indicator} =====")
    for model in opt.scores:
        print(f"{model} has a rmse of: {opt.mse[model]} and score of: {opt.scores[model]}")

# Clean variable space
del mask
del max_attempts


===== Unemployment =====
Linear has a rmse of: 0.006333203786434125 and score of: -0.028447953859064734
SGD has a rmse of: 0.010941468820654792 and score of: -2.069634357778485
Elastic has a rmse of: 0.006642036133596385 and score of: -0.1311960000000001
Bayesian has a rmse of: 0.006330543257425171 and score of: -0.027584049593109894
Kernel has a rmse of: 0.03969515056393882 and score of: -39.40269175112235
Gradient has a rmse of: 0.0022583248431083188 and score of: 0.8692299718717895
SVR has a rmse of: 0.01136881700090207 and score of: -2.3141025641025617

===== Volatility =====
Linear has a rmse of: 7.350393435258198 and score of: 0.5419855152109003
SGD has a rmse of: 6.1215504082997185 and score of: 0.6823265630642106
Elastic has a rmse of: 7.377261897522324 and score of: 0.5386309636996842
Bayesian has a rmse of: 7.352499891987918 and score of: 0.5417229643840711
Kernel has a rmse of: 24.027469779254623 and score of: -3.8941144470734192
Gradient has a rmse of: 3.892434346108393 an

In [6]:
# Using 08 models and applying to economic data during COVID
import pickle

mask = (monthlyDF['Months Since'] > 254)
# .copy() so the re-basing below edits an independent frame, not a slice of monthlyDF
covid_recession = monthlyDF.loc[mask].copy()
covid_recession['Months Since'] -= 255
x = covid_recession[['Months Since']]

# Per-indicator constants that map the COVID series onto the scale of the 2008
# window: a mean offset (housing - covid) and a range ratio (housing / covid).
normalMean = {}
normalRange = {}
for indicator in ['Inflation', 'Volatility', 'Unemployment', 'Returns MA']:
    normalMean[indicator] = housing_recession[indicator].mean() - covid_recession[indicator].mean()
    housing_range = housing_recession[indicator].max() - housing_recession[indicator].min()
    covid_range = covid_recession[indicator].max() - covid_recession[indicator].min()
    normalRange[indicator] = housing_range / covid_range

# Make sure the output folder exists before the first CSV is written
predictions_dir = os.path.join("ml", "predictions")
os.makedirs(predictions_dir, exist_ok=True)

for ind in normalMean:
    # Built with os.path.join so the path matches what Optimizer.save_model
    # writes on every platform (a literal backslash only works on Windows).
    model_path = os.path.join("exports", f"Gradient-{ind}.sav")
    with open(model_path, 'rb') as model_file:
        gr = pickle.load(model_file)

    # NOTE(review): the offset is applied before the scaling, so the rescaled
    # mean equals the housing mean times the range ratio - confirm intended.
    y = (covid_recession[ind] + normalMean[ind]) * normalRange[ind]

    y_pred = gr.predict(x)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    score = gr.score(x, y)

    print(f"{ind} during Covid was estimated with a RMSE of: {rmse} and score of: {score}")

    predictions = pd.DataFrame(list(zip(y_pred, y)), columns = ['Prediction', 'Actual'])
    predictions.to_csv(os.path.join(predictions_dir, f"{ind}.csv"))

# Clean variable space
del mask
del x
del covid_recession
del normalMean
del normalRange
del predictions_dir


Inflation during Covid was estimated with a RMSE of: 0.01172606292652328 and score of: 0.7221861626472037
Volatility during Covid was estimated with a RMSE of: 5.926270473110732 and score of: 0.683112831636121
Unemployment during Covid was estimated with a RMSE of: 0.06513765450866668 and score of: -52.702418466249824
Returns MA during Covid was estimated with a RMSE of: 2.114628481838211 and score of: -0.04027288487250935


In [7]:
# Reading and cleaning COVID related data
# The three series that are averaged per month; the frame ends up with a
# monthly 'Date' column followed by exactly these columns.
covid_features = [
    'new_cases_smoothed_per_million',
    'new_deaths_smoothed_per_million',
    'stringency_index',
]

covidData = pd.read_csv("data/owid-covid-data.csv")
covidData = covidData.loc[covidData['location'] == 'United States']

# Group on the calendar month of each row. Only the numeric feature columns
# are selected before mean(): averaging the string 'location' column raises
# TypeError on pandas >= 2.0.
covid_month = pd.to_datetime(covidData['date']).dt.to_period('M')
covidData = (
    covidData
    .groupby(covid_month)[covid_features]
    .mean()
    .reset_index()
    .rename(columns={'date': 'Date'})
)

del covid_features
del covid_month

covidData

Unnamed: 0,Date,new_cases_smoothed_per_million,new_deaths_smoothed_per_million,stringency_index
0,2020-01,0.00225,0.0,0.0
1,2020-02,0.001345,0.0,5.368276
2,2020-03,12.310871,0.300129,45.774516
3,2020-04,87.490933,5.697667,72.69
4,2020-05,71.002516,4.344645,72.69
5,2020-06,77.929967,2.115467,70.711333
6,2020-07,179.905613,2.343548,68.263871
7,2020-08,149.146387,3.00671,67.13
8,2020-09,120.874,2.359633,64.197667
9,2020-10,173.267032,2.308387,64.051613


In [8]:
# Merging covidData to monthlyDF
# NOTE(review): the outer join also keeps months that exist only in covidData;
# those rows carry NaN in every economic column - confirm this is wanted.
monthlyDF = monthlyDF.merge(covidData, how='outer', on='Date')

# Months with no COVID record get 0 for the three COVID series
monthlyDF = monthlyDF.fillna({
    'new_cases_smoothed_per_million': 0,
    'new_deaths_smoothed_per_million': 0,
    'stringency_index': 0,
})

# Clean variable space
del covidData

# Print final output
monthlyDF

Unnamed: 0,index,Date,GDP,Inflation,Unemployment,Returns,Volatility,Months Since,GDP Delta,Inflation Delta,Unemployment Delta,Returns MA,new_cases_smoothed_per_million,new_deaths_smoothed_per_million,stringency_index
0,4.0,1999-04,9.495000e+12,0.021,0.043,2.322857,23.478571,4.0,0.232239,0.7,0.1,1.557900,0.000000,0.000000,0.000000
1,5.0,1999-05,9.534000e+12,0.018,0.042,-1.667000,26.204500,5.0,0.410742,-0.3,-0.1,0.644700,0.000000,0.000000,0.000000
2,6.0,1999-06,9.543000e+12,0.018,0.043,3.221818,23.626364,6.0,0.094399,0.0,0.1,0.758432,0.000000,0.000000,0.000000
3,7.0,1999-07,9.654000e+12,0.020,0.043,-2.094286,21.049048,7.0,1.163156,0.2,0.0,0.774417,0.000000,0.000000,0.000000
4,8.0,1999-08,9.655000e+12,0.022,0.042,-0.377727,24.323636,8.0,0.010358,0.2,-0.1,0.281132,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,294.0,2021-08,2.325000e+13,0.061,0.052,5.131818,17.472727,272.0,1.351351,-0.2,-0.2,4.022523,395.391903,2.467355,54.258065
269,296.0,2021-09,2.341000e+13,0.062,0.048,-11.691429,19.824762,273.0,0.688172,0.1,-0.4,0.236904,429.353633,5.549300,54.536000
270,298.0,2021-10,2.391000e+13,0.073,0.046,10.991429,17.871429,274.0,2.135839,1.1,-0.2,2.331290,258.634516,5.004968,55.123226
271,,2021-11,,,,,,,,,,,243.203767,3.344367,46.575000


In [9]:
# Creating models for indicators of the covid recession
mask = (monthlyDF['Months Since'] > 251)
# .copy() gives an independent frame, so the re-basing below is not a
# chained assignment on a slice of monthlyDF (SettingWithCopyWarning).
covid_deaths = monthlyDF.loc[mask].copy()
# NOTE(review): the mask admits months above 251 but the offset is 255, so the
# re-based counter can start below zero - confirm this is intended.
covid_deaths['Months Since'] -= 255

# Predictors shared by the training and the test frame
feature_cols = ['Months Since', 'new_cases_smoothed_per_million', 'new_deaths_smoothed_per_million', 'stringency_index']

# Upper bound on re-drawn splits per indicator. Without it, an indicator the
# Gradient model cannot fit to the threshold would keep the cell running forever.
max_attempts = 1000

for indicator in ['Inflation', 'Volatility', 'Unemployment', 'Returns MA']:
    # NOTE(review): the split is re-drawn until the *test* score clears 0.85,
    # so the reported scores are selected on the test set and are optimistic.
    for attempt in range(max_attempts):
        train = covid_deaths.sample(frac = 0.66).sample(n = 500, replace = True)
        test = covid_deaths.drop(train.index)

        opt.train_all_models(
            train[feature_cols],
            train[indicator],
            split=False,
            x_test_in=test[feature_cols],
            y_test_in=test[indicator],
            print_results=False
        )

        if opt.scores['Gradient'] > 0.85:
            filename = f"Gradient_cov-{indicator}"
            opt.save_model('Gradient', filename)
            break
    else:
        # Only reached when no attempt hit the threshold (loop ended without break)
        raise RuntimeError(
            f"Gradient model for {indicator} did not exceed a score of 0.85 "
            f"in {max_attempts} resampled splits"
        )
    
    print(f"\n===== {indicator} =====")
    for model in opt.scores:
        print(f"{model} has a rmse of: {opt.mse[model]} and score of: {opt.scores[model]}")

# Clean variable space
del mask
del feature_cols
del max_attempts


===== Inflation =====
Linear has a rmse of: 0.0067209719509177205 and score of: 0.9357618501963947
SGD has a rmse of: 14256484491318.508 and score of: -2.8903720565454464e+29
Elastic has a rmse of: 0.027004790982342375 and score of: -0.037075795573726866
Bayesian has a rmse of: 0.006713594411571178 and score of: 0.9359027998605212
Kernel has a rmse of: 0.014117977099707897 and score of: 0.716551734227533
Gradient has a rmse of: 0.005234241070948207 and score of: 0.9610384433898483
SVR has a rmse of: 0.026716100014785093 and score of: -0.015020887032263674

===== Volatility =====
Linear has a rmse of: 5.002213582928418 and score of: 0.5916303518215242
SGD has a rmse of: 150216864593642.28 and score of: -3.682700319209593e+26
Elastic has a rmse of: 4.744918776761355 and score of: 0.6325598930421664
Bayesian has a rmse of: 4.913375586675359 and score of: 0.6060066235387727
Kernel has a rmse of: 7.060970250649155 and score of: 0.18631228878300565
Gradient has a rmse of: 2.633639363911058 

In [10]:
# Using covid models and applying to economic data during 08 recession
mask = (monthlyDF['Months Since'] > 118) & (monthlyDF['Months Since'] < 142)
# .copy() so the re-basing below edits an independent frame, not a slice of monthlyDF
housing_recession = monthlyDF.loc[mask].copy()
housing_recession['Months Since'] -= 119
# Same four predictors the COVID models were trained on
x = housing_recession[['Months Since', 'new_cases_smoothed_per_million', 'new_deaths_smoothed_per_million', 'stringency_index']]

# Per-indicator constants that map the 2008 series onto the scale of the COVID
# window: a mean offset (covid - housing) and a range ratio (covid / housing).
normalMean = {}
normalRange = {}
for indicator in ['Inflation', 'Volatility', 'Unemployment', 'Returns MA']:
    normalMean[indicator] = covid_deaths[indicator].mean() - housing_recession[indicator].mean()
    housing_range = housing_recession[indicator].max() - housing_recession[indicator].min()
    covid_range = covid_deaths[indicator].max() - covid_deaths[indicator].min()
    normalRange[indicator] = covid_range / housing_range

# Make sure the output folder exists before the first CSV is written
predictions_dir = os.path.join("ml", "predictions")
os.makedirs(predictions_dir, exist_ok=True)

for ind in normalMean:
    # Built with os.path.join so the path matches what Optimizer.save_model
    # writes on every platform (a literal backslash only works on Windows).
    model_path = os.path.join("exports", f"Gradient_cov-{ind}.sav")
    with open(model_path, 'rb') as model_file:
        gr = pickle.load(model_file)

    # NOTE(review): the offset is applied before the scaling, so the rescaled
    # mean equals the COVID mean times the range ratio - confirm intended.
    y = (housing_recession[ind] + normalMean[ind]) * normalRange[ind]

    y_pred = gr.predict(x)
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    score = gr.score(x, y)

    print(f"{ind} during recession of 08 mapped from Covid was estimated with a RMSE of: {rmse} and score of: {score}")

    predictions = pd.DataFrame(list(zip(y_pred, y)), columns = ['Prediction', 'Actual'])
    predictions.to_csv(os.path.join(predictions_dir, f"{ind}cov.csv"))

# Clean variable space
del mask
del x
del covid_deaths
del normalMean
del normalRange
del predictions_dir


Inflation during recession of 08 mapped from Covid was estimated with a RMSE of: 0.011492602238661402 and score of: 0.8140525676272509
Volatility during recession of 08 mapped from Covid was estimated with a RMSE of: 6.68696637508655 and score of: 0.6452365981715584
Unemployment during recession of 08 mapped from Covid was estimated with a RMSE of: 0.19529939600995974 and score of: -42.03702341091725
Returns MA during recession of 08 mapped from Covid was estimated with a RMSE of: 2.035916254406029 and score of: -0.7233999804986149
