In [20]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import r2_score
from tqdm import tqdm
from statistics import mean, stdev

In [71]:
def normalization(xTrain, xTest):
    std_scale = preprocessing.StandardScaler(with_mean=False)
    cols = list(xTrain.columns)
    xTrain = std_scale.fit_transform(xTrain)
    xTest = std_scale.transform(xTest)
    xTrain = pd.DataFrame(xTrain, columns=cols)
    xTest = pd.DataFrame(xTest, columns=cols)
    return xTrain, xTest

def extract_features(df):
    df['datetime'] = pd.to_datetime(df['datetime'], format="%Y-%m-%d %H:%M:%S")
    df.insert(loc=0, column='dt_year', value=df['datetime'].dt.year)
    df.insert(loc=0, column='dt_month', value=df['datetime'].dt.month)
    df.insert(loc=0, column='dt_day', value=df['datetime'].dt.day)
    df.insert(loc=0, column='dt_dayofweek', value=df['datetime'].dt.dayofweek)
    df.insert(loc=0, column='dt_hour', value=df['datetime'].dt.hour)
    df = df.drop(columns=['datetime'])
    return df

# use pearson correlation to remove redundant features and features with nan correlation to stock price change
def pearson_graph(dfx, dfy):
    matrix = dfx.to_numpy()
    matrix = np.hstack((dfy['stock_change'].to_numpy()[:,np.newaxis], matrix))
    matrix = matrix.transpose()
    corr = np.ma.corrcoef(matrix)
    corr = np.ma.getdata(corr)
    columns = np.full((corr.shape[0],), True, dtype=bool)
    for i in range(corr.shape[0]):
        if pd.isnull(corr[0,i]):
            if columns[i]:
                columns[i] = False
        for j in range(i+1, corr.shape[0]):
            if corr[i,j] >= 0.9:
                if columns[j]:
                    columns[j] = False
    selected_columns = dfx.columns[columns[1:]]
    return selected_columns, corr

def process(xTrain, yTrain, xTest, yTest):
    xTrain, xTest = normalization(xTrain, xTest)
    selected_columns, corr = pearson_graph(xTrain, yTrain)
    xTrain = xTrain[selected_columns[1:]]
    xTest = xTest[selected_columns[1:]]
    return xTrain.to_numpy(), yTrain.to_numpy(), xTest.to_numpy(), yTest.to_numpy()

def get_csv():
    x = pd.read_csv('X.csv')
    y = pd.read_csv('Y.csv')
    # extract features of datetime column
    x = extract_features(x)
    # drop datetime column from y data
    y = y.drop(columns=['datetime'])
    return x, y

def kfold_cv(x, y, model):
    nested_train_scores = list()
    nested_test_scores = list()

    outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_index, test_index in tqdm(outer_cv.split(x)):
        # split data
        xTrain = x.iloc[train_index]
        xTest = x.iloc[test_index]
        yTrain = y.iloc[train_index]
        yTest = y.iloc[test_index]
        xTrain, yTrain, xTest, yTest = process(xTrain, yTrain, xTest, yTest)

        # PCA (number of components chosen such that the amount of variance 
        # that needs to be explained is greater than the percentage specified by n_components)
        sklearn_PCA = PCA(n_components=0.95, svd_solver='full')
        xTrain = sklearn_PCA.fit_transform(xTrain)
        xTest = sklearn_PCA.transform(xTest)
        # xTrain, yTrain, xTest, yTest = df_to_numpy(xTrain, yTrain, xTest, yTest)

        md = model.fit(xTrain, yTrain)

        # Train Score
        r2 = model.score(xTrain, yTrain)
        nested_train_scores.append(r2)

        # Test Score
        r2 = model.score(xTest, yTest)
        nested_test_scores.append(r2)
    return nested_train_scores, nested_test_scores

In [72]:
x, y = get_csv()
lr = LinearRegression()
lasr = Lasso(alpha=0.1)
ridr = Ridge(alpha=60)
elr = ElasticNet(alpha=0.1, l1_ratio=0.1)

In [73]:
lr_train, lr_test = kfold_cv(x, y, lr)
lasr_train, lasr_test = kfold_cv(x, y, lasr)
ridr_train, ridr_test = kfold_cv(x, y, ridr)
elr_train, elr_test = kfold_cv(x, y, elr)

5it [01:47, 21.55s/it]
5it [01:54, 22.93s/it]
5it [02:03, 24.70s/it]
5it [02:12, 26.47s/it]


In [74]:
print("Cross validation mean R^2 scores with PCA")
print("Linear Regression (Closed):")
print("[train] {} +/- {}".format(mean(lr_train), stdev(lr_train)))
print("[test] {} +/- {}".format(mean(lr_test), stdev(lr_test)))
print("Lasso Regression:")
print("[train] {} +/- {}".format(mean(lasr_train), stdev(lasr_train)))
print("[test] {} +/- {}".format(mean(lasr_test), stdev(lasr_test)))
print("Ridge Regression:")
print("[train] {} +/- {}".format(mean(ridr_train), stdev(ridr_train)))
print("[test] {} +/- {}".format(mean(ridr_test), stdev(ridr_test)))
print("Elastic Net:")
print("[train] {} +/- {}".format(mean(elr_train), stdev(elr_train)))
print("[test] {} +/- {}".format(mean(elr_test), stdev(elr_test)))

Cross validation mean R^2 scores with PCA
Linear Regression (Closed):
[train] 0.8391422061271094 +/- 0.014876450428982392
[test] 0.30448089805210765 +/- 0.12653328431896188
Lasso Regression:
[train] 0.8089311903357289 +/- 0.014478705103926483
[test] 0.32518945304761765 +/- 0.10499498635086156
Ridge Regression:
[train] 0.8388392256079907 +/- 0.014864145355886971
[test] 0.3106787958784869 +/- 0.12438243400522321
Elastic Net:
[train] 0.8379759100466714 +/- 0.01484850971614257
[test] 0.31460748136912325 +/- 0.12194933646300489
