In [1]:
import pandas as pd
import numpy as np

In [2]:
# import pre-processed CSV files

precovid_df = pd.read_csv('data/precovid.csv')
postcovid_df = pd.read_csv('data/postcovid.csv')

In [3]:
precovid_df.head(2)

Unnamed: 0,Date,Index,Open,High,Low,Close,Adj Close,Volume,Market Closed,Adj Close Moving Average 10,Adj Close Moving Average 21
0,2018-01-02,GSPTSE,16213.40039,16310.0,16181.0,16310.0,16310.0,15707340000.0,False,16310.0,16310.0
1,2018-01-03,GSPTSE,16336.7002,16386.30078,16322.59961,16371.59961,16371.59961,19686540000.0,False,16340.799805,16340.799805


In [4]:
precovid_df.info()

# shows us no null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2364 entries, 0 to 2363
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Date                         2364 non-null   object 
 1   Index                        2364 non-null   object 
 2   Open                         2364 non-null   float64
 3   High                         2364 non-null   float64
 4   Low                          2364 non-null   float64
 5   Close                        2364 non-null   float64
 6   Adj Close                    2364 non-null   float64
 7   Volume                       2364 non-null   float64
 8   Market Closed                2364 non-null   bool   
 9   Adj Close Moving Average 10  2364 non-null   float64
 10  Adj Close Moving Average 21  2364 non-null   float64
dtypes: bool(1), float64(8), object(2)
memory usage: 187.1+ KB


In [5]:
# show unique values in the 'Index' column for precovid data
unique_indexes = np.sort(precovid_df['Index'].unique())
print("Unique Index count:", len(unique_indexes))
print(unique_indexes)

Unique Index count: 3
['GSPTSE' 'IXIC' 'NYA']


In [6]:
# show unique values in the 'Index' column for post covid data
unique_indexes = np.sort(postcovid_df['Index'].unique())
print("Unique Index count:", len(unique_indexes))
print(unique_indexes)

Unique Index count: 3
['GSPTSE' 'IXIC' 'NYA']


DEFINE TARGET FEATURES

In [8]:
# defining the features and selecting predictor columns (inputs) and the label (outputs)

# create binary target, use comparison to determine if price went up or down the next day
# convert to a 1 (True) if price went up, or 0 (False) if it did not or went down
precovid_df['Target'] = (precovid_df['Adj Close'].shift(-1) > precovid_df['Adj Close']).astype(int)

# drop the last row (or any rows with NaN in Target)
precovid_df = precovid_df.dropna(subset=['Target'])

# define features and target

X = precovid_df[['Adj Close Moving Average 10', 
                       'Adj Close Moving Average 21', 
                       'Volume',
                       'Index']]

y = precovid_df['Target']


SPLIT DATA INTO TRAIN AND TEST

In [9]:
# separate the data for model training and testing, keeping time order intact

# taking the first 80% of the dates in the dataset to be used in the training
# remaining 20% to be used in testing
split_index = int(len(precovid_df) * 0.8)

X_train = X.iloc[:split_index]  # first 80% of dates
y_train = y.iloc[:split_index]

X_test = X.iloc[split_index:]   # last 20% of dates
y_test = y.iloc[split_index:]

PREPROCESSING PIPELINE

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [11]:
# create a preprocessing pipeline that will standardize the numeric data

# build the transformer
transformer = ColumnTransformer(                                                        # using columntransformer to apply preprocessing to specific columns
    transformers=[
        ('numeric_transformer', StandardScaler(),                                       # standardize the following columns
         ['Adj Close Moving Average 10', 'Adj Close Moving Average 21', 'Volume']),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), ['Index'])                   # telling onehotencoder to ignore unknown categories
    ],
    remainder='drop'                                                                    # drop remaining columns
)

preproc = Pipeline([('preproc', transformer)])      # wrapping columntransformer in preprocessing pipeline

PREPROCESSING AND CROSS VALIDATION

In [12]:
rf = RandomForestClassifier(
    class_weight='balanced',            # handle imbalanced classes
    n_estimators=500,
    max_depth=10,
    min_samples_leaf=5,
    random_state=42
)

stocks_pipeline = Pipeline([
    ('preprocess', preproc),
    ('regressor', rf)
])

# Hyperparameters

stocks_param_grid = {
    'regressor__n_estimators': [200, 500, 1000],
    'regressor__max_depth': [5, 10, 15],
    'regressor__min_samples_leaf': [5, 10]
}

stocks_cv = GridSearchCV(
    estimator=stocks_pipeline,
    param_grid=stocks_param_grid,
    cv=TimeSeriesSplit(n_splits=5),                               # 5 fold cross validation
    n_jobs=1
)

FIT THE MODEL

In [13]:
stocks_cv.fit(X_train, y_train)

# testing predictions
y_proba = stocks_cv.predict_proba(X_test)[:, 1]

# setting a custom threshold (the model tended to underestimate 1s)
threshold = 0.5
y_pred_custom = (y_proba >= threshold).astype(int)

In [14]:
from sklearn.metrics import roc_curve, auc

# Compute ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_proba)        # using true labels and predicted probabilities
roc_auc = auc(fpr, tpr)

print(roc_auc)

0.5607208448117539


In [15]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred_custom = (y_proba >= threshold).astype(int)

# compute confusion matrix values using custom threshold predictions
cm = confusion_matrix(y_test, y_pred_custom)

# some additional metrics for clarity
print("Confusion Matrix:\n", cm)
tn, fp, fn, tp = cm.ravel()
print(f"True Negatives: {tn}, False Positives: {fp}, False Negatives: {fn}, True Positives: {tp}")

from sklearn.metrics import classification_report
print("\nClassification Report:")
print(classification_report(y_test, y_pred_custom, target_names=['Down (0)', 'Up (1)']))


Confusion Matrix:
 [[195 102]
 [ 98  78]]
True Negatives: 195, False Positives: 102, False Negatives: 98, True Positives: 78

Classification Report:
              precision    recall  f1-score   support

    Down (0)       0.67      0.66      0.66       297
      Up (1)       0.43      0.44      0.44       176

    accuracy                           0.58       473
   macro avg       0.55      0.55      0.55       473
weighted avg       0.58      0.58      0.58       473



INPUTTING THE POST-COVID DATA

In [16]:
# create target column in postcovid_df
postcovid_df['Target'] = (postcovid_df['Adj Close'].shift(-1) > postcovid_df['Adj Close']).astype(int)

# drop any rows with missing target
postcovid_df = postcovid_df.dropna(subset=['Target'])

In [17]:
X_post = postcovid_df[['Adj Close Moving Average 10', 
                       'Adj Close Moving Average 21',
                       'Volume',
                       'Index']]  # including categorical column

y_post = postcovid_df['Target']  

In [18]:
# Predict probabilities and class labels on post-COVID data
y_post_proba = stocks_cv.predict_proba(X_post)[:, 1]  # probability of class 1
y_post_pred = (y_post_proba >= 0.5).astype(int)       # use your preferred threshold

In [19]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score

# results from the post-covid data in our model

auc_post = roc_auc_score(y_post, y_post_proba)
precision_post = precision_score(y_post, y_post_pred)
recall_post = recall_score(y_post, y_post_pred)

print(f"AUC (post-COVID): {auc_post:.3f}")
print(f"Precision (post-COVID): {precision_post:.3f}")
print(f"Recall (post-COVID): {recall_post:.3f}")

AUC (post-COVID): 0.526
Precision (post-COVID): 0.451
Recall (post-COVID): 0.174


EXPORT THE MODEL TO A PICKLE FILE

In [20]:
import pickle

# Save the trained model to a file
with open('stock_model.pkl', 'wb') as f:
    pickle.dump(stocks_cv, f)