In [1]:
import pandas as pd
import numpy as np

In [2]:
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.ticker as mtick
from plotly.offline import iplot
import plotly.offline as offline
offline.init_notebook_mode(connected=True)


# Financial Data Analysis
import yfinance as yf
import ta
import quantstats as qs


In [3]:
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [4]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier


In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
import yfinance as yf

In [7]:
# Loading apple stocks until March 24th, 2023
aapl = yf.download('AAPL', end = '2023-03-24')
aapl

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980-12-12,0.128348,0.128906,0.128348,0.128348,0.099192,469033600
1980-12-15,0.122210,0.122210,0.121652,0.121652,0.094017,175884800
1980-12-16,0.113281,0.113281,0.112723,0.112723,0.087117,105728000
1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089273,86441600
1980-12-18,0.118862,0.119420,0.118862,0.118862,0.091861,73449600
...,...,...,...,...,...,...
2023-03-17,156.080002,156.740005,154.279999,155.000000,154.177048,98944600
2023-03-20,155.070007,157.820007,154.149994,157.399994,156.564301,73641400
2023-03-21,157.320007,159.399994,156.539993,159.279999,158.434311,73938300
2023-03-22,159.300003,162.139999,157.809998,157.830002,156.992020,75701800


In [None]:
############## STrategy 1 to determine the target signal based on the Return varaible#################

In [9]:
from sklearn.model_selection import train_test_split

# Splitting data into train and test sets
train, test = train_test_split(aapl, test_size=0.2, random_state=25)

# Optionally, you can reset the index for both train and test sets
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)


In [10]:
# Adding Close_Shift Variable 
train['Close_Shift'] = train['Adj Close'].shift(1)
test['Close_Shift'] = test['Adj Close'].shift(1)

# Adding Daily Returns variable 
train['Return'] = (train['Adj Close']/train['Close_Shift'] - 1) * 100
test['Return'] = (test['Adj Close']/test['Close_Shift'] - 1) * 100

In [11]:
train=train.dropna()
test=test.dropna()

In [12]:
train # Visualizing Training dataset

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Close_Shift,Return
1,0.084263,0.084263,0.083705,0.083705,0.064690,13596800,69.572983,-99.907018
2,40.130001,40.177502,39.459999,39.465000,37.137611,111762400,0.064690,57308.282828
3,1.166295,1.200893,1.100446,1.117746,0.946283,259627200,37.137611,-97.451954
4,25.145000,25.427500,25.112499,25.375000,23.196978,147822800,0.946283,2351.377576
5,0.367188,0.376116,0.360491,0.370536,0.313696,604128000,23.196978,-98.647687
...,...,...,...,...,...,...,...,...
8522,23.532499,23.690001,23.367500,23.465000,20.783775,182724000,0.052615,39401.793783
8523,0.404018,0.406250,0.392857,0.395089,0.320598,161974400,20.783775,-98.457459
8524,171.850006,173.339996,170.050003,172.550003,170.600449,61177400,0.320598,53113.162363
8525,3.144643,3.177500,3.123214,3.132857,2.652275,626284400,170.600449,-98.445329


In [13]:
test # Visualizing Testing dataset

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Close_Shift,Return
1,3.637500,3.685714,3.580357,3.629286,3.072552,500180800,15.163658,-79.737396
2,0.276786,0.279018,0.263393,0.270089,0.223737,132899200,3.072552,-92.718205
3,0.334821,0.337054,0.329241,0.330915,0.256133,97104000,0.223737,14.479414
4,5.858929,5.887143,5.752857,5.817500,4.925093,1504190800,0.256133,1822.867434
5,0.199777,0.200893,0.198661,0.200893,0.170076,157248000,4.925093,-96.546747
...,...,...,...,...,...,...,...,...
2127,5.892143,5.954286,5.878929,5.940357,5.029103,342031200,0.286294,1656.619137
2128,21.617857,21.924999,21.575357,21.924643,18.561388,379405600,5.029103,269.079476
2129,72.482498,73.419998,72.379997,73.412498,71.429642,100805600,18.561388,284.829204
2130,0.359375,0.361607,0.350446,0.354911,0.276205,215040000,71.429642,-99.613319


In [14]:
# Creating target variable on both datasets
train['target'] = np.where(train['Return'].shift(-1) > 0, 1, 0)
test['target'] = np.where(test['Return'].shift(-1) > 0, 1, 0)

In [15]:
train.head(15) # Visualizing training dataframe 

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Close_Shift,Return,target
1,0.084263,0.084263,0.083705,0.083705,0.06469,13596800,69.572983,-99.907018,1
2,40.130001,40.177502,39.459999,39.465,37.137611,111762400,0.06469,57308.282828,0
3,1.166295,1.200893,1.100446,1.117746,0.946283,259627200,37.137611,-97.451954,1
4,25.145,25.4275,25.112499,25.375,23.196978,147822800,0.946283,2351.377576,0
5,0.367188,0.376116,0.360491,0.370536,0.313696,604128000,23.196978,-98.647687,1
6,47.087502,47.247501,46.48,46.6325,44.89307,126585600,0.313696,14211.02646,0
7,18.023571,18.19857,17.928572,18.195,15.40387,460398400,44.89307,-65.687645,1
8,31.5375,31.797501,31.467501,31.502501,28.392502,138776800,15.40387,84.32058,0
9,15.998571,16.418215,15.982143,16.261429,14.155439,399380800,28.392502,-50.143741,0
10,0.356027,0.366071,0.34933,0.361607,0.306136,409001600,14.155439,-97.837324,1


In [16]:
X_train = train.drop('target', axis = 1) # Selecting Predictor Variables
y_train = train.target # Selecting Target Variable
X_test = test.drop('target', axis = 1) # Selecting Predictor Variables
y_test = test.target # Selecting Target Variable

In [17]:
X_train # Visualizing predictor variables in the testing set

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Close_Shift,Return
1,0.084263,0.084263,0.083705,0.083705,0.064690,13596800,69.572983,-99.907018
2,40.130001,40.177502,39.459999,39.465000,37.137611,111762400,0.064690,57308.282828
3,1.166295,1.200893,1.100446,1.117746,0.946283,259627200,37.137611,-97.451954
4,25.145000,25.427500,25.112499,25.375000,23.196978,147822800,0.946283,2351.377576
5,0.367188,0.376116,0.360491,0.370536,0.313696,604128000,23.196978,-98.647687
...,...,...,...,...,...,...,...,...
8522,23.532499,23.690001,23.367500,23.465000,20.783775,182724000,0.052615,39401.793783
8523,0.404018,0.406250,0.392857,0.395089,0.320598,161974400,20.783775,-98.457459
8524,171.850006,173.339996,170.050003,172.550003,170.600449,61177400,0.320598,53113.162363
8525,3.144643,3.177500,3.123214,3.132857,2.652275,626284400,170.600449,-98.445329


In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Creating a dictionary of different classification models 
classifiers = {
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machines': SVC(probability=True),
    'XGBoost': XGBClassifier()
}

# Extracting only the features for prediction from X_test
X_test_features = X_test  # No need to drop columns if they're not present in X_test

# Iterating over classifiers in the dictionary above, training, and evaluating them
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    if hasattr(clf, "predict_proba"):
        y_pred = clf.predict_proba(X_test_features)[:, 1]
    else:
        y_pred = clf.predict(X_test_features)
    auc_score = roc_auc_score(y_test, y_pred)
    print(f'{name}: AUC Score={auc_score:.3f}')


K-Nearest Neighbors: AUC Score=0.536
Random Forest: AUC Score=0.800
Gradient Boosting: AUC Score=0.825
Support Vector Machines: AUC Score=0.547
XGBoost: AUC Score=0.807


In [None]:
###################### IMplementy strategy 2 to add SMA variables and perform classification model##############

In [19]:
# Defining feature engineering function
def feature_engineering(df):
    
    # Adding Simple Moving Averages
  
    df['sma50'] = ta.trend.sma_indicator(df['Adj Close'],window = 50) 
    df['sma200'] = ta.trend.sma_indicator(df['Adj Close'],window = 200)
    
    # Adding Price to Simple Moving Averages ratios

    
    df['sma50_ratio'] = df['Adj Close'] / df['sma50']  
    df['sma200_ratio'] = df['Adj Close'] / df['sma200']  
  
    df['buy_signal'] = (df['sma50'] > df['sma200']).astype(int)
    
    # Removing NaN values from the dataframe 
    df.dropna(inplace = True)
    return df

In [20]:
# Applying function to the X_train and X_test sets 
X_train = feature_engineering(X_train)
X_test = feature_engineering(X_test)

In [21]:
X_train.head() # Displaying training predictor variables 

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Close_Shift,Return,sma50,sma200,sma50_ratio,sma200_ratio,buy_signal
200,81.047501,81.555,80.837502,81.217499,79.211395,94747600,0.255274,30929.907072,11.424464,15.703419,6.933489,5.044213,0
201,0.325893,0.339286,0.325893,0.328125,0.260383,120232000,79.211395,-99.671281,11.38837,15.704397,0.022864,0.01658,0
202,0.141183,0.141741,0.133371,0.133929,0.103505,182022400,0.260383,-60.248822,11.379679,15.519227,0.009096,0.006669,0
203,79.665001,80.222504,78.967499,79.212502,77.46505,102688800,0.103505,74741.616533,12.923273,15.901821,5.994228,4.871458,0
204,0.294643,0.310357,0.291964,0.301607,0.25534,376818400,77.46505,-99.67038,12.922075,15.787112,0.01976,0.016174,0


In [22]:
X_test.head() #  Displaying testing predictor variables 

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Close_Shift,Return,sma50,sma200,sma50_ratio,sma200_ratio,buy_signal
200,43.662498,43.892502,43.212502,43.560001,41.756073,112958400,36.758842,13.594635,11.604247,16.283972,3.598344,2.564244,0
201,14.55,14.595,14.216071,14.376429,12.432477,480746000,41.756073,-70.225943,11.33612,16.330772,1.096714,0.761291,0
202,21.681786,21.73,21.485001,21.638571,18.319199,418930400,12.432477,47.349548,11.695634,16.421249,1.566328,1.115579,0
203,0.199219,0.199777,0.189732,0.195313,0.150945,152678400,18.319199,-99.176028,9.831228,16.420723,0.015354,0.009192,0
204,0.165179,0.165179,0.162388,0.164621,0.127225,84649600,0.150945,-15.714247,9.54665,16.396734,0.013327,0.007759,0


In [23]:
# Removing from y_train and y_test the NaN values that were dropped from X_train and X_test by the index
y_train = y_train[X_train.index]
y_test = y_test[X_test.index]

In [24]:
# Visualizing all predictor variables 
X_train.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Close_Shift',
       'Return', 'sma50', 'sma200', 'sma50_ratio', 'sma200_ratio',
       'buy_signal'],
      dtype='object')

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Creating a dictionary of different classification models 
classifiers = {
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machines': SVC(probability=True),
    'XGBoost': XGBClassifier()
}

# Extracting only the features for prediction from X_test
X_test_features = X_test  # No need to drop columns if they're not present in X_test

# Iterating over classifiers in the dictionary above, training, and evaluating them
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    if hasattr(clf, "predict_proba"):
        y_pred = clf.predict_proba(X_test_features)[:, 1]
    else:
        y_pred = clf.predict(X_test_features)
    auc_score = roc_auc_score(y_test, y_pred)
    print(f'{name}: AUC Score={auc_score:.3f}')


K-Nearest Neighbors: AUC Score=0.542
Random Forest: AUC Score=0.814
Gradient Boosting: AUC Score=0.826
Support Vector Machines: AUC Score=0.554
XGBoost: AUC Score=0.797


In [26]:
from sklearn.model_selection import GridSearchCV

# Define parameter grids for Random Forest and Gradient Boosting classifiers
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search for Random Forest
rf_grid_search = GridSearchCV(RandomForestClassifier(), param_grid=rf_param_grid, cv=3, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Grid search for Gradient Boosting
gb_grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid=gb_param_grid, cv=3, n_jobs=-1)
gb_grid_search.fit(X_train, y_train)

# Print best parameters and accuracy for Random Forest
print("Random Forest Best Parameters:", rf_grid_search.best_params_)
print("Random Forest Best Accuracy:", rf_grid_search.best_score_)

# Print best parameters and accuracy for Gradient Boosting
print("Gradient Boosting Best Parameters:", gb_grid_search.best_params_)
print("Gradient Boosting Best Accuracy:", gb_grid_search.best_score_)


Random Forest Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50}
Random Forest Best Accuracy: 0.7319565386712362
Gradient Boosting Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Gradient Boosting Best Accuracy: 0.7382018762278127
