# Modelling

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
import pickle
import datetime

# Reading features by importance

In [2]:
# Full price/indicator frame that every later cell works from.
badf = pd.read_csv('data/badf.csv')
# Feature names taken from the first column of a header-less CSV, kept in file
# order (presumably most important first -- confirm against the notebook that
# writes feature_counts_10d.csv).
features_by_importance = pd.read_csv('feature_counts_10d.csv',header=None)[0].tolist()

In [3]:
# Column holding the target that convert_target_labels turns into 0/1 labels.
TARGET_COL = 'target_10d'
# Name of the date column.
# NOTE(review): not referenced in the visible cells; badf_n_top_features
# hard-codes 'date' instead.
DATE_COL = 'date'
# Cut-off used for labelling: target values above it become 1, the rest 0.
TARGET_THRESHOLD = 0.001

In [4]:
def badf_n_top_features(badf,features_by_importance,n,target_col):
    """Select the first ``n`` ranked features plus the target, indexed by date.

    Parameters
    ----------
    badf : DataFrame containing a 'date' column, the feature columns and
        ``target_col``.
    features_by_importance : sliceable sequence of feature column names.
    n : number of leading entries of ``features_by_importance`` to keep.
    target_col : name of the target column, placed last.

    Returns
    -------
    A new DataFrame with 'date' as the index and the columns ordered as
    the ``n`` features followed by ``target_col``. ``badf`` is not modified.
    """
    leading_features = list(features_by_importance[:n])
    wanted = ['date', *leading_features, target_col]
    return badf[wanted].set_index('date')

def convert_target_labels(df,threshold,target_col):
    """Binarise ``target_col`` in place: 1 where the value exceeds ``threshold``, else 0.

    Both masks are taken from the original values before anything is written.
    Evaluating the second comparison after the first write would reset every
    freshly written 1 back to 0 whenever ``threshold`` is 1 or larger.

    Parameters
    ----------
    df : DataFrame holding ``target_col``; it is modified in place.
    threshold : cut-off; values strictly greater become 1, values less than or
        equal become 0.
    target_col : name of the column to relabel.

    Returns
    -------
    The same ``df`` object. Rows whose target is NaN match neither mask and
    are left as NaN.
    """
    original_values = df[target_col]
    above = original_values > threshold
    at_or_below = original_values <= threshold
    df.loc[above,target_col] = 1
    df.loc[at_or_below,target_col] = 0
    return df

# Feature selection is not applied at this point; prep_datasets calls
# badf_n_top_features itself for each requested feature count.
# badf = badf_n_top_features(badf,features_by_importance,50,TARGET_COL)

# Replace the continuous target with 0/1 labels (badf is modified in place).
badf = convert_target_labels(badf,TARGET_THRESHOLD,TARGET_COL)

# Sampling Strategy

In [5]:
# Inspect the frame after the label conversion.
badf

Unnamed: 0,date,open,high,low,close,volume,rsi,bb_high,bb_low,atr,...,CDLSPINNINGTOP,CDLSTALLEDPATTERN,CDLSTICKSANDWICH,CDLTAKURI,CDLTASUKIGAP,CDLTHRUSTING,CDLTRISTAR,CDLUNIQUE3RIVER,CDLUPSIDEGAP2CROWS,CDLXSIDEGAP3METHODS
0,1999-03-10,51.125000,51.156250,50.281250,44.523476,5232000,,,,,...,0,0,0,0,0,0,0,0,0,0
1,1999-03-11,51.437500,51.734375,50.312500,44.741455,9688600,,,,,...,0,0,0,0,0,0,0,0,0,0
2,1999-03-12,51.125000,51.156250,49.656250,43.651520,8743600,,,,,...,0,0,0,0,0,0,0,0,0,0
3,1999-03-15,50.437500,51.562500,49.906250,44.904942,6369000,,,,,...,0,0,0,0,0,0,0,0,0,0
4,1999-03-16,51.718750,52.156250,51.156250,45.286419,4905800,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5453,2020-11-06,293.709991,295.359985,289.829987,294.609985,40959800,61.558118,0.018474,0.086460,7.118914,...,0,0,0,0,0,0,0,0,0,0
5454,2020-11-09,297.649994,299.140015,288.119995,288.589996,86537100,55.533787,0.035281,0.068016,7.397564,...,0,0,0,0,0,0,0,0,0,0
5455,2020-11-10,285.170013,286.660004,280.619995,283.420013,69024900,50.924553,0.047727,0.051233,7.438453,...,0,0,0,0,0,0,0,0,0,0
5456,2020-11-11,286.029999,290.250000,283.380005,289.760010,36102900,55.772406,0.026095,0.070325,7.397849,...,0,0,0,0,0,0,0,0,0,0


**Principles**

1) The test set must only contain dates after the training set.

2) Missing-value strategy: fill with the column minimum (open question).

3) Training Set Window - 1000 days

4) Test set - 300 days

5) Remove 2020 data

6) Holdout set 2019 data

**Variations** 

1) Normalized

2) Polynomial and Logarithmic Features

3) Non-normalized

**Model**

1) Logistic Regression

2) Decision Tree

3) Random Forest

4) SVM

5) Gradient Boosting


In [6]:
def prep_datasets(df,no_of_features,ignore_after,holdout_set_after,test_set_rows,train_set_rows,target,features_by_importance,cross_val_days):
    """Build one chronological train/test split from the top-ranked features.

    Steps:
      1. keep the first ``no_of_features`` ranked features plus ``target``,
         indexed by date (via ``badf_n_top_features``);
      2. drop rows whose index is >= ``ignore_after``, cast ``target`` to int,
         then drop rows whose index is >= ``holdout_set_after`` (the hold-out
         rows are excluded here and not returned);
      3. fill missing values with the minimum of their column;
      4. cut the windows from the end of the remaining rows: the test window
         is the ``test_set_rows`` rows ending ``cross_val_days + 1`` rows before
         the end, and the training window is the ``train_set_rows`` rows
         immediately before it.

    Changes from the previous version:
      * ``target`` is honoured everywhere instead of the hard-coded
        'target_10d';
      * the training window now has exactly ``train_set_rows`` rows (it used
        to be one row short);
      * filtered frames are copied and columns are reassigned rather than
        filled through a chained ``inplace=True`` call, which does not reach
        the parent frame under pandas copy-on-write.

    NOTE(review): the fill value is the minimum over all pre-hold-out rows,
    so it can come from rows inside or after the test window.

    Parameters
    ----------
    df : frame with a 'date' column, the feature columns and ``target``.
    no_of_features : how many leading entries of ``features_by_importance`` to use.
    ignore_after : rows with index >= this value are discarded.
    holdout_set_after : rows with index >= this value are kept out of the split.
    test_set_rows : number of rows in the test window.
    train_set_rows : number of rows in the training window.
    target : name of the target column.
    features_by_importance : ranked feature column names.
    cross_val_days : number of rows by which both windows are shifted back.

    Returns
    -------
    (df, train_set_X, test_set_X, train_set_y, test_set_y), where ``df`` is the
    cleaned pre-hold-out frame. Windows are truncated, possibly to empty, when
    the frame has too few rows.
    """
    df = badf_n_top_features(df,features_by_importance,no_of_features,target)
    # Copy so the assignments below write to an independent frame.
    df = df[df.index < ignore_after].copy()
    df[target] = df[target].astype('int')

    df = df[df.index < holdout_set_after].copy()

    for column in df.columns:
        if df[column].isna().any():
            df[column] = df[column].fillna(df[column].min())

    # Positional bounds, clamped at 0 so a short frame yields shorter windows
    # instead of wrapping around.
    test_end = max(len(df) - cross_val_days - 1, 0)
    test_start = max(test_end - test_set_rows, 0)
    train_start = max(test_start - train_set_rows, 0)

    train_set = df.iloc[train_start:test_start]
    test_set = df.iloc[test_start:test_end]
    train_set_X = train_set.drop(columns=target)
    train_set_y = train_set[target]
    test_set_X = test_set.drop(columns=target)
    test_set_y = test_set[target]

    return df,train_set_X,test_set_X,train_set_y,test_set_y

In [7]:
# One-off split: top 50 features, test_set_rows=150, train_set_rows=1000 and
# cross_val_days=150. The model loops further down assign these same names
# again with their own splits.
df,train_set_X,test_set_X,train_set_y,test_set_y = prep_datasets(badf,50,'2020-01-01','2019-01-01',150,1000,'target_10d',features_by_importance,150)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [9]:
# Suffix appended to the pickle file name when results are saved.
ver = '1'
# Result store: every model loop appends one dict per hyper-parameter
# combination to model_dict['models'].
model_dict = {}
model_dict['models'] = []
# Feature counts to evaluate (leading entries of features_by_importance).
top_n = [5,10,15,20,25,30,35,40,45,50]
# Offsets passed to prep_datasets as cross_val_days: 0, 150, ..., 1350.
cross_val = range(0,1500,150)

# Logistic Regression

In [10]:
# Values tried for the C parameter of LogisticRegression in the loop below.
C = [100, 10, 1.0, 0.1, 0.01]

In [11]:
# Logistic regression grid: for every feature count and every C, fit one model
# per window offset in cross_val and record the confusion-matrix cells and the
# ROC AUC of each window, plus the mean AUC over all windows.
for no_of_features in top_n:
    a = datetime.datetime.now()
    # A split depends only on the feature count and the window offset, not on
    # C, so the splits are built once here and reused for every C.
    splits = [
        (cross_val_days,
         prep_datasets(badf,no_of_features,'2020-01-01','2019-01-01',300,1000,'target_10d',features_by_importance,cross_val_days))
        for cross_val_days in cross_val
    ]
    for c in C:
        model_params = {}
        model_params['Feature Transform'] = None
        model_params['model_type'] = 'Logistic Regression'
        model_params['C'] = c
        model_params['no_of_features'] = no_of_features
        model_params['CrossValDays'] = []
        model_params['Confusion Matrices'] = []
        model_params['AUC Scores'] = []
        model_params['TN'], model_params['FP'],model_params['FN'], model_params['TP'] = [],[],[],[]
        for cross_val_days,(df,train_set_X,test_set_X,train_set_y,test_set_y) in splits:
            model = LogisticRegression(C=c,max_iter=1000)
            model.fit(train_set_X,train_set_y)
            y_pred = model.predict(test_set_X)
            y_pred_proba = model.predict_proba(test_set_X)

            model_params['CrossValDays'].append(cross_val_days)

            # Compute the matrix once. labels=[0,1] pins it to 2x2, so the
            # four-way unpacking also works for a window with a single class.
            cm = confusion_matrix(test_set_y,y_pred,labels=[0,1])
            tn,fp,fn,tp = cm.ravel()
            model_params['Confusion Matrices'].append(cm)
            model_params['TN'].append(tn)
            model_params['FP'].append(fp)
            model_params['FN'].append(fn)
            model_params['TP'].append(tp)
            model_params['AUC Scores'].append(roc_auc_score(test_set_y,y_pred_proba[:,1]))

        model_params['AUC Score'] = sum(model_params['AUC Scores'])/len(model_params['AUC Scores'])
        model_dict['models'].append(model_params)
    b = datetime.datetime.now()
    print('No of Features: {}, Time Taken: {}'.format(no_of_features,b-a))

No of Features: 5, Time Taken: 0:00:01.031361
No of Features: 10, Time Taken: 0:00:01.162737
No of Features: 15, Time Taken: 0:00:01.324077
No of Features: 20, Time Taken: 0:00:01.719213
No of Features: 25, Time Taken: 0:00:02.510121
No of Features: 30, Time Taken: 0:00:03.445817
No of Features: 35, Time Taken: 0:00:03.801287
No of Features: 40, Time Taken: 0:00:06.281485
No of Features: 45, Time Taken: 0:00:06.519222
No of Features: 50, Time Taken: 0:00:06.586983


In [12]:
# Persist the results collected so far. The context manager closes the file
# handle, which a bare open() call would leave open.
with open('model_perf.pkl_'+ver,'wb') as results_file:
    pickle.dump(model_dict,results_file)

In [13]:
# Tabulate the collected results (one row per entry of model_dict['models'])
# and write them to model_perf.csv; later cells overwrite the same file.
model_df = pd.DataFrame(model_dict['models'])
model_df.to_csv('model_perf.csv')


# Decision Tree Classifier

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
# Values tried for DecisionTreeClassifier's min_samples_leaf.
min_leaves_comb = [2,4,8,16,32,64,128,256]

# Decision tree grid: for every feature count and every min_samples_leaf, fit
# one tree per window offset in cross_val and record the confusion-matrix
# cells and the ROC AUC of each window, plus the mean AUC over all windows.
for no_of_features in top_n:
    a = datetime.datetime.now()
    # A split depends only on the feature count and the window offset, so the
    # splits are built once here and reused for every min_samples_leaf.
    splits = [
        (cross_val_days,
         prep_datasets(badf,no_of_features,'2020-01-01','2019-01-01',300,1000,'target_10d',features_by_importance,cross_val_days))
        for cross_val_days in cross_val
    ]
    for min_leaves in min_leaves_comb:
        model_params = {}
        model_params['Feature Transform'] = None
        # The model is a classifier; it was previously recorded as
        # 'Decision Tree Regressor'.
        model_params['model_type'] = 'Decision Tree Classifier'
        model_params['Min Leaves'] = min_leaves
        model_params['no_of_features'] = no_of_features
        model_params['CrossValDays'] = []
        model_params['Confusion Matrices'] = []
        model_params['AUC Scores'] = []
        model_params['TN'], model_params['FP'],model_params['FN'], model_params['TP'] = [],[],[],[]
        for cross_val_days,(df,train_set_X,test_set_X,train_set_y,test_set_y) in splits:
            # Fixed random_state so that re-running the cell reproduces the
            # same trees.
            model = DecisionTreeClassifier(min_samples_leaf=min_leaves,random_state=42)
            model.fit(train_set_X,train_set_y)
            y_pred = model.predict(test_set_X)
            y_pred_proba = model.predict_proba(test_set_X)
            model_params['CrossValDays'].append(cross_val_days)

            # Compute the matrix once. labels=[0,1] pins it to 2x2, so the
            # four-way unpacking also works for a window with a single class.
            cm = confusion_matrix(test_set_y,y_pred,labels=[0,1])
            tn,fp,fn,tp = cm.ravel()
            model_params['Confusion Matrices'].append(cm)
            model_params['TN'].append(tn)
            model_params['FP'].append(fp)
            model_params['FN'].append(fn)
            model_params['TP'].append(tp)
            model_params['AUC Scores'].append(roc_auc_score(test_set_y,y_pred_proba[:,-1]))

        model_params['AUC Score'] = sum(model_params['AUC Scores'])/len(model_params['AUC Scores'])
        model_dict['models'].append(model_params)
    b = datetime.datetime.now()
    print('No of Features: {}, Time Taken: {}'.format(no_of_features,b-a))

No of Features: 5, Time Taken: 0:00:01.101910
No of Features: 10, Time Taken: 0:00:01.385664
No of Features: 15, Time Taken: 0:00:01.556189
No of Features: 20, Time Taken: 0:00:01.773286
No of Features: 25, Time Taken: 0:00:01.945496
No of Features: 30, Time Taken: 0:00:02.100055
No of Features: 35, Time Taken: 0:00:02.265169
No of Features: 40, Time Taken: 0:00:02.306235
No of Features: 45, Time Taken: 0:00:02.393168
No of Features: 50, Time Taken: 0:00:02.436649


In [16]:
# Checkpoint after the decision-tree runs: rewrite the results table and the
# pickle. The context manager closes the pickle file handle, which a bare
# open() call would leave open.
model_df = pd.DataFrame(model_dict['models'])
model_df.to_csv('model_perf.csv')
with open('model_perf.pkl_'+ver,'wb') as results_file:
    pickle.dump(model_dict,results_file)

# K Nearest Neighbors

In [17]:
from sklearn.neighbors import KNeighborsClassifier

In [18]:
# Grid for KNeighborsClassifier: odd neighbour counts 1..19, both weighting
# schemes and three distance metrics.
# NOTE(review): with scikit-learn's default p=2, metric='minkowski' is the same
# distance as 'euclidean', so those two entries should give identical results.
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metrics = ['euclidean', 'manhattan', 'minkowski']

# For every feature count and grid point, fit one model per window offset in
# cross_val and record the confusion-matrix cells and the ROC AUC of each
# window, plus the mean AUC over all windows.
for no_of_features in top_n:
    a = datetime.datetime.now()
    # A split depends only on the feature count and the window offset, so the
    # splits are built once here and reused for every grid point.
    splits = [
        (cross_val_days,
         prep_datasets(badf,no_of_features,'2020-01-01','2019-01-01',300,1000,'target_10d',features_by_importance,cross_val_days))
        for cross_val_days in cross_val
    ]
    for n_neighbor in n_neighbors:
        for weight in weights:
            for metric in metrics:
                model_params = {}
                model_params['Feature Transform'] = None
                model_params['model_type'] = 'K Neighbors'
                model_params['Number of Neighbors'] = n_neighbor
                model_params['metric'] = metric
                model_params['weight'] = weight
                model_params['no_of_features'] = no_of_features
                model_params['CrossValDays'] = []
                model_params['Confusion Matrices'] = []
                model_params['AUC Scores'] = []
                model_params['TN'], model_params['FP'],model_params['FN'], model_params['TP'] = [],[],[],[]
                for cross_val_days,(df,train_set_X,test_set_X,train_set_y,test_set_y) in splits:
                    model = KNeighborsClassifier(n_neighbors=n_neighbor,weights=weight,metric=metric,n_jobs=-1)
                    model.fit(train_set_X,train_set_y)
                    y_pred = model.predict(test_set_X)
                    y_pred_proba = model.predict_proba(test_set_X)
                    model_params['CrossValDays'].append(cross_val_days)

                    # Compute the matrix once. labels=[0,1] pins it to 2x2, so
                    # the four-way unpacking also works for a window with a
                    # single class.
                    cm = confusion_matrix(test_set_y,y_pred,labels=[0,1])
                    tn,fp,fn,tp = cm.ravel()
                    model_params['Confusion Matrices'].append(cm)
                    model_params['TN'].append(tn)
                    model_params['FP'].append(fp)
                    model_params['FN'].append(fn)
                    model_params['TP'].append(tp)
                    model_params['AUC Scores'].append(roc_auc_score(test_set_y,y_pred_proba[:,-1]))

                model_params['AUC Score'] = sum(model_params['AUC Scores'])/len(model_params['AUC Scores'])
                model_dict['models'].append(model_params)
    b = datetime.datetime.now()
    print('No of Features: {}, Time Taken: {}'.format(no_of_features,b-a))

No of Features: 5, Time Taken: 0:00:14.098998
No of Features: 10, Time Taken: 0:00:14.809068
No of Features: 15, Time Taken: 0:00:15.502108
No of Features: 20, Time Taken: 0:00:19.399188
No of Features: 25, Time Taken: 0:00:19.425201
No of Features: 30, Time Taken: 0:00:20.176708
No of Features: 35, Time Taken: 0:00:21.367370
No of Features: 40, Time Taken: 0:00:21.705380
No of Features: 45, Time Taken: 0:00:23.235123
No of Features: 50, Time Taken: 0:00:23.814029


In [19]:
# Checkpoint after the K-nearest-neighbours runs: rewrite the results table
# and the pickle. The context manager closes the pickle file handle, which a
# bare open() call would leave open.
model_df = pd.DataFrame(model_dict['models'])
model_df.to_csv('model_perf.csv')
with open('model_perf.pkl_'+ver,'wb') as results_file:
    pickle.dump(model_dict,results_file)

# Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
# 500 and 1000 trees were removed

In [22]:
# Grid for RandomForestClassifier: odd min_samples_leaf values 1..19 and three
# forest sizes.
min_leaves_comb = range(1,20,2)
no_of_trees_comb = [100,200,300]

# For every feature count and grid point, fit one forest per window offset in
# cross_val and record the confusion-matrix cells and the ROC AUC of each
# window, plus the mean AUC over all windows.
for no_of_features in top_n:
    # A split depends only on the feature count and the window offset, so the
    # splits are built once here and reused for every grid point.
    splits = [
        (cross_val_days,
         prep_datasets(badf,no_of_features,'2020-01-01','2019-01-01',300,1000,'target_10d',features_by_importance,cross_val_days))
        for cross_val_days in cross_val
    ]
    for no_of_trees in no_of_trees_comb:
        # Timer per (feature count, forest size). The extra assignment that
        # used to sit one loop level up was overwritten before it was read.
        a = datetime.datetime.now()
        for min_leaves in min_leaves_comb:
            model_params = {}
            model_params['Feature Transform'] = None
            model_params['model_type'] = 'Random Forest'
            model_params['no_of_trees'] = no_of_trees
            model_params['min_leaves'] = min_leaves
            model_params['no_of_features'] = no_of_features
            model_params['CrossValDays'] = []
            model_params['Confusion Matrices'] = []
            model_params['AUC Scores'] = []
            model_params['TN'], model_params['FP'],model_params['FN'], model_params['TP'] = [],[],[],[]

            for cross_val_days,(df,train_set_X,test_set_X,train_set_y,test_set_y) in splits:
                # Fixed random_state so that re-running the cell reproduces
                # the same forests.
                model = RandomForestClassifier(n_estimators=no_of_trees,min_samples_leaf=min_leaves,n_jobs=-1,random_state=42)
                model.fit(train_set_X,train_set_y)
                y_pred = model.predict(test_set_X)
                y_pred_proba = model.predict_proba(test_set_X)

                model_params['CrossValDays'].append(cross_val_days)

                # Compute the matrix once. labels=[0,1] pins it to 2x2, so the
                # four-way unpacking also works for a window with a single class.
                cm = confusion_matrix(test_set_y,y_pred,labels=[0,1])
                tn,fp,fn,tp = cm.ravel()
                model_params['Confusion Matrices'].append(cm)
                model_params['TN'].append(tn)
                model_params['FP'].append(fp)
                model_params['FN'].append(fn)
                model_params['TP'].append(tp)
                model_params['AUC Scores'].append(roc_auc_score(test_set_y,y_pred_proba[:,-1]))

            model_params['AUC Score'] = sum(model_params['AUC Scores'])/len(model_params['AUC Scores'])
            model_dict['models'].append(model_params)
        b = datetime.datetime.now()
        print('No of Features: {}, No of Trees: {}, Time Taken: {}'.format(no_of_features,no_of_trees,b-a))

No of Features: 5, No of Trees: 100, Time Taken: 0:00:17.162661
No of Features: 5, No of Trees: 200, Time Taken: 0:00:31.098539
No of Features: 5, No of Trees: 300, Time Taken: 0:00:47.014383
No of Features: 10, No of Trees: 100, Time Taken: 0:00:16.883892
No of Features: 10, No of Trees: 200, Time Taken: 0:00:32.182602
No of Features: 10, No of Trees: 300, Time Taken: 0:00:45.821934
No of Features: 15, No of Trees: 100, Time Taken: 0:00:16.680396
No of Features: 15, No of Trees: 200, Time Taken: 0:00:31.835552
No of Features: 15, No of Trees: 300, Time Taken: 0:00:46.321055
No of Features: 20, No of Trees: 100, Time Taken: 0:00:17.030165
No of Features: 20, No of Trees: 200, Time Taken: 0:00:31.758068
No of Features: 20, No of Trees: 300, Time Taken: 0:00:46.493627
No of Features: 25, No of Trees: 100, Time Taken: 0:00:17.670642
No of Features: 25, No of Trees: 200, Time Taken: 0:00:32.624058
No of Features: 25, No of Trees: 300, Time Taken: 0:00:45.795809
No of Features: 30, No of Tr

In [23]:
# Checkpoint after the random-forest runs: rewrite the results table and the
# pickle. The context manager closes the pickle file handle, which a bare
# open() call would leave open.
model_df = pd.DataFrame(model_dict['models'])
model_df.to_csv('model_perf.csv')
with open('model_perf.pkl_'+ver,'wb') as results_file:
    pickle.dump(model_dict,results_file)

# SVM Classifier

In [24]:
from sklearn.svm import SVC

In [25]:
# Grid for the SVC loop below: values passed as SVC's C and the kernels to try.
C_comb = [0.01,0.1,1,10]
kernel_comb = ['linear', 'poly', 'rbf', 'sigmoid']

In [26]:
# C = 100 was removed from the grid because of time constraints (more than 3 minutes per run)

In [27]:

# SVC grid: for every feature count, C and kernel, fit one model per window
# offset in cross_val and record the confusion-matrix cells and the ROC AUC of
# each window, plus the mean AUC over all windows.
for no_of_features in top_n:
    # A split depends only on the feature count and the window offset, so the
    # splits are built once here and reused for every grid point.
    splits = [
        (cross_val_days,
         prep_datasets(badf,no_of_features,'2020-01-01','2019-01-01',300,1000,'target_10d',features_by_importance,cross_val_days))
        for cross_val_days in cross_val
    ]

    # The loop variable is deliberately not called `C`: that name holds the
    # logistic-regression grid, and overwriting it with a number breaks that
    # cell when it is run again.
    for svc_c in C_comb:
        a = datetime.datetime.now()
        for kernel in kernel_comb:
            model_params = {}
            model_params['Feature Transform'] = None
            model_params['model_type'] = 'Support Vector Classifier'
            model_params['C'] = svc_c
            model_params['Kernel'] = kernel
            model_params['no_of_features'] = no_of_features
            model_params['CrossValDays'] = []
            model_params['Confusion Matrices'] = []
            model_params['AUC Scores'] = []
            model_params['TN'], model_params['FP'],model_params['FN'], model_params['TP'] = [],[],[],[]

            for cross_val_days,(df,train_set_X,test_set_X,train_set_y,test_set_y) in splits:
                # probability=True makes the fit depend on random numbers; a
                # fixed random_state keeps re-runs reproducible.
                model = SVC(C=svc_c,kernel=kernel,probability=True,random_state=42)
                model.fit(train_set_X,train_set_y)
                y_pred = model.predict(test_set_X)
                y_pred_proba = model.predict_proba(test_set_X)
                model_params['CrossValDays'].append(cross_val_days)

                # Compute the matrix once. labels=[0,1] pins it to 2x2, so the
                # four-way unpacking also works for a window with a single class.
                cm = confusion_matrix(test_set_y,y_pred,labels=[0,1])
                tn,fp,fn,tp = cm.ravel()
                model_params['Confusion Matrices'].append(cm)
                model_params['TN'].append(tn)
                model_params['FP'].append(fp)
                model_params['FN'].append(fn)
                model_params['TP'].append(tp)
                model_params['AUC Scores'].append(roc_auc_score(test_set_y,y_pred_proba[:,-1]))

            model_params['AUC Score'] = sum(model_params['AUC Scores'])/len(model_params['AUC Scores'])
            model_dict['models'].append(model_params)

        b = datetime.datetime.now()
        print('No of Features: {}, C : {}, Time Taken: {}'.format(no_of_features,svc_c,b-a))

No of Features: 5, C : 0.01, Time Taken: 0:00:03.573751
No of Features: 5, C : 0.1, Time Taken: 0:00:03.560154
No of Features: 5, C : 1, Time Taken: 0:00:03.577399
No of Features: 5, C : 10, Time Taken: 0:00:03.846403
No of Features: 10, C : 0.01, Time Taken: 0:00:04.325778
No of Features: 10, C : 0.1, Time Taken: 0:00:04.330179
No of Features: 10, C : 1, Time Taken: 0:00:04.511310
No of Features: 10, C : 10, Time Taken: 0:00:06.037262
No of Features: 15, C : 0.01, Time Taken: 0:00:05.004194
No of Features: 15, C : 0.1, Time Taken: 0:00:05.117459
No of Features: 15, C : 1, Time Taken: 0:00:05.234671
No of Features: 15, C : 10, Time Taken: 0:00:06.616879
No of Features: 20, C : 0.01, Time Taken: 0:00:06.320526
No of Features: 20, C : 0.1, Time Taken: 0:00:09.430450
No of Features: 20, C : 1, Time Taken: 0:00:35.217844
No of Features: 20, C : 10, Time Taken: 0:04:10.701088
No of Features: 25, C : 0.01, Time Taken: 0:00:07.855409
No of Features: 25, C : 0.1, Time Taken: 0:00:11.804938
No 

In [28]:
# Checkpoint after the SVC runs: rewrite the results table and the pickle.
# The context manager closes the pickle file handle, which a bare open() call
# would leave open.
model_df = pd.DataFrame(model_dict['models'])
model_df.to_csv('model_perf.csv')
with open('model_perf.pkl_'+ver,'wb') as results_file:
    pickle.dump(model_dict,results_file)