## Load Libraries

In [46]:
import pandas as pd
import numpy as np
import random as rnd
import datetime

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
#sns.set_style('whitegrid')
%matplotlib inline

# plotly
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import cufflinks as cf
cf.go_offline()

from fastai.tabular.all import *

# SKlearn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
from time import time


from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.base import clone

from tune_sklearn import TuneSearchCV
from tabulate import tabulate

## Load Data

In [47]:
filepath = "/Users/Kapil/PycharmProjects/stock-price"
train = pd.read_csv(filepath + "/data/train.csv")
test = pd.read_csv(filepath + "/data/test.csv")

In [48]:
SEED = 13
np.random.seed(SEED)

In [49]:
Y_train = train['Up']
#X_train = train.drop(['Up'], axis = 1)

## Functions
#### Preprocessing function

In [50]:
def LagPreproc(df):
    data = pd.DataFrame()
    data['Today'] = df['Close'].pct_change() * 100
    #data = data.rename("Today")
    for i in range(1,6):
        data['Lag'+str(i)] = data['Today'].shift(i)
    data = data.fillna(data.mean())
    return data

#### Specificity function

In [51]:
def specificity(y_truth, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_truth, y_pred).ravel()

    specificity = tn / (tn+fp)
    return specificity

#### Fit and Score function

In [52]:
def fit_and_score(model, train_index, valid_index, X, Y):
    """
    function to use in a joblib parallel env, takes the inputs and returns metrics
    return: (roc_train, roc_val, f1_train, f1_val)
    """
    # get X and y vectors
    x_train, x_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = Y.iloc[train_index], Y.iloc[valid_index]

    # over sample x
    #x_train, y_train = ros.fit_resample(x_train, y_train)

    # preprocess
#     x_train = ct.fit_transform(x_train)
#     x_valid = ct.transform(x_valid)

    # train model
    model.fit(x_train, y_train)

    # append scores
    y_train_preds = model.predict(x_train)
    y_valid_preds = model.predict(x_valid)

    f1_train = f1_score(y_train, y_train_preds)
    roc_train = roc_auc_score(y_train, model.predict_proba(x_train)[:,1])
    mcc_train = matthews_corrcoef(y_train, y_train_preds)
    acc_train = accuracy_score(y_train, y_train_preds)

    f1_val = f1_score(y_valid, y_valid_preds)
    roc_val = roc_auc_score(y_valid, model.predict_proba(x_valid)[:,1])
    mcc_val = matthews_corrcoef(y_valid, y_valid_preds)
    acc_val= accuracy_score(y_train, y_train_preds)

    log_line = 'Train score: ' + str(acc_train) + ' Validation score: ' + str(roc_val)
    tqdm.write(log_line)

    return roc_train, roc_val, f1_train, f1_val, mcc_train, mcc_val

#### Submission format function

In [None]:
def submitformat(df, test_df):
    df[df > 0.5] = 1
    df[df <= 0.5] = 0
    df = pd.DataFrame(df)
    df.index = test_df.index
    df = df.iloc[5::6, :]
    df = df.astype(int)
    return df

### Preprocess train and test

In [53]:
X_train  = LagPreproc(train)
X_test = LagPreproc(test)

In [61]:
X_train.head()

Unnamed: 0,Today,Lag1,Lag2,Lag3,Lag4,Lag5
0,0.043343,0.043086,0.042972,0.043069,0.04265,0.042789
1,0.138013,0.043086,0.042972,0.043069,0.04265,0.042789
2,0.708801,0.138013,0.042972,0.043069,0.04265,0.042789
3,0.703812,0.708801,0.138013,0.043069,0.04265,0.042789
4,-0.446515,0.703812,0.708801,0.138013,0.04265,0.042789


## Models
#### Logistic Regression

In [54]:
model = LogisticRegression(solver = 'liblinear', C = 0.01, penalty = 'l1')

#### Decision Tree tuning

In [None]:
config = {
    'min_samples_leaf': [15, 19, 23, 25, 28, 30, 35, 40],
    "max_depth": [5, 8, 10, 12]
}
model = TuneSearchCV(DecisionTreeClassifier(random_state=SEED),
                          config,
                          scoring='accuracy',
                          n_jobs=-1,
                          search_optimization='bayesian',
                          verbose=1,
                          random_state=SEED)



#### Decision Tree

In [55]:
model = DecisionTreeClassifier(max_depth=10, min_samples_leaf = 30)

#### Random Forest

In [11]:
model = RandomForestClassifier(criterion='gini', n_estimators = 800, max_depth = 18, n_jobs= -1)

#### LightGBM

In [19]:
model = LGBMClassifier(boosting_type = 'gbdt', learning_rate = 0.01,
                       n_estimators = 1000, objective = 'binary')

#### XGBoost


In [13]:
model = XGBClassifier()

#### SVC

In [32]:
model = SVC()

#### Knn

In [33]:
model = KNeighborsClassifier()

## Parallel Process




In [56]:
kfolds = StratifiedKFold(n_splits=5, shuffle=True)
n_jobs = -1

st = time()

if n_jobs == 1:
    print('Running single process loop')
    res = []

    for train_index, valid_index in tqdm(kfolds.split(X_train, Y_train)):
        res.append(fit_and_score(clone(model), train_index, valid_index, X_train, Y_train))
else:
    print('Running parallel loop')
    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    res = parallel(delayed(fit_and_score)(clone(model), train_idx, valid_idx, X_train, Y_train)
              for train_idx, valid_idx in kfolds.split(X_train, Y_train))


end = time()

print('Total time taken ', (end - st)/60, ' mins')

Running parallel loop


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Total time taken  0.16548996766408283  mins


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.8s finished


## Metrics

In [57]:
res = pd.DataFrame(res, columns=['ROC_train', 'ROC_val', 'F1_train', 'F1_val', 'MCC_train', 'MCC_val'])
res

Unnamed: 0,ROC_train,ROC_val,F1_train,F1_val,MCC_train,MCC_val
0,0.712131,0.516491,0.652761,0.498753,0.302083,0.011476
1,0.72185,0.572149,0.67148,0.572127,0.325961,0.136279
2,0.716525,0.588968,0.683973,0.613333,0.308063,0.13721
3,0.713671,0.572429,0.676976,0.585253,0.302709,0.10827
4,0.736141,0.496338,0.706079,0.546667,0.350033,-0.013768


In [58]:
print('Mean roc auc (train, valid) : ', res['ROC_train'].mean(), ', ', res['ROC_val'].mean())

Mean roc auc (train, valid) :  0.7200639508233733 ,  0.5492752333731685


## Training on Entire dataset

In [59]:
model.fit(X_train, Y_train)

DecisionTreeClassifier(max_depth=10, min_samples_leaf=30)

## Model Performance

In [60]:
# model_pipe.roc_auc_score(X_train,Y_train)
Y_train_preds = model.predict_proba(X_train)[:,1]
train_score = roc_auc_score(Y_train,Y_train_preds)

Y_test_preds = model.predict_proba(X_test)[:,1]

print('Training score: ', train_score)
print(classification_report(Y_train, model.predict(X_train)))

Training score:  0.7124645781816231
              precision    recall  f1-score   support

           0       0.64      0.66      0.65       981
           1       0.67      0.65      0.66      1045

    accuracy                           0.65      2026
   macro avg       0.65      0.65      0.65      2026
weighted avg       0.65      0.65      0.65      2026

