## Load Libraries

In [6]:
import pandas as pd
import numpy as np
import random as rnd
import datetime

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
#sns.set_style('whitegrid')
%matplotlib inline

# plotly
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import cufflinks as cf
cf.go_offline()

from fastai.tabular.all import *

# SKlearn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV

from time import time


from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.base import clone

from tune_sklearn import TuneSearchCV
from tabulate import tabulate

import joblib

## Load Data

In [7]:
# Project root on the local machine; the CSVs live in its data/ subfolder.
filepath = "/Users/Kapil/PycharmProjects/stock-price"
data_dir = filepath + "/data"

# Read the training and test sets from the data folder.
train = pd.read_csv(data_dir + "/train.csv")
test = pd.read_csv(data_dir + "/test.csv")

In [8]:
# Single seed shared by every stochastic step in the notebook.
SEED = 13

# Seed both global generators: NumPy's and the stdlib `random` module
# (imported as `rnd`), so results are repeatable on a fresh kernel.
np.random.seed(SEED)
rnd.seed(SEED)

In [9]:
# Split the raw training frame into the 'Up' target and the remaining columns.
X_trainW = train.drop(columns=['Up'])
Y_trainW = train['Up']

## Functions
#### Preprocessing function

In [10]:
def LagPreproc(df, n_lags=5, price_col='Close'):
    """Build percentage-return features with lagged copies.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the price column named by ``price_col``.
    n_lags : int, default 5
        Number of lagged return columns (``Lag1`` .. ``Lag<n_lags>``) to add.
    price_col : str, default 'Close'
        Column whose percentage change is used as the return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Today`` and ``Lag1`` .. ``Lag<n_lags>``, indexed like ``df``.
        Missing values are replaced by the mean of their own column; a column
        that is entirely missing stays NaN because its mean is NaN.
    """
    # Percentage change of the price from the previous row, in percent.
    today = df[price_col].pct_change() * 100

    data = pd.DataFrame({'Today': today})
    for i in range(1, n_lags + 1):
        # Lags are taken from the unfilled return series, so the leading
        # NaNs of each lag are imputed with that lag's own column mean.
        data['Lag' + str(i)] = today.shift(i)

    return data.fillna(data.mean())

#### Specificity function

In [11]:
def specificity(y_truth, y_pred):
    """Return the true-negative rate TN / (TN + FP) for binary 0/1 labels.

    Parameters
    ----------
    y_truth : array-like
        Ground-truth labels, encoded as 0 (negative) and 1 (positive).
    y_pred : array-like
        Predicted labels with the same encoding.

    Returns
    -------
    float
        The specificity, or NaN when ``y_truth`` contains no negatives.
    """
    # Pinning labels keeps the matrix 2x2 even when only one class appears;
    # without it the four-way unpack below fails on single-class input.
    tn, fp, fn, tp = confusion_matrix(y_truth, y_pred, labels=[0, 1]).ravel()

    negatives = tn + fp
    if negatives == 0:
        # No negative samples: the rate is undefined, so avoid dividing 0 by 0.
        return float('nan')
    return tn / negatives


#### Submission format function

In [13]:
def submitformat(df, test_df):
    """Turn predicted probabilities into the 0/1 submission frame.

    Parameters
    ----------
    df : array-like
        Predicted probabilities, one per row of ``test_df``. It is not
        modified.
    test_df : pandas.DataFrame
        Frame whose index is applied to the predictions.

    Returns
    -------
    pandas.DataFrame
        Integer labels (1 where probability > 0.5, else 0) for every sixth
        row starting at position 5, indexed like the matching ``test_df`` rows.

    Raises
    ------
    ValueError
        If any probability is NaN, since it cannot be mapped to a label.
    """
    # Work on a fresh frame; thresholding in place would silently overwrite
    # the caller's probability array.
    probs = pd.DataFrame(df)
    if probs.isna().any().any():
        raise ValueError("predictions contain NaN and cannot be converted to labels")

    labels = (probs > 0.5).astype(int)
    labels.index = test_df.index
    # Keep positions 5, 11, 17, ...
    return labels.iloc[5::6, :]

### Preprocess train and test

In [14]:
# Build the lag features straight from the raw frames. LagPreproc reads only
# the 'Close' column, so this yields the same features as before while keeping
# the cell safe to re-run (feeding X_trainW back into itself fails the second
# time because the processed frame no longer has 'Close').
X_trainW = LagPreproc(train)
X_test = LagPreproc(test)

In [None]:
### Validation data split

In [None]:
# Hold out 20% of the preprocessed training rows for validation.
# train_test_split comes from sklearn.model_selection (imported at the top of
# the notebook); the fixed random_state keeps the split reproducible.
# NOTE(review): rows are shuffled before splitting; for lagged time-series
# features a chronological split may be more appropriate — confirm intent.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_trainW,
    Y_trainW,
    test_size=0.2,
    random_state=SEED,
)

In [15]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Today,Lag1,Lag2,Lag3,Lag4,Lag5
0,0.043343,0.043086,0.042972,0.043069,0.04265,0.042789
1,0.138013,0.043086,0.042972,0.043069,0.04265,0.042789
2,0.708801,0.138013,0.042972,0.043069,0.04265,0.042789
3,0.703812,0.708801,0.138013,0.043069,0.04265,0.042789
4,-0.446515,0.703812,0.708801,0.138013,0.04265,0.042789


## Models
#### Logistic Regression Tuning

In [54]:
# Hyper-parameter grid for logistic regression.
penalty = ['l1', 'l2']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
solver = ['liblinear', 'saga']

param_grid = dict(penalty=penalty,
                  C=C,
                  solver=solver)

# Exhaustive grid search scored on accuracy, using all available cores.
model = GridSearchCV(LogisticRegression(solver='liblinear'),
                     param_grid=param_grid,
                     scoring='accuracy',
                     verbose=1,
                     n_jobs=-1)

# Run the search so the summary below has a result to report; previously the
# fit was commented out (and pointed at an undefined `grid`), which left
# `grid_result` undefined.
grid_result = model.fit(X_train, Y_train)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

#### Decision Tree tuning

In [None]:
# Candidate values for the decision tree's leaf size and depth.
config = dict(
    min_samples_leaf=[15, 19, 23, 25, 28, 30, 35, 40],
    max_depth=[5, 8, 10, 12],
)

# Bayesian search over `config`, scored on accuracy with cv=5.
tree_estimator = DecisionTreeClassifier(random_state=SEED)
model = TuneSearchCV(
    tree_estimator,
    config,
    search_optimization='bayesian',
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=SEED,
)

#### Random Forest Tuning

In [11]:
# Candidate values for the random forest search.
config = {
    "n_estimators": [300, 400, 600, 650, 700, 750, 800, 1000],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5],
    "max_depth": [9, 12],
    # 'auto' was an alias of 'sqrt' for classifiers and is rejected by current
    # scikit-learn releases, so it is replaced with another valid option.
    "max_features": ['sqrt', 'log2']
}


# Bayesian search over `config`, scored on accuracy with cv=5; both the forest
# and the search run single-threaded (n_jobs=1).
model = TuneSearchCV(RandomForestClassifier(n_jobs=1, verbose=1,
                                            random_state=SEED),
                     config,
                     scoring='accuracy',
                     n_jobs=1, cv=5,
                     search_optimization='bayesian',
                     verbose=1,
                     random_state=SEED)


#### LightGBM Tuning

In [19]:
# Candidate values for the LightGBM search.
config = dict(
    n_estimators=[400, 600, 700, 800, 1000],
    colsample_bytree=[0.7, 0.8],
    max_depth=[5, 10, 15, 20, 25],
    num_leaves=[50, 100, 200],
    reg_lambda=[1.1, 1.2, 1.3],
    min_split_gain=[0.3, 0.4],
    subsample=[0.7, 0.8, 0.9],
    learning_rate=[0.05, 0.1],
)

# Bayesian search over `config`, scored on accuracy with cv=5.
lgbm_estimator = LGBMClassifier(n_jobs=-1, random_state=SEED)
model = TuneSearchCV(
    lgbm_estimator,
    config,
    search_optimization='bayesian',
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=SEED,
)

#### XGBoost Tuning

In [16]:
# Candidate values for the XGBoost search.
config = dict(
    n_estimators=[100, 200, 300, 400, 600, 700, 800],
    colsample_bytree=[0.7, 0.8],
    min_child_weight=[1, 3, 5, 7],
    max_depth=[3, 6, 9, 12],
    learning_rate=[0.05, 0.1],
)

# Bayesian search over `config`, scored on accuracy with cv=5.
xgb_estimator = XGBClassifier(n_jobs=-1, random_state=SEED)
model = TuneSearchCV(
    xgb_estimator,
    config,
    search_optimization='bayesian',
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=SEED,
)

NameError: name 'TuneSearchCV' is not defined

#### SVC Tuning

In [32]:
# Candidate values for the RBF-kernel SVC search.
config = {'C': [0.1, 1, 10, 100, 1000],
          'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
          'kernel': ['rbf']}

# probability=True is required because the evaluation and submission cells
# call model.predict_proba, which SVC only provides when it is enabled.
# It makes fitting slower because of the extra internal calibration.
model = TuneSearchCV(SVC(probability=True, random_state=SEED),
                     config,
                     scoring='accuracy',
                     n_jobs=-1, cv=5,
                     search_optimization='bayesian',
                     verbose=1,
                     random_state=SEED)

#### KNN Tuning

In [33]:
# Candidate values for the k-nearest-neighbours search.
# (A comma was missing after the 'n_neighbors' entry, which made this cell a
# SyntaxError.)
config = {'leaf_size': list(range(1, 100)),
          'n_neighbors': list(range(1, 100)),
          'p': [1, 2]
}

# Bayesian search over `config`, scored on accuracy with cv=5. The classifier
# uses all cores (n_jobs=-1) while the search itself runs with n_jobs=1.
model = TuneSearchCV(KNeighborsClassifier(n_jobs=-1),
                     config,
                     scoring='accuracy',
                     n_jobs=1, cv=5,
                     search_optimization='bayesian',
                     verbose=1,
                     random_state=SEED)


## Training on the dataset

In [24]:
# Fit whichever search object the most recently run tuning cell bound to `model`.
model.fit(X_train, Y_train)

LGBMClassifier(colsample_bytree=0.718109403719446,
               learning_rate=0.06791886573990238, max_depth=5,
               min_split_gain=0.3039987680281693, n_estimators=600,
               num_leaves=100, reg_lambda=1.1, subsample=0.9)

## Model Performance on Validation Set

In [25]:
# Accuracy on the training split: round the positive-class probability
# to a hard 0/1 label.
Y_train_preds = model.predict_proba(X_train)[:, 1]
Y_train_preds = np.round(Y_train_preds).astype(int)
train_score = accuracy_score(Y_train, Y_train_preds)

# Same computation on the held-out validation split.
Y_val_preds = model.predict_proba(X_val)[:, 1]
Y_val_preds = np.round(Y_val_preds).astype(int)
val_score = accuracy_score(Y_val, Y_val_preds)

print('Training score: ', train_score)
# Report the validation score computed above; the previous line printed an
# undefined `test_score` and raised NameError.
print('Validation score: ', val_score)

print(classification_report(Y_val, model.predict(X_val)))


Training score:  0.8267522211253702
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       981
           1       0.82      0.84      0.83      1045

    accuracy                           0.83      2026
   macro avg       0.83      0.83      0.83      2026
weighted avg       0.83      0.83      0.83      2026



## Train on entire dataset

In [None]:
# Refit `model` on the full preprocessed training set (X_trainW / Y_trainW)
# before predicting on the test set.
model.fit(X_trainW, Y_trainW)

## Generate submission file

In [None]:
# Take the second probability column (index 1) of the test-set predictions.
test_proba = model.predict_proba(X_test)
Y_test_preds = test_proba[:, 1]

# Threshold and subsample the predictions into the submission layout, then
# write the result to disk.
Y_test = submitformat(Y_test_preds, X_test)
Y_test.to_csv('/Users/Kapil/Desktop/Y_test.csv')

### Save Model

In [None]:
# Destination folder for serialized models.
save_path = '/Users/Kapil/PycharmProjects/stock-price/models/'

# The file is named after the class of the object bound to `model`; for a
# search wrapper that is the wrapper's class name, not the inner estimator's.
model_name = type(model).__name__ + '.joblib'
joblib.dump(model, save_path + model_name)

