# Load Libraries

In [54]:
import pandas as pd
import numpy as np
import random as rnd
import datetime
import os

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
#sns.set_style('whitegrid')
%matplotlib inline

# plotly
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import cufflinks as cf
cf.go_offline()

from fastai.tabular.all import *

# SKlearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
from lightgbm.sklearn import LGBMClassifier
from time import time


from tqdm import tqdm

# Load Data

In [55]:
# Project root that contains data/train.csv and data/test.csv.
# Set the STOCK_PRICE_DIR environment variable to run the notebook on another
# machine; the original location is kept as the fallback so existing runs are
# unaffected.
filepath = os.environ.get("STOCK_PRICE_DIR", "/Users/Kapil/PycharmProjects/stock-price")
train = pd.read_csv(os.path.join(filepath, "data", "train.csv"))
test = pd.read_csv(os.path.join(filepath, "data", "test.csv"))

In [56]:
# Single seed shared by every random number generator the notebook imports.
SEED = 13
np.random.seed(SEED)
# `random` is imported as `rnd` at the top of the notebook; seed it as well so
# any code drawing from the stdlib generator is reproducible too.
rnd.seed(SEED)

In [57]:
# Features are every column of the training frame except the 'Up' target.
X_train = train.drop(columns=['Up'])
Y_train = train['Up']

# Note: this binds a second name to the same object as `test`; it is not a copy.
X_test = test

### Preprocessing Function

In [58]:
def preproc(df):
    """
    Turn a raw price frame into model features, modifying ``df`` in place.

    The ``Date`` column becomes the index and is expanded by fastai's
    ``add_datepart`` into calendar columns (Year, Month, Week, ...). The
    ``Elapsed`` column is then dropped and boolean values are recoded as 0/1.

    NOTE(review): the frame is mutated in place, so every other name bound to
    the same object sees the new index and columns. Calling this a second time
    on the same frame will presumably fail because ``Date`` is no longer a
    column -- confirm before re-running the cell.

    df: frame with a ``Date`` column
    return: the same ``df`` object that was passed in
    """
    #df['Date'] = pd.to_datetime(df.Date,format='%Y-%m-%d')
    # Keep the original date values as the row index before the column is expanded.
    df.index = df['Date']
    # Return value is ignored: the call is relied on to edit `df` itself.
    add_datepart(df, 'Date')
    df.drop('Elapsed', axis=1, inplace=True)
    # Recode boolean values (the Is_* flags) as integers 0/1.
    df.replace({False: 0, True: 1}, inplace=True)

    return  df

In [59]:
# Build date features for both partitions (train first, then test).
X_train, X_test = preproc(X_train), preproc(X_test)

In [60]:
X_train.head()


Unnamed: 0_level_0,Open,High,Low,Close,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-02,50.85,50.91,50.62,50.72,2004,1,1,2,4,2,0,0,0,0,0,0
2004-01-05,50.75,50.84,50.61,50.79,2004,1,2,5,0,5,0,0,0,0,0,0
2004-01-06,50.87,51.26,50.86,51.15,2004,1,2,6,1,6,0,0,0,0,0,0
2004-01-07,51.45,51.54,51.29,51.51,2004,1,2,7,2,7,0,0,0,0,0,0
2004-01-08,51.15,51.33,51.02,51.28,2004,1,2,8,3,8,0,0,0,0,0,0


# Model


### LightGBM

In [61]:
# Gradient-boosted trees for the binary 'Up' target: 800 rounds at a small
# learning rate.
model = LGBMClassifier(
    objective='binary',
    boosting_type='gbdt',
    n_estimators=800,
    learning_rate=0.01,
)

In [62]:
# Pin the shuffle to SEED so the fold assignment -- and therefore every CV
# score below -- is identical on each run. Without random_state the shuffle
# draws from NumPy's global generator and the folds depend on whatever other
# code consumed random numbers first.
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# Number of worker processes for the CV loop: 1 selects the plain Python loop,
# any other value is handed to joblib.Parallel (-1 = use all available cores).
n_jobs = -1

### Parallel version

In [63]:
from joblib import Parallel, delayed
from sklearn.base import clone

def fit_and_score(model, train_index, valid_index, X, Y):
    """
    Fit ``model`` on one cross-validation fold and score both partitions.

    Written so it can be dispatched through joblib or called in a plain loop.

    model: estimator exposing fit / predict / predict_proba; it is fitted here
    train_index, valid_index: positional row indices of the two partitions
    X, Y: full feature frame and target, both sliced with ``.iloc``

    return: (roc_train, roc_val, f1_train, f1_val, mcc_train, mcc_val)
    """
    def _metrics(features, target):
        # Hard labels feed F1 and MCC; the positive-class probability feeds ROC AUC.
        labels = model.predict(features)
        positive_proba = model.predict_proba(features)[:, 1]
        return (
            f1_score(target, labels),
            roc_auc_score(target, positive_proba),
            matthews_corrcoef(target, labels),
        )

    # Slice the fold out of the full data set.
    x_fit, y_fit = X.iloc[train_index], Y.iloc[train_index]
    x_hold, y_hold = X.iloc[valid_index], Y.iloc[valid_index]

    model.fit(x_fit, y_fit)

    f1_train, roc_train, mcc_train = _metrics(x_fit, y_fit)
    f1_val, roc_val, mcc_val = _metrics(x_hold, y_hold)

    # tqdm.write keeps the message from clobbering an active progress bar.
    tqdm.write(''.join(['Train score: ', str(roc_train),
                        ' Validation score: ', str(roc_val)]))

    return roc_train, roc_val, f1_train, f1_val, mcc_train, mcc_val


In [64]:
st = time()

# Every fold gets a fresh clone, so the shared `model` object itself stays unfitted.
if n_jobs == 1:
    print('Running single process loop')
    res = [
        fit_and_score(clone(model), train_index, valid_index, X_train, Y_train)
        for train_index, valid_index in tqdm(kfolds.split(X_train, Y_train))
    ]
else:
    print('Running parallel loop')
    parallel = Parallel(n_jobs=n_jobs, verbose=1)
    fold_jobs = (
        delayed(fit_and_score)(clone(model), train_idx, valid_idx, X_train, Y_train)
        for train_idx, valid_idx in kfolds.split(X_train, Y_train)
    )
    res = parallel(fold_jobs)

end = time()

# Wall-clock duration of the whole cross-validation, in minutes.
print('Total time taken ', (end - st)/60, ' mins')

Running parallel loop


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Total time taken  0.2537138859430949  mins


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.2s finished


In [65]:
# One row per fold; column order mirrors the tuple returned by fit_and_score.
res = pd.DataFrame(
    data=res,
    columns=[
        'ROC_train', 'ROC_val',
        'F1_train', 'F1_val',
        'MCC_train', 'MCC_val',
    ],
)
res

Unnamed: 0,ROC_train,ROC_val,F1_train,F1_val,MCC_train,MCC_val
0,0.955781,0.573968,0.891278,0.590698,0.775061,0.130957
1,0.95307,0.564276,0.878913,0.571429,0.746864,0.094164
2,0.963583,0.490187,0.903073,0.494005,0.797541,-0.042885
3,0.956752,0.581071,0.892963,0.577367,0.776509,0.093464
4,0.961814,0.570696,0.915335,0.580796,0.823456,0.113985


In [66]:
# Cross-validation above only fitted clones, so `model` is still unfitted here.
# Train it on the full training set for the final predictions.
model.fit(X_train, Y_train)


LGBMClassifier(learning_rate=0.01, n_estimators=800, objective='binary')

In [154]:
# model_pipe.roc_auc_score(X_train,Y_train)
# Positive-class probability for every training row. This score is in-sample
# (the model was fitted on these same rows), so it is an optimistic figure; the
# per-fold ROC_val column is the out-of-sample estimate.
Y_train_preds = model.predict_proba(X_train)[:,1]
train_score = roc_auc_score(Y_train,Y_train_preds)

# Positive-class probability for every test row. No Y_test is defined in this
# notebook, so the test-score lines below stay disabled.
Y_test_preds = model.predict_proba(X_test)[:,1]
#test_score = roc_auc_score(Y_test,Y_test_preds)

print('Training score: ', train_score)
#print('Testing score: ', test_score)

Training score:  0.941161494227646


In [155]:
Y_test_preds.shape

(1200,)

In [None]:
def submitformat(df, test_df, threshold=0.5, start=5, step=6):
    """
    Convert predicted probabilities into the 0/1 submission frame.

    Values strictly greater than ``threshold`` become 1, all others 0. The
    result is indexed like ``test_df`` and then thinned to every ``step``-th
    row beginning at position ``start``. The defaults reproduce the original
    hard-coded behaviour (0.5 cut-off, rows 5, 11, 17, ...).

    Unlike the previous version, the caller's ``df`` is left untouched: the
    labels are built from a comparison instead of being written back into the
    input array.

    df: 1-D array / Series / DataFrame of probabilities, one per row of test_df
    test_df: frame whose index is copied onto the predictions
    threshold: probability above which a row is labelled 1
    start: position of the first row to keep
    step: keep one row out of every ``step``

    return: integer DataFrame of 0/1 labels for the selected rows
    raises: ValueError if the predictions contain NaN or their length does not
            match ``test_df``
    """
    probs = pd.DataFrame(df)
    # NaN cannot be turned into a class label; fail loudly instead of letting
    # the comparison silently map it to 0.
    if probs.isna().to_numpy().any():
        raise ValueError("predictions contain NaN and cannot be converted to class labels")

    # The comparison allocates a new boolean frame, so nothing below can leak
    # back into the caller's data.
    labels = probs > threshold
    labels.index = test_df.index
    return labels.iloc[start::step, :].astype(int)


In [156]:
# Binarise the probabilities in place at a 0.5 cut-off (values stay floats here).
# The first assignment writes 1, which is never <= 0.5, so the second line only
# touches entries that were at or below the cut-off to begin with.
Y_test_preds[Y_test_preds > 0.5] = 1
Y_test_preds[Y_test_preds <= 0.5] = 0

In [157]:
# Wrap the 1-D prediction array in a single-column frame (column label 0).
Y_test_preds = pd.DataFrame(Y_test_preds)

In [158]:
Y_test_preds.shape

(1200, 1)

In [159]:
# Label predictions by date. `test.index` holds dates only because X_test is
# the same object as `test` and preproc() replaced its index in place.
Y_test_preds.index = test.index

In [160]:
Y_test_preds.head()

Unnamed: 0_level_0,0
Date,Unnamed: 1_level_1
2012-01-19,0.0
2012-01-20,0.0
2012-01-23,0.0
2012-01-24,0.0
2012-01-25,0.0


In [161]:
# Keep every 6th row starting at position 5 (1200 rows -> 200).
# NOTE(review): presumably only the last row of each 6-row block is submitted -- confirm.
Y_test_preds = Y_test_preds.iloc[5::6, :]

In [162]:
Y_test_preds.shape

(200, 1)

In [163]:
Y_test_preds.head()

Unnamed: 0_level_0,0
Date,Unnamed: 1_level_1
2012-01-26,0.0
2012-02-09,1.0
2012-02-24,0.0
2012-03-09,1.0
2012-03-23,0.0


In [164]:
# Labels were stored as floats (0.0 / 1.0); cast to integers before export.
Y_test_preds = Y_test_preds.astype(int)

In [165]:
# Destination of the submission file. Set the SUBMISSION_PATH environment
# variable to write somewhere else; the original location remains the default.
submission_path = os.environ.get("SUBMISSION_PATH", '/Users/Kapil/Desktop/Y_test.csv')
Y_test_preds.to_csv(submission_path)
