# Load Libraries

In [1]:
import pandas as pd
import numpy as np
import random as rnd
import datetime

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
#sns.set_style('whitegrid')
%matplotlib inline

# plotly
import plotly
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier
#for normalizing data
from sklearn.preprocessing import MinMaxScaler

import cufflinks as cf
cf.go_offline()

from fastai.tabular.all import *

# scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV

from time import time


from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.base import clone

#from tune_sklearn import TuneSearchCV
#from tabulate import tabulate

import joblib


from tqdm import tqdm

# Load Data

In [60]:
# Project root on the author's machine. NOTE(review): this absolute path is
# machine-specific and must be edited before running elsewhere.
filepath = "/Users/Kapil/PycharmProjects/stock-price"

# Labelled training rows and unlabelled test rows live under <root>/data.
train = pd.read_csv(f"{filepath}/data/train.csv")
test = pd.read_csv(f"{filepath}/data/test.csv")

In [61]:
# Single seed shared by every source of randomness in the notebook.
SEED = 13
# Seed NumPy's global RNG and Python's `random` module (imported above as
# `rnd`) so that both are deterministic after a kernel restart.
np.random.seed(SEED)
rnd.seed(SEED)

In [62]:
# Features: every column of the training file except the target 'Up'.
X_trainW = train.drop(columns=['Up'])
# Target: the 'Up' column.
Y_trainW = train['Up']

# X_test is an alias of `test`, not a copy: any in-place change made through
# X_test is visible on `test` as well.
X_test = test


### Create Validation Dataset

In [63]:
# Ordered hold-out split without shuffling: the first 70% of the rows are used
# for training and the last 30% for validation.
# NOTE(review): assumes the rows are already sorted by date - confirm.
split_idx = int(0.7*len(X_trainW))

# Copy the feature slices: they are modified in place later (preprocessing), and
# a copy keeps them independent of X_trainW and avoids SettingWithCopyWarning.
X_val = X_trainW[split_idx:].copy()
Y_val = Y_trainW[split_idx:]

X_train = X_trainW[:split_idx].copy()
Y_train = Y_trainW[:split_idx]

## Functions
#### Preprocessing function

In [64]:
def preproc(df, scaler=None, fit=True):
    """Expand the 'Date' column into calendar features and min-max scale everything.

    The frame is modified IN PLACE before scaling: its index is set to the
    'Date' values, 'Date' is replaced by the date-part columns produced by
    ``add_datepart``, the 'Elapsed' column is dropped and boolean flags become
    0/1. Callers that need the untouched frame must pass a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Date' column; all remaining columns must be
        numeric once the date parts have been added.
    scaler : optional
        Scaler object exposing ``fit_transform`` / ``transform``. When omitted
        a fresh ``MinMaxScaler(feature_range=(0, 1))`` is created and fitted on
        ``df`` alone (the original behaviour). Pass one shared scaler to put
        train / validation / test data on the same scale.
    fit : bool, default True
        If True the scaler is fitted on ``df`` (``fit_transform``); if False an
        already-fitted ``scaler`` is only applied (``transform``).

    Returns
    -------
    pandas.DataFrame
        Scaled values with the post-processing column names and a fresh
        0..n-1 index (the 'Date' index set on ``df`` is not carried over).

    Raises
    ------
    ValueError
        If ``fit=False`` is requested without supplying a scaler.
    """
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already-fitted scaler")
        scaler = MinMaxScaler(feature_range=(0, 1))

    # In-place on purpose: the submission step later reads the index of the
    # frame that was passed in here.
    df.index = df['Date']
    add_datepart(df, 'Date')
    df.drop('Elapsed', axis=1, inplace=True)
    # Calendar flags (Is_month_end, ...) are booleans; make them numeric.
    df.replace({False: 0, True: 1}, inplace=True)

    scaled = scaler.fit_transform(df) if fit else scaler.transform(df)
    # Column names are read after the in-place edits so they match `scaled`.
    return pd.DataFrame(scaled, columns=df.columns)

#### Submission format function

In [65]:
def submitformat(df, test_df):
    """Turn predicted probabilities into the 0/1 submission frame.

    Values strictly greater than 0.5 become 1, everything else becomes 0. The
    result takes its index from ``test_df`` and keeps every 6th row starting
    at position 5 (positions 5, 11, 17, ...).

    Unlike the previous implementation, ``df`` is not modified: thresholding is
    done on a copy, so the caller's probabilities stay available afterwards.

    Parameters
    ----------
    df : array-like
        Predicted probabilities, one per row of ``test_df``.
    test_df : pandas.DataFrame
        Frame whose index labels the predictions; must have as many rows as
        ``df``.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 labels for the retained rows.
    """
    # copy=True guarantees the caller's buffer is never written to.
    probabilities = pd.DataFrame(df, copy=True)
    labels = (probabilities > 0.5).astype(int)
    labels.index = test_df.index
    # Keep rows 5, 11, 17, ... only.
    return labels.iloc[5::6, :].copy()

### Preprocess Data

In [66]:
# Run the same preprocessing over every split, in the order full-train, train,
# validation, test. Each call fits its own scaler on the frame it receives.
# NOTE(review): the splits are therefore scaled independently of each other.
X_trainW, X_train, X_val, X_test = [
    preproc(split) for split in (X_trainW, X_train, X_val, X_test)
]

In [67]:
# Sanity check: (rows, columns) of the preprocessed training split.
X_train.shape

(1418, 16)

In [68]:
# Sanity check: (rows, columns) of the preprocessed full training set.
X_trainW.shape

(2026, 16)

In [69]:
# Peek at the first rows to confirm the date-part columns and the 0-1 scaling.
X_train.head()

Unnamed: 0,Open,High,Low,Close,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start
0,0.012353,0.007363,0.006505,0.003186,0.0,0.0,0.0,0.033333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.009412,0.005381,0.006209,0.005213,0.0,0.0,0.019231,0.133333,0.0,0.008242,0.0,0.0,0.0,0.0,0.0,0.0
2,0.012941,0.017276,0.013601,0.015639,0.0,0.0,0.019231,0.166667,0.25,0.010989,0.0,0.0,0.0,0.0,0.0,0.0
3,0.03,0.025205,0.026316,0.026064,0.0,0.0,0.019231,0.2,0.5,0.013736,0.0,0.0,0.0,0.0,0.0,0.0
4,0.021176,0.019258,0.018332,0.019403,0.0,0.0,0.019231,0.233333,0.75,0.016484,0.0,0.0,0.0,0.0,0.0,0.0


## Models
#### Logistic Regression Tuning

In [None]:
# Hyper-parameter grid for logistic regression: 2 penalties x 8 regularisation
# strengths x 2 solvers = 32 candidates.
penalty = ['l1', 'l2']
C = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
solver = ['liblinear', 'saga']

param_grid = dict(penalty=penalty,
                  C=C,
                  solver=solver)

# random_state pins the data shuffling done by the 'liblinear' and 'saga'
# solvers. The global np.random.seed call does not reach the worker processes
# started by n_jobs=-1, so without it repeated searches can pick different
# winners. The solver given here is only a placeholder; the grid overrides it.
# NOTE(review): no `cv` is passed, so GridSearchCV uses its default splitter -
# confirm that is acceptable for date-ordered rows.
model1 = GridSearchCV(LogisticRegression(solver='liblinear', random_state=SEED),
                    param_grid=param_grid,
                    scoring='accuracy',
                    verbose=1,
                    n_jobs=-1)



## Training on the dataset

In [None]:
# Select the estimator that the training / evaluation cells below operate on.
model = model1

In [None]:
# Run the grid search on the training split only; the validation split stays unseen.
model.fit(X_train, Y_train)

## Model Performance on the Validation Set

In [None]:
# Hard 0/1 predictions: take the positive-class probability and round it.
Y_train_preds = model.predict_proba(X_train)[:,1]
Y_train_preds = np.round(Y_train_preds).astype(int)
train_score = accuracy_score(Y_train,Y_train_preds)

Y_val_preds = model.predict_proba(X_val)[:,1]
Y_val_preds = np.round(Y_val_preds).astype(int)
val_score = accuracy_score(Y_val,Y_val_preds)

print('Training score: ', train_score)
# This score is computed on the validation split, not on the test file, so it
# is labelled accordingly.
print('Validation score: ', val_score)

# Per-class precision / recall / F1 on the validation split.
print(classification_report(Y_val, model.predict(X_val)))





### Plotting Coefficients

In [None]:
# Absolute coefficient of each feature in the best estimator found by the search.
Coef = np.abs(model.best_estimator_.coef_[0])
ColumnNames = X_train.columns
lg_df = pd.DataFrame({"Coef": Coef}, index=ColumnNames)

# Largest magnitude first; expose the feature name as a column for plotting and
# rescale so the top feature has height 1.
lg_df_sorted = lg_df.sort_values(by='Coef', ascending=False)
lg_df_sorted = lg_df_sorted.assign(
    ColumnNames=lg_df_sorted.index,
    Coef=lambda frame: frame['Coef'] / frame['Coef'].max(),
)

plt.bar('ColumnNames', 'Coef', data = lg_df_sorted)
plt.xticks(rotation=90)
plt.show()

## Train on entire dataset

In [None]:
# Re-run the whole grid search on all labelled rows (training + validation)
# before predicting on the test set.
model.fit(X_trainW, Y_trainW)

## Generate submission file

In [None]:
# Positive-class probability for every preprocessed test row.
Y_test_preds = model.predict_proba(X_test)[:,1]
# Threshold at 0.5, label the rows with test's index and keep every 6th row
# starting at position 5 (see submitformat).
Y_test = submitformat(Y_test_preds, test)

### Save Model

In [None]:
# Destination folder for persisted models.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
save_path = '/Users/Kapil/PycharmProjects/stock-price/models/non-lag/'
# The file is named after the class of `model`. Since `model` is the
# GridSearchCV wrapper, this yields "GridSearchCV.joblib".
model_name = f"{type(model).__name__}.joblib"
joblib.dump(model, f"{save_path}{model_name}")


In [165]:
# Write the formatted submission frame. Y_test_preds is a NumPy array of
# probabilities and has no to_csv method; Y_test is the DataFrame built by
# submitformat and matches the output file name.
Y_test.to_csv('/Users/Kapil/Desktop/Y_test.csv')
