In [98]:
import pandas as pd
import numpy as np
from scipy.stats import randint as sp_randint
from datetime import timedelta

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import lightgbm as lg

from xgboost import XGBRegressor
from sklearn.decomposition import PCA

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing.data import QuantileTransformer

In [99]:
def BuildData():
    df=pd.read_csv('dataset/train.csv',index_col='Index')
    trainingSet, testSet = train_test_split(df, test_size=0.2,shuffle=True)
    print(len(trainingSet))
    print(len(testSet))
    trainingSet.to_csv("dataset/new_train.csv", sep=',')
    testSet.to_csv("dataset/new_test.csv", sep=',')
    return

In [100]:
def getdf():
    df=pd.read_csv('dataset/new_train.csv',index_col='Index')
    df=df.dropna()
    return df

In [101]:
def encode(df):
    df["Market"] = df["Market"].astype('category')
    df["Day"] = df["Day"].astype('category')
    df["Stock"] = df["Stock"].astype('category')
    return df

In [102]:
def scale(type_,df):
    if type_=='minmax':
        scaler = MinMaxScaler()
    elif type_=='standard':
        scaler = StandardScaler()
    elif type_=='maxabs':
        scaler= MaxAbsScaler()
    elif type_=='robust':
        scaler= RobustScaler()
    elif type_=='QuantileTransformer':
        scaler=QuantileTransformer()
    elif type_=='Normalizer':
        scaler=Normalizer()    
    
    df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']]=scaler.fit_transform(df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']])
    return df

In [103]:
def errorValue(y_pred,y,wt):
    return (sum(wt*((y_pred-y)**2)))

In [104]:
def addfeatures(df): 
    df['Date'] = pd.to_datetime("'2015-10-01'") # assumption
    df['Date'] = df['Date'] + df['Day'].map(timedelta) - timedelta(days=1)
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.weekofyear
    df['Weekday'] = df['Date'].dt.weekday
    df['DayMonth'] = df['Date'].dt.day
    df=df.drop(['Date'],axis=1)
    return df

In [113]:
df=getdf()
# df=addfeatures(df)
df=encode(df)
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y']

### Random Forest Model

In [17]:
X=x
param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 17),
              "min_samples_split": sp_randint(2, 17),
              "min_samples_leaf": sp_randint(1, 17),
              "bootstrap": [True]}
n_iter_search = 20
model = RandomForestRegressor()
RF = RandomizedSearchCV(model,param_distributions=param_dist,n_iter=n_iter_search,cv=4,verbose=2)
RF.fit(X, y)
y_pred=RF.predict(X)
print(errorValue(y_pred,y,wt),5)
print(RF.best_params_)

Fitting 4 folds for each of 20 candidates, totalling 80 fits
[CV] bootstrap=True, max_depth=None, max_features=8, min_samples_leaf=7, min_samples_split=6 
[CV]  bootstrap=True, max_depth=None, max_features=8, min_samples_leaf=7, min_samples_split=6, total=  29.9s
[CV] bootstrap=True, max_depth=None, max_features=8, min_samples_leaf=7, min_samples_split=6 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.0s remaining:    0.0s


[CV]  bootstrap=True, max_depth=None, max_features=8, min_samples_leaf=7, min_samples_split=6, total=  26.8s
[CV] bootstrap=True, max_depth=None, max_features=8, min_samples_leaf=7, min_samples_split=6 
[CV]  bootstrap=True, max_depth=None, max_features=8, min_samples_leaf=7, min_samples_split=6, total=  27.4s
[CV] bootstrap=True, max_depth=None, max_features=8, min_samples_leaf=7, min_samples_split=6 
[CV]  bootstrap=True, max_depth=None, max_features=8, min_samples_leaf=7, min_samples_split=6, total=  28.6s
[CV] bootstrap=True, max_depth=None, max_features=15, min_samples_leaf=3, min_samples_split=16 
[CV]  bootstrap=True, max_depth=None, max_features=15, min_samples_leaf=3, min_samples_split=16, total=  49.0s
[CV] bootstrap=True, max_depth=None, max_features=15, min_samples_leaf=3, min_samples_split=16 
[CV]  bootstrap=True, max_depth=None, max_features=15, min_samples_leaf=3, min_samples_split=16, total=  48.4s
[CV] bootstrap=True, max_depth=None, max_features=15, min_samples_leaf=

[CV]  bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=14, min_samples_split=3, total=   7.1s
[CV] bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=14, min_samples_split=3 
[CV]  bootstrap=True, max_depth=3, max_features=5, min_samples_leaf=14, min_samples_split=3, total=   6.3s
[CV] bootstrap=True, max_depth=3, max_features=14, min_samples_leaf=7, min_samples_split=10 
[CV]  bootstrap=True, max_depth=3, max_features=14, min_samples_leaf=7, min_samples_split=10, total=  15.6s
[CV] bootstrap=True, max_depth=3, max_features=14, min_samples_leaf=7, min_samples_split=10 
[CV]  bootstrap=True, max_depth=3, max_features=14, min_samples_leaf=7, min_samples_split=10, total=  15.3s
[CV] bootstrap=True, max_depth=3, max_features=14, min_samples_leaf=7, min_samples_split=10 
[CV]  bootstrap=True, max_depth=3, max_features=14, min_samples_leaf=7, min_samples_split=10, total=  15.2s
[CV] bootstrap=True, max_depth=3, max_features=14, min_samples_leaf=7, min_samples_split=10

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed: 27.2min finished


0.4204971306673579 5
{'bootstrap': True, 'max_depth': None, 'max_features': 12, 'min_samples_leaf': 12, 'min_samples_split': 9}


In [114]:
X=x
model_RF = RandomForestRegressor(bootstrap= True,max_depth= None, max_features=12, min_samples_leaf=12, min_samples_split= 9)
model_RF.fit(X,y)
y_pred=model_RF.predict(X)
print(errorValue(y_pred,y,wt),5)

0.42165984777146154 5


In [21]:
imp_list=np.around(np.array(model.feature_importances_),4)
feature_list=sorted(zip(imp_list, X.columns),reverse=True)
print("Features       Values")
for item in feature_list:
    print(item[1],"\t\t",round(item[0],4))

Features       Values
x4 		 0.1856
x3E 		 0.1313
x3D 		 0.075
x5 		 0.0725
x6 		 0.0682
x3A 		 0.0617
x3C 		 0.0534
x3B 		 0.0484
x1 		 0.0465
Market 		 0.0436
x0 		 0.042
x2 		 0.0413
Day 		 0.0361
Stock 		 0.0334
DayMonth 		 0.0222
Week 		 0.0202
Weekday 		 0.0098
Month 		 0.0072
Year 		 0.0017


### XGBoost Model

In [116]:
df=getdf()
# df=addfeatures(df)
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y']

In [36]:
X=x
param_dist = {'learning_rate':[.1,.01,.001],'max_depth':[2,7,11]}
model = XGBRegressor(booster='dart')
grid_XG = GridSearchCV(model,param_dist, cv=3,verbose = 2)
grid_XG.fit(X, y)
y_pred=grid_XG.predict(X)
print(errorValue(y_pred,y,wt))
print(grid_XG.best_params_)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f140adece80>>
Traceback (most recent call last):
  File "/home/jithin/anaconda3/lib/python3.6/site-packages/xgboost/core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] learning_rate=0.1, max_depth=2 ..................................
[CV] ................... learning_rate=0.1, max_depth=2, total=  52.7s
[CV] learning_rate=0.1, max_depth=2 ..................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.3s remaining:    0.0s


[CV] ................... learning_rate=0.1, max_depth=2, total=  55.7s
[CV] learning_rate=0.1, max_depth=2 ..................................
[CV] ................... learning_rate=0.1, max_depth=2, total=  52.7s
[CV] learning_rate=0.1, max_depth=7 ..................................
[CV] ................... learning_rate=0.1, max_depth=7, total= 2.7min
[CV] learning_rate=0.1, max_depth=7 ..................................
[CV] ................... learning_rate=0.1, max_depth=7, total= 2.6min
[CV] learning_rate=0.1, max_depth=7 ..................................
[CV] ................... learning_rate=0.1, max_depth=7, total= 2.7min
[CV] learning_rate=0.1, max_depth=11 .................................
[CV] .................. learning_rate=0.1, max_depth=11, total= 4.2min
[CV] learning_rate=0.1, max_depth=11 .................................
[CV] .................. learning_rate=0.1, max_depth=11, total= 4.1min
[CV] learning_rate=0.1, max_depth=11 .................................
[CV] .

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 30.9min finished


0.45000201694990305
{'learning_rate': 0.1, 'max_depth': 7}


In [117]:
X=x
model_XGB = XGBRegressor(booster='dart',learning_rate= 0.1, max_depth= 7)
model_XGB.fit(X,y)
y_pred=model_XGB.predict(X)
print(errorValue(y_pred,y,wt),5)

0.4551206270632664 5


### Light GBM 

In [119]:
df=getdf()
# df=addfeatures(df)
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y']

In [121]:
train = lg.Dataset(x, y, categorical_feature=[0,1,2], weight=df.Weight, free_raw_data=False)

parameters={'boosting_type':'gbdt','learning_rate':.05}
boosts = 200
num_ensembles = 5
y_pred = 0.0

for i in range(num_ensembles):
    model = lg.train(params=parameters,train_set=train,num_boost_round=boosts ) 
    y_pred +=  model.predict(data=x)
y_pred /= num_ensembles

# model_lgbm = lg.train(train_set=train,params=)
# y_pred = model_lgbm.predict(data=x)
print(errorValue(y_pred,y,wt))



0.3962025107608086


# Inference and learning from this Project

The Project was based on a challenge which was previously attempted by the best minds in the industry with a considerable price money. This project introduced me to the concept of Boosting in particular the XG Boost and Light GBM. I got the least error by applying the Light GBM model while gradient boosting continued to give a very high error the test data. 

One challenge i faced was when i tried to  use get dummies to encode the categorical data of Market, Stock and Day which actualy breaks my memory. So i had to stop working on it and my idea. 

Further improvement. 

1) Categorizing the stocks (creating a unsupervised learning model and get some clusters of stock like (High Performing Stock, Low Performing stock etc) and then use this as one of the feature.

2) Feature X4 and a combination of X4 with other variables as new feature (This method was used by one of the top 3 candidates for their model)

# Testing with Test Data

In [107]:
def getdf_test():
    df=pd.read_csv('dataset/new_test.csv',index_col='Index')
    df=df.dropna()
    return df

In [111]:
df=getdf_test()
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y']

In [115]:
X=x
y_pred=model_RF.predict(X)
print(errorValue(y_pred,y,wt))

0.42165984777146154


In [118]:
X=x
Y=y
Wt=wt

y_pred=model_XGB.predict(X)
print(errorValue(y_pred,Y,Wt))

0.4551206270632664


In [123]:
df=getdf()
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y'] 

y_pred = model.predict(data=x)
print(errorValue(y_pred,y,wt))

0.3962025107608086
