In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import lightgbm as lg
from tqdm import tqdm
import gc

from xgboost import XGBRegressor
from sklearn.decomposition import PCA

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing.data import QuantileTransformer


In [2]:
def BuildData():
    df=pd.read_csv('dataset/train.csv',index_col='Index')
    trainingSet, testSet = train_test_split(df, test_size=0.2,shuffle=True)
    print(len(trainingSet))
    print(len(testSet))
    trainingSet.to_csv("dataset/new_train.csv", sep=',')
    testSet.to_csv("dataset/new_test.csv", sep=',')
    return

In [3]:
def getdf():
    df=pd.read_csv('dataset/new_train.csv',index_col='Index')
    df=df.dropna()
    return df

In [4]:
def encode(df):
    df["Market"] = df["Market"].astype('category')
    df["Day"] = df["Day"].astype('category')
    df["Stock"] = df["Stock"].astype('category')
    return df

In [5]:
def scale(type_,df):
    if type_=='minmax':
        scaler = MinMaxScaler()
    elif type_=='standard':
        scaler = StandardScaler()
    elif type_=='maxabs':
        scaler= MaxAbsScaler()
    elif type_=='robust':
        scaler= RobustScaler()
    elif type_=='QuantileTransformer':
        scaler=QuantileTransformer()
    elif type_=='Normalizer':
        scaler=Normalizer()    
    
    df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']]=scaler.fit_transform(df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']])
    return df

In [6]:
def errorValue(y_pred,y,wt):
    return (sum(wt*((y_pred-y)**2)))

In [7]:
df=getdf()
df=encode(df)
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y']  

### Random Forest Regressor Value

In [12]:
X=x
parameters = {'n_estimators':[10,100]}
model = RandomForestRegressor()
grid_RF = GridSearchCV(model,parameters, cv=2,verbose = 2,n_jobs=-1)
grid_RF.fit(X, y)
y_pred=grid_RF.predict(X)
print(errorValue(y_pred,y,wt),5)
print(grid_RF.best_params_)

Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV] n_estimators=10 .................................................
[CV] n_estimators=10 .................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] .................................. n_estimators=10, total=  56.3s
[CV] .................................. n_estimators=10, total=  56.6s
[CV] ................................. n_estimators=100, total= 6.3min
[CV] ................................. n_estimators=100, total= 6.4min


[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  6.5min finished


0.20785636145517128 5
{'n_estimators': 100}


In [9]:
X=x
model=RandomForestRegressor(bootstrap=True,min_samples_leaf= 5,max_features= 3,n_estimators=100)
model.fit(X,y)
y_pred=model.predict(X)
print(errorValue(y_pred,y,wt),5)

0.3429572965478771 5


### Feature Importance

In [59]:
# print(X.columns)
imp_list=np.around(np.array(model.feature_importances_),4)
feature_list=sorted(zip(imp_list, X.columns),reverse=True)
print("Features  Values")
for item in feature_list:
    print(item[1],"\t",round(item[0],4))


Features  Values
x4 	 0.1561
x3E 	 0.104
x3D 	 0.0866
x5 	 0.077
x3A 	 0.0714
x1 	 0.0691
x6 	 0.0675
x3C 	 0.0669
x3B 	 0.0652
x2 	 0.0597
Day 	 0.0512
Stock 	 0.0484
x0 	 0.0388
Market 	 0.038


#### Random Forest gave an error of.2404,.2383,.2389 

#### Best Score of .3363

In [12]:
X=x
Y=y
Wt=wt
X_encode=pd.get_dummies(X, prefix=['Market'], columns=['Market'])


In [68]:
parameters = {}
model = XGBRegressor()
grid_XG = GridSearchCV(model,parameters, cv=3,verbose = 2)
grid_XG.fit(X_encode, Y)
y_pred=grid_XG.predict(X_encode)
print(errorValue(y_pred,Y,Wt))

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] learning_rate=0.7 ...............................................
[CV] ................................ learning_rate=0.7, total=  52.4s
[CV] learning_rate=0.7 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   53.3s remaining:    0.0s


[CV] ................................ learning_rate=0.7, total=  50.3s
[CV] learning_rate=0.7 ...............................................
[CV] ................................ learning_rate=0.7, total=  50.2s
[CV] learning_rate=0.2 ...............................................
[CV] ................................ learning_rate=0.2, total=  47.7s
[CV] learning_rate=0.2 ...............................................
[CV] ................................ learning_rate=0.2, total=  48.9s
[CV] learning_rate=0.2 ...............................................
[CV] ................................ learning_rate=0.2, total=  47.9s
[CV] learning_rate=0.05 ..............................................
[CV] ............................... learning_rate=0.05, total=  40.4s
[CV] learning_rate=0.05 ..............................................
[CV] ............................... learning_rate=0.05, total=  40.3s
[CV] learning_rate=0.05 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  7.1min finished


0.5178516613707286


#### XGBoost Gave an error of .010467 , (.5220 with hyper parameter tuning),.5482, .5178

In [None]:
# X=x[100000:200000]
# X_encode=pd.get_dummies(X, prefix=['Market','Stock','Day'], columns=['Market','Stock','Day'])
# Y=y[100000:200000]
# Wt=wt[100000:200000]
# y_pred=grid.predict(X_encode)
# print(errorValue(y_pred,Y,Wt))


In [None]:
# X_df=X_encode[X_encode.columns[range(11,3295)]]

In [None]:
pca = PCA(n_components=0.95,whiten=True)
X_pca = pca.fit_transform(X_encode)
print('Original number of features:', X_encode.shape[1])
print('Reduced number of features:', X_pca.shape[1])

### So PCA didn't Work due to Memmory Error. I guess the idea to split categorical value into columns wont work in this system as dataset gets huge 

In [47]:
df=getdf()
# df=encode(df)
# df=scale('standard',df)
# wt=df['Weight']
# x=df.drop(['Weight','y'],axis=1)    
# y=df['y']

In [48]:
def feature_engineering(data_frame):

    # creating new features
    data_frame['new']  = data_frame['x3B'] - data_frame['x5']
    data_frame['new2']  = data_frame['x3C'] - data_frame['x4']
    
    # scalling up "small" features
    small_features_1 = ['x0','x2',"x4"]
    small_features_2 = ["x3A",'x1', "x3B", "x3C", "x3D", "x3E", "x5", "new", "new2"]
    data_frame[small_features_1]= data_frame[small_features_1]*1000
    data_frame[small_features_2]= data_frame[small_features_2]*100000

In [49]:
feature_engineering(df)
X_train = df.drop(['y','Weight'],1)
Y = df.y

In [56]:
# load data into train lightgbm dataset
# notice I'm scaling up the target, making first two columns as categorical features, and load weights
train = lg.Dataset(X_train, Y, categorical_feature=[0, 1], weight=df.Weight, free_raw_data=False)

# hyperparameters for the model
parameters = {'learning_rate': '0.9'}

boosts = 200
num_ensembles = 5
y_pred = 0.0

# average 5 different models 
for i in tqdm(range(num_ensembles)):
    model = lg.train(parameters, train_set=train, num_boost_round=boosts + i + 2) 
    y_pred +=  model.predict(data=X_train)
y_pred /= num_ensembles
gc.collect()

# 'num_leaves': 526, 
#  'max_bin': 650, 'feature_fraction': '0.450', 
#  'learning_rate': '0.009', 'reg_lambda': 3, 'bagging_freq': 2,
#  'min_data_in_leaf': 142, 'colsample_bytree': '0.670', 
#  'metric': 'rmse', 'verbose': 1

100%|██████████| 5/5 [01:30<00:00, 18.01s/it]


43

In [57]:
print(errorValue(y_pred,Y,wt))

0.260838974997406


#### Light GBM is giving an error Value of .38226, .3843, .5292(tuning), .2608(tuning -learning rate .9)

## Light GBM Version -Jithin

In [70]:
df=getdf()
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y']  

In [75]:
train = lg.Dataset(x, y, categorical_feature=[0, 1, 2], weight=df.Weight, free_raw_data=False)
parameters = {'learning_rate': '0.9'}

model_lgb = lg.train(params=parameters,train_set=train, num_boost_round=300)
y_pred = model_lgb.predict(data=x)



In [76]:
print(errorValue(y_pred,Y,wt))

0.1509990547939315


#### on First Run - Value is .3134
#### on Second Run - Value is .15099

# Testing on the TEST Data now. 

In [82]:
def getdf_test():
    df=pd.read_csv('dataset/new_test.csv',index_col='Index')
    df=df.dropna()
    return df

In [83]:
df=getdf_test()
df=encode(df)
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y']

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7fe65fe7db00>>
Traceback (most recent call last):
  File "/home/jithin/anaconda3/lib/python3.6/site-packages/xgboost/core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7fe66544bc18>>
Traceback (most recent call last):
  File "/home/jithin/anaconda3/lib/python3.6/site-packages/xgboost/core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


In [84]:
## Applying Random Forest Model on Test Data

X=x
y_pred=grid_RF.predict(X)
print(errorValue(y_pred,y,wt),5)

0.17512580135567665 5


In [86]:
## Applying XG Boost Model on Test Data

X=x
Y=y
Wt=wt

X_encode=pd.get_dummies(X, prefix=['Market'], columns=['Market'])
y_pred=grid_XG.predict(X_encode)
print(errorValue(y_pred,Y,Wt))

4.1323489853611735


In [88]:
## Applying LightGBM model on Test Data

df=getdf()
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y'] 

y_pred = model_lgb.predict(data=x)
print(errorValue(y_pred,y,wt))


0.1509990547939315


### Hence Result :- Light GBM model came up with the least Error Value of .15 which is great for the test data. 

# Inference and learning from this Project

The Project was based on a challenge which was previously attempted by the best minds in the industry with a considerable price money. This project introduced me to the concept of Boosting in particular the XG Boost and Light GBM. I got the least error by applying the Light GBM model while gradient boosting continued to give a very high error the test data. 

One challenge i faced was when i tried to  use get dummies to encode the categorical data of Market, Stock and Day which actualy breaks my memory. So i had to stop working on it and my idea. 

Further improvement. 

1) Categorizing the stocks (creating a unsupervised learning model and get some clusters of stock like (High Performing Stock, Low Performing stock etc) and then use this as one of the feature.

2) Feature X4 and a combination of X4 with other variables as new feature (This method was used by one of the top 3 candidates for their model)


