In [1]:
import pandas as pd
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,explained_variance_score,mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder,OneHotEncoder


#Scaling 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing.data import QuantileTransformer



In [115]:
def BuildData():
    df=pd.read_csv('dataset/train.csv',index_col='Index')
    trainingSet, testSet = train_test_split(df, test_size=0.2,shuffle=True)
    print(len(trainingSet))
    print(len(testSet))
    trainingSet.to_csv("dataset/new_train.csv", sep=',')
    testSet.to_csv("dataset/new_test.csv", sep=',')
    return

499053
124764


In [2]:
def getxy():
    df=pd.read_csv('dataset/new_train.csv',index_col='Index')
    df=df.dropna()
#     x_train, x_test, y_train, y_test = train_test_split(df.drop(['y',],axis=1),df['y'],test_size=0.20)
#     x_train_weight=x_train['Weight']
#     x_test_weight=x_test['Weight']
#     x_train=x_train.drop(['Weight'], axis=1)
#     x_test=x_test.drop(['Weight'], axis=1)
#     return df,x_train,x_train_weight,x_test,x_test_weight,y_train,y_test
    return df

In [3]:
def scaling(type_,df):
    if type_=='minmax':
        scaler = MinMaxScaler()
    elif type_=='standard':
        scaler = StandardScaler()
    elif type_=='maxabs':
        scaler= MaxAbsScaler()
    elif type_=='robust':
        scaler= RobustScaler()
    elif type_=='QuantileTransformer':
        scaler=QuantileTransformer()
    elif type_=='Normalizer':
        scaler=Normalizer()    
#     df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']]=scaling('standard',df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']])
    df=scaler.fit_transform(df)
    return df

In [5]:
df, X_train,x_train_weight,X_test,x_test_weight,y_train,y_test=getxy()
x_train=scaling('minmax',X_train)
x_test=scaling('minmax',X_test)

In [40]:
mylist=[LinearRegression(),Ridge(),Lasso(),KNeighborsRegressor(),SVR(),DecisionTreeRegressor(),RandomForestRegressor()]
nameList=["Linear Regression","Ridge Linear Regression","Lasso Regression","KNeighborsRegressor","Support Vector Regression","Decision Tree Regression","Random Forest Regression"]
count=0
for item in mylist:
    eqn=item
    eqn.fit(x_train,y_train)
    y_pred_train = eqn.predict(x_train)
    y_pred_test = eqn.predict(x_test)
    rmse_train = round(mean_absolute_error(y_train,y_pred_train,sample_weight=x_train_weight),10)
    rmse_test = round(mean_absolute_error(y_test,y_pred_test,sample_weight=x_test_weight),10)

    print("***",nameList[count],"***")
    print("RMSE for training data:", rmse_train)
    print("RMSE for test data:", rmse_test)
    print("R2 Score for training:",r2_score(y_train, y_pred_train))
    print("R2 Score for test:",r2_score(y_test, y_pred_test))
    print("Model Score :",eqn.score(x_train,y_train))
    count+=1
    print()

*** Linear Regression ***
RMSE for training data: 0.0001344566
RMSE for test data: 0.0001382038
R2 Score for training: 0.0446625832837
R2 Score for test: 0.031456376233
Model Score : 0.0446625832837

*** Ridge Linear Regression ***
RMSE for training data: 0.0001375901
RMSE for test data: 0.0001403041
R2 Score for training: 0.0378470413338
R2 Score for test: 0.0551600862689
Model Score : 0.0378470413338

*** Lasso Regression ***
RMSE for training data: 0.0001522074
RMSE for test data: 0.0001529027
R2 Score for training: 0.0
R2 Score for test: -7.45191013318e-06
Model Score : 0.0

*** KNeighborsRegressor ***
RMSE for training data: 0.0001052868
RMSE for test data: 0.0001336143
R2 Score for training: 0.317844250004
R2 Score for test: -0.0281755442727
Model Score : 0.317844250004

*** Support Vector Regression ***
RMSE for training data: 0.0022587315
RMSE for test data: 0.0022588817
R2 Score for training: -4.80238311513
R2 Score for test: -4.99624264647
Model Score : -4.80238311513

*** De

### Cross Validation Model

In [None]:
df,x_train,x_train_weight,x_test,x_test_weight,y_train,y_test=getxy()
x_train=scaling('minmax',x_train)
x_test=scaling('minmax',x_test)

In [46]:
clf = RandomForestRegressor()
scores = cross_validate(clf, x_train, y_train, cv=10,scoring=('r2','neg_mean_absolute_error','neg_explained_variance_score'))
print("MAE Train :",scores['train_neg_explained_variance_score'].mean())
print("MAE Test :",scores['test_neg_explained_variance_score'].mean())
print("R2 Train:",scores['train_r2'].mean())
print("R2 Test:",scores['test_r2'].mean())

RMSE Train : -0.000246577947702
RMSE Test : -0.000398567224802
R2 Train: 0.785524022607
R2 Test: 0.0954744606073


### Checking Various Scales and identify the best scaling for this problem

In [8]:
scaleList='minmax standard maxabs robust QuantileTransformer Normalizer'.split()
for item in scaleList:
    x_train=scaling(item,X_train)
    x_test=scaling(item,X_test)
    
    model = LinearRegression()
    scores = cross_validate(model, x_train, y_train, cv=10,scoring=('r2','neg_mean_absolute_error'),n_jobs=-1)
    
    print("***",item,"***")
    print("MAE Train :",scores['train_neg_mean_absolute_error'].mean())
    print("MAE Test :",scores['test_neg_mean_absolute_error'].mean())
    print("R2 Train:",scores['train_r2'].mean())
    print("R2 Test:",scores['test_r2'].mean())
    print()

    

*** minmax ***
MAE Train : -0.000416616658723
MAE Test : -0.000418689667231
R2 Train: 0.0483965106633
R2 Test: -1.24426989209

*** standard ***
MAE Train : -0.000416616658723
MAE Test : -0.000418689667231
R2 Train: 0.0483965106633
R2 Test: -1.24426989209

*** maxabs ***
MAE Train : -0.000416616658723
MAE Test : -0.000418689667231
R2 Train: 0.0483965106633
R2 Test: -1.24426989209

*** robust ***
MAE Train : -0.000416616658723
MAE Test : -0.000418689667231
R2 Train: 0.0483965106633
R2 Test: -1.24426989209

*** QuantileTransformer ***
MAE Train : -0.000432378159426
MAE Test : -0.000432404727004
R2 Train: 0.0563959546827
R2 Test: 0.0570229344205

*** Normalizer ***
MAE Train : -0.000422517089167
MAE Test : -0.000422756207492
R2 Train: 0.0501181049947
R2 Test: 0.0424907401237



##### Min Max, Maxabs, Standard  and robust all came up with almost similar RMSE. Hence moving forward i'll be using standard Scaler.

In [78]:
df,x_train,x_train_weight,x_test,x_test_weight,y_train,y_test=getxy()
X_train=scaling('standard',x_train)
y_train=y_train.reshape(-1,1)
y_train=scaling('standard',y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


In [66]:
featureName=list(df.drop(['y','Weight'],axis=1).columns)
featureName
# featureList=[0,1,2,3,4,5,6,7,8,9,10,12,13]
# for i in range(len(featureList)):
#     x_train=np.delete(X_train,i,1)
    
#     model = LinearRegression()
#     scores = cross_validate(model, x_train, y_train, cv=10,scoring=('r2','neg_mean_absolute_error'),n_jobs=-1)
#     aveError=np.mean([scores['train_neg_mean_absolute_error'].mean(),scores['test_neg_mean_absolute_error'].mean()])
#     print("Error :",aveError)



['Market',
 'Day',
 'Stock',
 'x0',
 'x1',
 'x2',
 'x3A',
 'x3B',
 'x3C',
 'x3D',
 'x3E',
 'x4',
 'x5',
 'x6']

In [52]:
np.mean([[1,2,3],[3,4,5]])

3.0

In [2]:
def getdf():
    df=pd.read_csv('dataset/new_train.csv',index_col='Index')
    df=df.dropna()
    return df

In [3]:
def encode(df):
    df["Market"] = df["Market"].astype('category')
    df["Day"] = df["Day"].astype('category')
    df["Stock"] = df["Stock"].astype('category')
    return df

In [4]:
def scale(type_,df):
    if type_=='minmax':
        scaler = MinMaxScaler()
    elif type_=='standard':
        scaler = StandardScaler()
    elif type_=='maxabs':
        scaler= MaxAbsScaler()
    elif type_=='robust':
        scaler= RobustScaler()
    elif type_=='QuantileTransformer':
        scaler=QuantileTransformer()
    elif type_=='Normalizer':
        scaler=Normalizer()    
#     df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']]=scaling('standard',df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']])
    
    
    df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']]=scaler.fit_transform(df[['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']])
    return df

In [5]:
def errorValue(y_pred,y,wt):
    return (sum(wt*((y_pred-y)**2)))

In [6]:
def sub_lists(my_list):
    subs = [[]]
    for i in range(len(my_list)):
        n = i+1
        while n <= len(my_list):
            sub = my_list[i:n]
            subs.append(sub)
            n += 1
    return subs

In [9]:
df=getdf()
df=encode(df)
df=scale('standard',df)
wt=df['Weight']
x=df.drop(['Weight','y'],axis=1)    
y=df['y']    

In [169]:
# features=['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']
X=x[['x4','x0']]
model = LinearRegression()
parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid = GridSearchCV(model,parameters, cv=4)
grid.fit(X, y)
y_pred=grid.predict(X)
print("Error Calculated :",errorValue(y_pred,y,wt))




# # scores = cross_validate(model, x, y, cv=10,scoring=('neg_mean_absolute_error'))
# print(np.mean([scores['test_score'].mean(),scores['train_score'].mean()]))

Error Calculated : 0.7460346979466801


In [194]:
features=['x0','x1','x2','x3A','x3B','x3C','x3D','x3E','x4','x5','x6']
featureList=sub_lists(features)
# featureList[50]
mainList=['Market','Day','Stock']
mainList.extend(featureList[11])
print(mainList)

['Market', 'Day', 'Stock', 'x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5', 'x6']


In [201]:
result=[]
for i in range(len(featureList)):
    mainList=['Market','Day','Stock']
    mainList.extend(featureList[i])
    X=x[mainList]
    model = LinearRegression()
    parameters = {}
    grid = GridSearchCV(model,parameters, cv=4)
    grid.fit(X, y)
    y_pred=grid.predict(X)
    print(featureList[i],":",round(errorValue(y_pred,y,wt),5))
    result.append([mainList,errorValue(y_pred,y,wt)])
        

[] : 0.80655
['x0'] : 0.80653
['x0', 'x1'] : 0.80652
['x0', 'x1', 'x2'] : 0.80654
['x0', 'x1', 'x2', 'x3A'] : 0.80764
['x0', 'x1', 'x2', 'x3A', 'x3B'] : 0.80904
['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C'] : 0.80621
['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D'] : 0.80364
['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E'] : 0.80033
['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4'] : 0.72906
['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5'] : 0.72888
['x0', 'x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5', 'x6'] : 0.72945
['x1'] : 0.80654
['x1', 'x2'] : 0.80656
['x1', 'x2', 'x3A'] : 0.80766
['x1', 'x2', 'x3A', 'x3B'] : 0.80906
['x1', 'x2', 'x3A', 'x3B', 'x3C'] : 0.80623
['x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D'] : 0.80365
['x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E'] : 0.80034
['x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4'] : 0.72909
['x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E', 'x4', 'x5'] : 0.72891
['x1', 'x2', 'x3A', 'x3B', 'x3C', 'x3D', 'x3E'

In [215]:
result_df=pd.DataFrame(result,columns=['Feature Name','Score'])
result_df=result_df.sort_values(by=['Score'])
print(result_df.head(1))

                                         Feature Name     Score
10  [Market, Day, Stock, x0, x1, x2, x3A, x3B, x3C...  0.728882


In [218]:
X=x
parameters = {}
model = RandomForestRegressor()
grid = GridSearchCV(model,parameters, cv=4,verbose = 2,n_jobs=-1)
grid.fit(X, y)
y_pred=grid.predict(X)
print(errorValue(y_pred,y,wt),5)

Fitting 4 folds for each of 1 candidates, totalling 4 fits
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................................. , total= 1.4min
[CV] ................................................. , total= 1.4min
[CV] ................................................. , total= 1.4min
[CV] ................................................. , total= 1.5min


[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.5min finished


0.23982239196780128 5


In [14]:
X=x
X=pd.get_dummies(x, prefix=['Market','Stock'], columns=['Market','Stock'])
# parameters = {}
# model = XGBRegressor()
# grid = GridSearchCV(model,parameters, cv=4,verbose = 2,n_jobs=-1)
# grid.fit(X, y)
# y_pred=grid.predict(X)
# print(errorValue(y_pred,y,wt),5)
X.head()

Unnamed: 0_level_0,Day,x0,x1,x2,x3A,x3B,x3C,x3D,x3E,x4,...,Stock_3013,Stock_3014,Stock_3015,Stock_3016,Stock_3017,Stock_3018,Stock_3019,Stock_3020,Stock_3021,Stock_3022
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
495730,539,-0.004497,-0.134356,-0.233919,-0.056778,-0.080729,-0.072594,-0.055064,-0.048578,-0.400547,...,0,0,0,0,0,0,0,0,0,0
325010,223,-0.004498,-0.216543,-0.351575,-0.020525,-0.03574,-0.020605,-0.012077,0.026079,0.848011,...,0,0,0,0,0,0,0,0,0,0
11980,86,-0.004497,-0.117897,-0.325717,-0.041993,-0.054836,-0.034117,-0.020956,0.009538,0.331451,...,0,0,0,0,0,0,0,0,0,0
550150,667,-0.004493,-0.137405,-0.242465,-0.064721,-0.092181,-0.080058,-0.062987,-0.058309,-0.662396,...,0,0,0,0,0,0,0,0,0,0
234071,64,-0.004496,1.614595,4.074102,-0.06366,-0.090432,-0.078764,-0.062149,-0.057613,-0.427348,...,0,0,0,0,0,0,0,0,0,0


In [16]:
model = LinearRegression()
model.fit(X,y)
y_pred=grid.predict(X)
print(errorValue(y_pred,y,wt))

MemoryError: 