In [1069]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
import xgboost as xgb

import matplotlib.pyplot as plt

% matplotlib inline

import seaborn as sns

In [1070]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [1071]:
train_magic = pd.read_csv('../input/train_magic.csv')
test_magic = pd.read_csv('../input/test_magic.csv')

In [1072]:
train['meanX0'] = train_magic['meanX0']
test['meanX0'] = test_magic['meanX0']

In [1073]:
y_train = train['y'].values

y_mean = np.mean(y_train)

In [1074]:
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X376,X377,X378,X379,X380,X382,X383,X384,X385,meanX0
0,0,130.81,k,v,at,a,d,u,j,o,...,0,1,0,0,0,0,0,0,0,99.491818
1,6,88.53,k,t,av,e,d,y,l,o,...,0,0,0,0,0,0,0,0,0,103.974834
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,1,0,0,0,112.552235
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,93.724575
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,95.121361


In [1075]:
"""
features = ['X14', 'X19', 'X20', 'X28', 'X29', 'X43', 'X46', 'X47', 'X48', 'X51', 'X52', 'X54',
            'X61', 'X66', 'X68', 'X71', 'X75', 'X76', 'X80', 'X84', 'X85', 'X96', 'X98', 'X101',
            'X108', 'X111', 'X113', 'X115', 'X118', 'X119', 'X120', 'X126', 'X127', 'X128', 'X130',
            'X132', 'X134', 'X136', 'X142', 'X147', 'X148', 'X150', 'X151', 'X155', 'X156', 'X157',
            'X158', 'X159', 'X162', 'X166', 'X170', 'X171', 'X178', 'X179', 'X180', 'X185', 'X187',
            'X189', 'X191', 'X198', 'X208', 'X209', 'X215', 'X222', 'X223', 'X224', 'X228', 'X229',
            'X232', 'X234', 'X238', 'X241', 'X244', 'X250', 'X251', 'X255', 'X256', 'X261', 'X263',
            'X264', 'X272', 'X273', 'X275', 'X276', 'X279', 'X286', 'X300', 'X301', 'X304', 'X306', 
            'X311', 'X313', 'X314', 'X315', 'X316', 'X328', 'X331', 'X343' ,'meanX0']

'X355', =
'X354' +
'X352' ---xxxxxx
, 'X350' =
, 'X349' =
, 'X348' ++
, 'X343' ++
, 'X331' --xxxxx
, 'X328' =
, 'X316' ------b
, 'X315'-----------b
, 'X314'-------b
, 'X313'+
'X311'--xxxxxxx
, 'X306'+
, 'X304' =
, 'X301' =
, 'X263' =
, 'X261'-------------b
, 'X300'++
, 'X286'++
, 'X279' =
, 'X276'-------b
, 'X275'-------b
, 'X273'---b
, 'X272'---b
'X264'--b
, 'X256'---xxxxxxxxxxxx
, 'X255'+++
, 'X251' =
, 'X250' =
, 'X244' =
, 'X241'++
, 'X238' =
, 'X234' ++
'X232' ++
, 'X229'------b
, 'X228'--b
, 'X224'++
, 'X223'----b
, 'X222'+++++
, 'X215'---b
, 'X209'++
, 'X208' =
, 'X198'---b
, 'X191'------b
 'X189'++
 , 'X187' =
 , 'X185'++
 , 'X180'+++++
 , 'X179'++
 , 'X178'---b
 , 'X171' =
 , 'X170' ++
 , 'X166'++
 , 'X162'---b
 , 'X159'---b
  'X158'---b
  , 'X157'---b
  , 'X156'-b
  , 'X155'-b
  , 'X151'-------b
  , 'X150'-b
  , 'X148'-------b
  , 'X147'-b
  , 'X142'-b
  , 'X136'-b
  
"""
features = ['X14', 'X19', 'X20', 'X28', 'X29', 'X43', 'X46', 'X47', 'X48', 'X51', 'X52', 'X54',
            'X61', 'X66', 'X68', 'X71', 'X75', 'X76', 'X80', 'X84', 'X85', 'X96', 'X98', 'X101',
            'X108', 'X111', 'X113', 'X115', 'X118', 'X119', 'X120', 'X126', 'X127', 'X128', 'X130',
            'X132', 'X134'
           
              
      , 'X136', 'X142', 'X147', 'X148', 'X150' , 'X151' , 'X155' , 'X156', 'X157',
            'X158', 'X159', 'X162', 'X178', 'X191', 'X198', 'X215', 'X223' , 'X228', 'X229' ,
            'X264', 'X272' , 'X273', 'X275', 'X276', 'X261', 'X314', 'X315', 'X316' ,'meanX0']

In [1076]:
train = train[features]
test = test[features]

In [1077]:
train.shape

(4209, 66)

In [1078]:
#xgbm = xgb.sklearn.XGBRegressor(max_depth=4, learning_rate=0.005, subsample=0.9, base_score=y_mean,
#                                objective='reg:linear', n_estimators=1000)

xgbm = xgb.sklearn.XGBRegressor(max_depth=4, learning_rate=0.0045, subsample=0.98, base_score=y_mean,
                                objective='reg:linear', n_estimators=1000)

In [1079]:
rf = RandomForestRegressor(n_estimators=250, n_jobs=4, min_samples_split=25, min_samples_leaf=25, max_depth=3)

rf2 = RandomForestRegressor(n_estimators=1000)
                           
et = ExtraTreesRegressor(n_estimators=100, n_jobs=4, min_samples_split=25, min_samples_leaf=35, max_features=150)

xgbm2 = xgb.sklearn.XGBRegressor(max_depth=4, learning_rate=0.005, subsample=0.9, base_score=y_mean,
                                objective='reg:linear', n_estimators=1000)

en = ElasticNet()

lreg = LinearRegression()

In [1080]:
results = cross_val_score(xgbm, train, y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5729 (0.0735)


In [1081]:
xgb_model = xgb.sklearn.XGBRegressor(max_depth=3, learning_rate=0.0045, subsample=0.921,
                                     objective='reg:linear', n_estimators=900, base_score=y_mean, min_child_weight=3
                                    , colsample_bytree=1)

results = cross_val_score(xgb_model, train, y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5739 (0.0720)


In [1082]:
xgbm3 = xgb.sklearn.XGBRegressor(max_depth=3, learning_rate=0.0045, subsample=0.94, base_score=y_mean,
                                objective='reg:linear', n_estimators=1100, min_child_weight=3)

results = cross_val_score(xgbm3, train, y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5741 (0.0727)


# Models With KBest

###### input features for KBest

k_features = ['X14', 'X19', 'X20', 'X28', 'X29', 'X43', 'X46', 'X47', 'X48', 'X51', 'X52', 'X54',
            'X61', 'X66', 'X68', 'X71', 'X75', 'X76', 'X80', 'X84', 'X85', 'X96', 'X98', 'X101',
            'X108', 'X111', 'X113', 'X115', 'X118', 'X119', 'X120', 'X126', 'X127', 'X128', 'X130',
            'X132', 'X134'  
      , 'X136', 'X142', 'X147', 'X148', 'X150' , 'X151' , 'X155' , 'X156', 'X157',
    'X158', 'X159', 'X162', 'X178', 'X191', 'X198', 'X215', 'X223' , 'X228', 'X229' ,
    'X264', 'X272' , 'X273', 'X275', 'X276', 'X261', 'X314', 'X315', 'X316' ,'meanX0']

In [1083]:
from sklearn.feature_selection import SelectKBest

In [1084]:
# k = 50, gives cv r2_score of 0.5733 with xgbm
# k = 45, cv r2 = 0.5735

kbest = SelectKBest(k=45) 

k_train = kbest.fit_transform(train, y_train)

k_test = kbest.transform(test)

print("Final Train Shape: {}\nFinal Test Shape: {}".format(k_train.shape, k_test.shape))

Final Train Shape: (4209, 45)
Final Test Shape: (4209, 45)


In [1085]:
results = cross_val_score(xgbm, k_train, y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5735 (0.0729)


In [1086]:
results = cross_val_score(rf, k_train, y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5661 (0.0678)


In [1087]:
xgb_model = xgb.sklearn.XGBRegressor(max_depth=3, learning_rate=0.0045, subsample=0.921,
                                     objective='reg:linear', n_estimators=900, base_score=y_mean, min_child_weight=3
                                    , colsample_bytree=1)

results = cross_val_score(xgb_model, k_train, y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5742 (0.0719)


In [1088]:
xgbm3 = xgb.sklearn.XGBRegressor(max_depth=3, learning_rate=0.0045, subsample=0.94, base_score=y_mean,
                                objective='reg:linear', n_estimators=1100, min_child_weight=3)

results = cross_val_score(xgbm3, k_train, y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5746 (0.0720)
