In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

% matplotlib inline

import seaborn as sns

In [4]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [5]:
train.shape

(4209, 378)

In [6]:
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [7]:
train['y'].dtype

dtype('float64')

In [8]:
cols = [x for x in train.columns if 'X' in x]

In [9]:
train[cols].dtypes.value_counts()

int64     368
object      8
dtype: int64

In [10]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
import xgboost as xgb



In [11]:
y_train = train['y'].values

y_mean = np.mean(y_train)

In [149]:
rf = RandomForestRegressor(n_estimators=250, n_jobs=4, min_samples_split=25, min_samples_leaf=25, max_depth=3)
                           
et = ExtraTreesRegressor(n_estimators=100, n_jobs=4, min_samples_split=25, min_samples_leaf=35, max_features=150)

xgbm = xgb.sklearn.XGBRegressor(max_depth=4, learning_rate=0.005, subsample=0.9, base_score=y_mean,
                                objective='reg:linear', n_estimators=1000)

en = ElasticNet()

lreg = LinearRegression()

# Feature Selection

In [13]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

In [14]:
train_cols = cols[8:350]

X = train[train_cols]
y = y_train

In [15]:
log = {}

for col in train_cols:
    #print(col)
    results = cross_val_score(rf, X[[col]], y_train, cv=5, scoring='r2')
    log[col] = results.mean()
    print(" %s : %.4f (%.4f)" % (col, results.mean(), results.std()))

 X10 : -0.0082 (0.0117)
 X11 : -0.0082 (0.0117)
 X12 : -0.0010 (0.0139)
 X13 : -0.0095 (0.0126)
 X14 : 0.0311 (0.0220)
 X15 : -0.0084 (0.0119)
 X16 : -0.0083 (0.0120)
 X17 : -0.0082 (0.0115)
 X18 : -0.0084 (0.0119)
 X19 : 0.0204 (0.0149)
 X20 : 0.0016 (0.0112)
 X21 : -0.0084 (0.0119)
 X22 : -0.0001 (0.0152)
 X23 : -0.0026 (0.0158)
 X24 : -0.0085 (0.0121)
 X26 : -0.0083 (0.0119)
 X27 : -0.0066 (0.0121)
 X28 : 0.0571 (0.0162)
 X29 : 0.1388 (0.0163)
 X30 : -0.0082 (0.0115)
 X31 : -0.0017 (0.0129)
 X32 : -0.0084 (0.0119)
 X33 : -0.0083 (0.0116)
 X34 : -0.0084 (0.0120)
 X35 : -0.0015 (0.0127)
 X36 : -0.0082 (0.0117)
 X37 : -0.0018 (0.0129)
 X38 : -0.0085 (0.0120)
 X39 : -0.0083 (0.0119)
 X40 : -0.0084 (0.0118)
 X41 : -0.0085 (0.0114)
 X42 : -0.0084 (0.0118)
 X43 : 0.0315 (0.0151)
 X44 : -0.0053 (0.0117)
 X45 : -0.0061 (0.0118)
 X46 : 0.0108 (0.0163)
 X47 : 0.0041 (0.0102)
 X48 : 0.0292 (0.0135)
 X49 : -0.0089 (0.0129)
 X50 : -0.0063 (0.0104)
 X51 : 0.0459 (0.0165)
 X52 : 0.0262 (0.0093)
 X5

In [16]:
neg_cols = []
pos_cols = []

for col in log.keys():
    if log[col] > 0:
        pos_cols.append(col)
    else:
        neg_cols.append(col)

In [17]:
print("# Pos feats : ", len(pos_cols))
print("# Neg feats : ", len(neg_cols))

# Pos feats :  104
# Neg feats :  238


In [18]:
# LR to train on numeric features
log_lr = {}

for col in train_cols:
    #print(col)
    results = cross_val_score(lreg, train[[col]], y_train, cv=5, scoring='r2')
    log_lr[col] = results.mean()
    print(" %s : %.4f (%.4f)" % (col, results.mean(), results.std()))

 X10 : -0.0083 (0.0120)
 X11 : -0.0083 (0.0117)
 X12 : -0.0008 (0.0136)
 X13 : -0.0094 (0.0126)
 X14 : 0.0311 (0.0220)
 X15 : -0.0084 (0.0114)
 X16 : -0.0068 (0.0117)
 X17 : 0.0168 (0.0236)
 X18 : -0.0090 (0.0114)
 X19 : 0.0205 (0.0148)
 X20 : 0.0014 (0.0113)
 X21 : -0.0083 (0.0123)
 X22 : -0.0002 (0.0153)
 X23 : -0.0027 (0.0159)
 X24 : -0.0084 (0.0118)
 X26 : -0.0081 (0.0122)
 X27 : -0.0067 (0.0122)
 X28 : 0.0571 (0.0161)
 X29 : 0.1388 (0.0163)
 X30 : -0.0063 (0.0089)
 X31 : -0.0017 (0.0129)
 X32 : -0.0084 (0.0117)
 X33 : -0.0083 (0.0118)
 X34 : -0.0049 (0.0132)
 X35 : -0.0017 (0.0129)
 X36 : -0.0081 (0.0121)
 X37 : -0.0017 (0.0129)
 X38 : -0.0102 (0.0153)
 X39 : -0.0083 (0.0118)
 X40 : -0.0097 (0.0140)
 X41 : -0.0091 (0.0113)
 X42 : -0.0083 (0.0117)
 X43 : 0.0317 (0.0151)
 X44 : -0.0031 (0.0128)
 X45 : -0.0061 (0.0117)
 X46 : 0.0108 (0.0162)
 X47 : 0.0049 (0.0106)
 X48 : 0.0292 (0.0135)
 X49 : -0.0089 (0.0131)
 X50 : -0.0064 (0.0105)
 X51 : 0.0460 (0.0165)
 X52 : 0.0264 (0.0092)
 X53

In [19]:
lr_neg_cols = []
lr_pos_cols = []

for col in log_lr.keys():
    if log_lr[col] > 0:
        lr_pos_cols.append(col)
    else:
        lr_neg_cols.append(col)

In [22]:
print("# lr Pos feats : ", len(lr_pos_cols))
print("# lr Neg feats : ", len(lr_neg_cols))

# lr Pos feats :  110
# lr Neg feats :  232


In [23]:
from sklearn.preprocessing import LabelEncoder

for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values)) 
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

In [24]:
c_cols = cols[:8]

In [25]:
c_cols

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

In [26]:
# RF to train on Categorical features

log_rf = {}

for col in c_cols:
    #print(col)
    results = cross_val_score(rf, train[[col]], y_train, cv=5, scoring='r2')
    log_rf[col] = results.mean()
    print(" %s : %.4f (%.4f)" % (col, results.mean(), results.std()))

 X0 : 0.3368 (0.0276)
 X1 : 0.0134 (0.0177)
 X2 : 0.1522 (0.0425)
 X3 : 0.0326 (0.0261)
 X4 : -0.0083 (0.0117)
 X5 : -0.0132 (0.0083)
 X6 : -0.0026 (0.0106)
 X8 : 0.0036 (0.0098)


In [27]:
# LR to train on Categorical features
log_lr2 = {}

for col in c_cols:
    #print(col)
    results = cross_val_score(lreg, train[[col]], y_train, cv=5, scoring='r2')
    log_lr2[col] = results.mean()
    print(" %s : %.4f (%.4f)" % (col, results.mean(), results.std()))

 X0 : 0.0278 (0.0182)
 X1 : -0.0085 (0.0116)
 X2 : -0.0035 (0.0125)
 X3 : 0.0142 (0.0230)
 X4 : -0.0082 (0.0116)
 X5 : -0.0067 (0.0083)
 X6 : -0.0083 (0.0117)
 X8 : -0.0085 (0.0115)


# Train on Categorical Features

In [28]:
train_ccols = ['X0', 'X1', 'X2', 'X3', 'X8']

In [29]:
# random forest
results = cross_val_score(rf, train[train_ccols], y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.3204 (0.0417)


In [30]:
# Linear Regression
results = cross_val_score(lreg, train[train_ccols], y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.0540 (0.0179)


In [31]:
# ElasticNet
results = cross_val_score(en, train[train_ccols], y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.0521 (0.0128)


# Train on numeric features

In [32]:
# pos_cols
# neg_cols
# lr_pos_cols
# lr_neg_cols

In [173]:
print(pos_cols)

['X14', 'X19', 'X20', 'X28', 'X29', 'X43', 'X46', 'X47', 'X48', 'X51', 'X52', 'X54', 'X61', 'X66', 'X68', 'X71', 'X75', 'X76', 'X80', 'X84', 'X85', 'X96', 'X98', 'X101', 'X108', 'X111', 'X113', 'X115', 'X118', 'X119', 'X120', 'X126', 'X127', 'X128', 'X130', 'X132', 'X134', 'X136', 'X142', 'X147', 'X148', 'X150', 'X151', 'X155', 'X156', 'X157', 'X158', 'X159', 'X162', 'X166', 'X170', 'X171', 'X178', 'X179', 'X180', 'X185', 'X187', 'X189', 'X191', 'X198', 'X208', 'X209', 'X215', 'X222', 'X223', 'X224', 'X228', 'X229', 'X232', 'X234', 'X238', 'X241', 'X244', 'X250', 'X251', 'X255', 'X256', 'X261', 'X263', 'X264', 'X272', 'X273', 'X275', 'X276', 'X279', 'X286', 'X300', 'X301', 'X304', 'X306', 'X311', 'X313', 'X314', 'X315', 'X316', 'X328', 'X331', 'X343', 'X348', 'X349', 'X350', 'X352', 'X354', 'X355']


In [34]:
# random forest on pos_cols
results = cross_val_score(rf, train[pos_cols], y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5680 (0.0697)


In [172]:
# xgbm on pos_cols
results = cross_val_score(xgbm, train[pos_cols], y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5690 (0.0742)


In [None]:
sns.heamap()

In [36]:
# linear regression on pos_cols
results = cross_val_score(en, train[pos_cols], y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.3533 (0.0453)


In [37]:
# random forest on lr_pos_cols
results = cross_val_score(rf, train[lr_pos_cols], y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5672 (0.0692)


In [38]:
# random forest on neg_cols
results = cross_val_score(rf, train[neg_cols], y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.1254 (0.0239)


In [39]:
# see which features got positive r2_score on lr and neg in pos_cols
for c in lr_pos_cols:
    if c not in pos_cols:
        print(c, log_lr[c])

X17 0.0168124150441
X169 0.00265012391091
X221 0.00158562093859
X267 0.00021867700692
X274 0.0215419824317
X325 0.0103633784806


# Add PCA/ICA/SVD

In [167]:
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

In [168]:
len(train_cols)

342

In [169]:
#X_train = train[train_cols]
#X_test = test[train_cols]
X_train = train[pos_cols]
X_test = test[pos_cols]

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

X_train shape:  (4209, 104)
X_test shape:  (4209, 104)


In [170]:
##Add decomposed components: PCA / ICA etc.
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
n_comp = 12


# read datasets
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')


# process columns, apply LabelEncoder to categorical features
for c in train_df.columns:
    if train_df[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train_df[c].values) + list(test_df[c].values))
        train_df[c] = lbl.transform(list(train_df[c].values))
        test_df[c] = lbl.transform(list(test_df[c].values))

# shape
print('Shape train: {}\nShape test: {}'.format(train_df.shape, test_df.shape))


# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train_df)
tsvd_results_test = tsvd.transform(test_df)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train_df)
pca2_results_test = pca.transform(test_df)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train_df)
ica2_results_test = ica.transform(test_df)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_df)
grp_results_test = grp.transform(test_df)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(X_train)
srp_results_test = srp.transform(X_test)

Shape train: (4209, 378)
Shape test: (4209, 377)


ValueError: shapes (4209,377) and (378,12) not aligned: 377 (dim 1) != 378 (dim 0)

In [129]:
# Append decomposition components to datasets
for i in range(1, n_comp+1):
    X_train['pca_' + str(i)] = pca2_results_train[:,i-1]
    X_test['pca_' + str(i)] = pca2_results_test[:, i-1]

    X_train['ica_' + str(i)] = ica2_results_train[:,i-1]
    X_test['ica_' + str(i)] = ica2_results_test[:, i-1]

    X_train['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
    X_test['tsvd_' + str(i)] = tsvd_results_test[:, i-1]

    X_train['grp_' + str(i)] = grp_results_train[:,i-1]
    X_test['grp_' + str(i)] = grp_results_test[:, i-1]

    X_train['srp_' + str(i)] = srp_results_train[:,i-1]
    X_test['srp_' + str(i)] = srp_results_test[:, i-1]
    
# Append decomposition components to datasets
#for i in range(1, n_comp+6):
#    X_train['ica_' + str(i)] = ica2_results_train[:,i-1]
#    X_test['ica_' + str(i)] = ica2_results_test[:, i-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-

In [163]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=123, num_rounds=1500):
    """
        'n_trees': 520,
        'eta': 0.0045,
        'max_depth': 4,
        'subsample': 0.98,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'base_score': y_mean, # base prediction = mean(target)
        'silent': 1
    """ 
    param = {}
    #param['n_trees'] = 520
    param['objective'] = 'reg:linear'
    param['eta'] = 0.0045
    param['max_depth'] = 4
    param['silent'] = 1
    param['eval_metric'] = "rmse"
    param['min_child_weight'] = 3
    param['subsample'] = 0.98
    param['colsample_bytree'] = 1
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [164]:
X_train.shape

(4209, 169)

In [165]:
X_test.shape

(4209, 169)

In [166]:
from sklearn.metrics import r2_score
from sklearn import model_selection, preprocessing, ensemble
train_X = train[pos_cols].values
train_y = y_train
#test_X = X_test

cv_scores = []
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(r2_score(val_y, preds))
        print(cv_scores)
        break

[0]	train-rmse:100.589	test-rmse:100.232
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 20 rounds.
[1]	train-rmse:100.14	test-rmse:99.7835
[2]	train-rmse:99.6932	test-rmse:99.3368
[3]	train-rmse:99.2487	test-rmse:98.8922
[4]	train-rmse:98.8061	test-rmse:98.4495
[5]	train-rmse:98.3654	test-rmse:98.0087
[6]	train-rmse:97.9267	test-rmse:97.5699
[7]	train-rmse:97.4901	test-rmse:97.1333
[8]	train-rmse:97.0555	test-rmse:96.6986
[9]	train-rmse:96.6227	test-rmse:96.2657
[10]	train-rmse:96.1921	test-rmse:95.8351
[11]	train-rmse:95.7635	test-rmse:95.4063
[12]	train-rmse:95.3365	test-rmse:94.9793
[13]	train-rmse:94.9117	test-rmse:94.5544
[14]	train-rmse:94.4886	test-rmse:94.1312
[15]	train-rmse:94.0675	test-rmse:93.7099
[16]	train-rmse:93.6485	test-rmse:93.2909
[17]	train-rmse:93.2314	test-rmse:92.8736
[18]	train-rmse:92.8161	test-rmse:92.4582
[19]	train-rmse:92.4027	test-rmse:92.0447
[20]	train-rmse:91.9911	test

In [144]:
preds, model = runXGB(train_X, train_y, test[pos_cols].values, num_rounds=1500)

In [145]:
output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': preds})

output.to_csv('../output/feature_selection_v4.csv', index=False)

In [None]:
# GradientBoosting
gbm = GradientBoostingRegressor(learning_rate=0.04)

In [137]:
# random forest on pos_cols
results = cross_val_score(xgbm, X_train, y_train, cv=5, scoring='r2')
print("%.4f (%.4f)" % (results.mean(), results.std()))

0.5613 (0.0693)
