In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD,NMF
from sklearn.metrics import r2_score
from sklearn.manifold import TSNE, MDS,Isomap



class StackingEstimator(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))

# train = train.T.drop_duplicates().T
# test = test.T.drop_duplicates().T


n_comp = 20

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# NMF
nmf = NMF(n_components=n_comp, init=None, solver='cd', tol=0.0001, max_iter=200, random_state=420, alpha=0.0, 
          l1_ratio=0.0, verbose=0, shuffle=False, nls_max_iter=2000, sparseness=None, beta=1, eta=0.1)
nmf_results_train = nmf.fit_transform(train.drop(["y"], axis=1))
nmf_results_test = nmf.transform(test)

# NMF
tsne = TSNE(n_components=5,  random_state=420)
tsne_results_train = tsne.fit_transform(train.drop(["y"], axis=1))
tsne_results_test = tsne.fit_transform(test)

#MDS
mds = MDS(n_components=5,  random_state=420)
mds_results_train = mds.fit_transform(train.drop(["y"], axis=1))
mds_results_test = mds.fit_transform(test)

ism = Isomap(n_components=n_comp)
ism_results_train = ism.fit_transform(train.drop(["y"], axis=1))
ism_results_test = ism.transform(test)


#save columns list before adding the decomposition components

usable_columns = list(set(train.columns) - set(['y']))

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

    train['nmf_' + str(i)] = nmf_results_train[:, i - 1]
    test['nmf_' + str(i)] = nmf_results_test[:, i - 1]

    train['ism_' + str(i)] = ism_results_train[:, i - 1]
    test['ism_' + str(i)] = ism_results_test[:, i - 1]

#     train['mds_' + str(i)] = mds_results_train[:, i - 1]
#     test['mds_' + str(i)] = mds_results_test[:, i - 1]

#     train['mds_' + str(i)] = mds_results_train[:, i - 1]
#     test['mds_' + str(i)] = mds_results_test[:, i - 1]

#usable_columns = list(set(train.columns) - set(['y']))



In [7]:
y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values
#finaltrainset and finaltestset are data to be used only the stacked model (does not contain PCA, SVD... arrays) 
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values

x_train = np.array(train.drop('y', axis=1))
x_test = np.array(test)
'''Train the xgb model then predict the test data'''
print x_train.shape
print x_test.shape
xgb_params = {
#     'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'colsample_by_tree' : 0.65,
    'objective': 'reg:linear',
#     'gamma' : np.log(2),
#     'min_child_weight' : np.log(10),
#     'reg_alpha' : np.log(2),
#     'eval_metric': 'rmse',
#     'base_score': y_mean, # base prediction = mean(target)
#     'silent': 1,
    'seed' : 10,
}

from sklearn import model_selection
from sklearn.metrics import log_loss,mean_squared_error,r2_score
from sklearn.model_selection import KFold

def r2_eval(y, y0):
    y0=y0.get_label()    
    assert len(y) == len(y0)
    return 'error',-(r2_score((y0),(y)))

# NOTE: Make sure that the class is labeled 'class' in the data file

dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

cv_output = xgb.cv(dict(xgb_params), dtrain, num_boost_round=5000,nfold = 10, feval = r2_eval,
                   early_stopping_rounds=25,verbose_eval=100, show_stdv=True)
print('best num_boost_rounds = ', len(cv_output))
num_boost_rounds = len(cv_output) 
# num_boost_rounds = 1250
# # train model
# model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
# y_pred = model.predict(dtest)
# [1600]	train-error:-0.65493+0.00891481	train-rmse:0.0581227+0.000641751	test-error:-0.572537+0.0903409	test-rmse:0.0648515+0.00689838

(4209, 517)
(4209, 517)
[0]	train-error:67.337+1.00852	train-rmse:4.13627+0.000710392	test-error:68.5428+7.70445	test-rmse:4.13626+0.00641949
[100]	train-error:59.9489+0.896892	train-rmse:2.6363+0.000452564	test-error:61.021+6.85127	test-rmse:2.63631+0.00597248
[200]	train-error:46.0227+0.687006	train-rmse:1.68094+0.000293649	test-error:46.843+5.24342	test-rmse:1.68094+0.00521488
[300]	train-error:29.9328+0.445221	train-rmse:1.07269+0.000198254	test-error:30.4628+3.38545	test-rmse:1.07267+0.00489446
[400]	train-error:16.7851+0.248649	train-rmse:0.685869+0.000161564	test-error:17.0804+1.8777	test-rmse:0.685887+0.00491839
[500]	train-error:8.31567+0.121778	train-rmse:0.440581+0.000169186	test-error:8.4611+0.920751	test-rmse:0.440608+0.00510403
[600]	train-error:3.65581+0.0526242	train-rmse:0.286052+0.000218876	test-error:3.7224+0.411158	test-rmse:0.286156+0.00543302
[700]	train-error:1.34116+0.0195735	train-rmse:0.190059+0.000318611	test-error:1.3743+0.179001	test-rmse:0.190399+0.0059283

In [3]:
y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values
#finaltrainset and finaltestset are data to be used only the stacked model (does not contain PCA, SVD... arrays) 
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values

x_train = np.array(train.drop('y', axis=1))
x_test = np.array(test)
'''Train the xgb model then predict the test data'''
print x_train.shape
print x_test.shape
# xgb_params = {
# #     'n_trees': 520, 
#     'eta': 0.005,
#     'max_depth': 6,
#     'subsample': 0.95,
#     'colsample_by_tree'
#     'objective': 'reg:linear',
#     'eval_metric': 'rmse',
#     'base_score': y_mean, # base prediction = mean(target)
#     'silent': 1,
# #     'seed' : 2016,
# }
from sklearn import model_selection
from sklearn.metrics import log_loss,mean_squared_error,r2_score
from sklearn.model_selection import KFold

def r2_eval(y, y0):
    y0=y0.get_label()    
    assert len(y) == len(y0)
    return 'error',-(r2_score(((y0)),(y)))


# Set our parameters for xgboost
params = {}
params['objective'] = 'reg:linear'
# params['n_trees'] = 520
# params['eval_metric'] = 'rmse'
params['eta'] = 0.0045
params['max_depth'] = 4
params['seed'] = 420
# params['gamma'] = 2
params['subsample'] = 0.93
params['colsample_bytree'] = 0.4
# params['base_score'] = y_mean
# params['min_child_weight'] = 10
# params['reg_alpha'] = 2
# params['reg_lambda'] = 2
params['n_jobs'] = 32

# NOTE: Make sure that the class is labeled 'class' in the data file

# dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
# dtest = xgb.DMatrix(test)

# num_boost_rounds = 1250
# # train model
# model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
# y_pred = model.predict(dtest)


# y_train = np.array(y_train)

train_stacker=[ [0.0 for s in range(1)]  for k in range (0,(x_train.shape[0])) ]

cv_scores = []
oof_preds = []
a = [0 for x in range(0,x_test.shape[0])]
# StratifiedKFold
# kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=2016)
# for dev_index, val_index in kf.split(range(x_train.shape[0]),y_train):
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=420)
for dev_index, val_index in kf.split(range(x_train.shape[0])):
        dev_X, val_X = x_train[dev_index,:], x_train[val_index,:]
        dev_y, val_y = y_train[dev_index], y_train[val_index]

        d_train = xgb.DMatrix(dev_X, label=dev_y)
        d_valid = xgb.DMatrix(val_X, label=val_y)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        bst = xgb.train(params, d_train, 5000, watchlist, feval = r2_eval, early_stopping_rounds=50, verbose_eval=100)
        # ntree_limit=model.best_ntree_limit
        preds = bst.predict(d_valid, ntree_limit=bst.best_ntree_limit)

        cv_scores.append((r2_score(val_y  , preds)))

        print(cv_scores)
#         break
        
        d_test = xgb.DMatrix(x_test)
        preds_tr = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)
        
        a = np.column_stack((a,preds_tr))

        no=0
        for real_index in val_index:
            for d in range (0,1):
                train_stacker[real_index][d]=(preds[no])
            no+=1

# [0.6083777650601262, 0.54277680892907387, 0.61289682196068918, 0.47706217814929963, 0.61092343331253285]

# [0.63310415552712973, 0.59781091047231094, 0.62987554580494609, 0.55566451968699759, 0.42448092020632788]

(4209, 517)
(4209, 517)
[0]	train-rmse:100.539	valid-rmse:100.433	train-error:60.8453	valid-error:66.2964
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 50 rounds.
[100]	train-rmse:64.4495	valid-rmse:64.2335	train-error:24.4145	valid-error:26.5275
[200]	train-rmse:41.6259	valid-rmse:41.2975	train-error:9.60153	valid-error:10.3786
[300]	train-rmse:27.3388	valid-rmse:26.9009	train-error:3.57298	valid-error:3.82812
[400]	train-rmse:18.5698	valid-rmse:18.0297	train-error:1.10986	valid-error:1.1688
[500]	train-rmse:13.3804	valid-rmse:12.7877	train-error:0.095422	valid-error:0.090999
[600]	train-rmse:10.4812	valid-rmse:9.9031	train-error:-0.327852	valid-error:-0.345687
[700]	train-rmse:8.9774	valid-rmse:8.46419	train-error:-0.506891	valid-error:-0.522015
[800]	train-rmse:8.21133	valid-rmse:7.8192	train-error:-0.587458	valid-error:-0.592087
[900]	train-rmse:7.80886	valid-rmse:7.55221	train-error:-0.626907

In [15]:
ts = np.exp(train_stacker) - 1

In [4]:
b = pd.DataFrame(a)

b['sum'] = b.sum(axis = 1)/5

np.savetxt("xg_1_train.gz",train_stacker, delimiter=",", fmt='%.6f')

np.savetxt("xg_1_test.gz", b['sum'], delimiter=",", fmt='%.6f')

In [13]:
b['sum']

0        84.956982
1        96.713034
2        84.396162
3        84.077484
4       115.538089
5        93.197830
6       113.588580
7        95.214503
8       119.768359
9        97.151657
10      118.995477
11      106.476736
12       99.437073
13       95.174823
14      105.545950
15      101.190536
16      117.877576
17       99.337416
18       95.250378
19       94.297211
20       96.819792
21       95.831041
22       93.066489
23       96.526428
24       93.223253
25      116.718222
26      103.982430
27      104.472185
28       93.900458
29       84.755946
           ...    
4179    109.617030
4180    101.696504
4181     92.450688
4182     92.347440
4183    101.663240
4184    110.027531
4185     92.144193
4186     94.257620
4187    111.145135
4188    109.264182
4189     91.539748
4190    109.825067
4191     90.967030
4192    101.271576
4193     92.287230
4194    111.092201
4195     92.744778
4196    102.462994
4197    101.287067
4198    110.733488
4199     91.757121
4200     90.

In [63]:
b = pd.DataFrame(a)
b['sum'] = b.sum(axis = 1)/5

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = b['sum']
sub.to_csv('single_xgb6.csv', index=False)

In [None]:
y1 = [0 for x in range(x_train.shape[0])]
y2 = [1 for x in range(x_test.shape[0])]

yn = np.column_stack((y1,y2))
print yn.shape

y_train = np.column_stack((np.log(train['y'].values + 1)), np.log(b['sum'] + 1))
y_mean = np.mean(y_train)
id_test = test['ID'].values
#finaltrainset and finaltestset are data to be used only the stacked model (does not contain PCA, SVD... arrays) 
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values

x_train = np.array(train.drop('y', axis=1))
x_test = np.array(test)
X_train = np.column_stack((x_train,x_test))

'''Train the xgb model then predict the test data'''
print X_train.shape
print x_test.shape

train_stacker=[ [0.0 for s in range(1)]  for k in range (0,(X_train.shape[0])) ]

cv_scores = []
oof_preds = []
a = [0 for x in range(0,x_test.shape[0])]
# StratifiedKFold
# kf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=RS)
# for dev_index, val_index in kf.split(range(x_train.shape[0]),y_train):
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=420)
for dev_index, val_index in kf.split(range(X_train.shape[0])):
        dev_X, val_X = X_train[dev_index,:], X_train[val_index,:]
        dev_y, val_y, dev_yn, val_yn = y_train[dev_index], y_train[val_index], yn[dev_index], yn[val_index] 
        print dev_X.shape
        print val_X.shape
        
        pos_train = dev_X[dev_yn == 1]
        neg_train = dev_X[dev_yn == 0]

        print("Oversampling started for proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))
        p = 0.25
#         scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
#         while scale > 1:
#             neg_train = np.concatenate((neg_train, neg_train))
#             scale -=1
        pos_train = pos_train[:int(scale * len(pos_train))]
        print("Oversampling done, new proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))

        Xd = np.concatenate((pos_train, neg_train))
        yd = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
        del pos_train, neg_train  

        pos_train = val_X[val_y == 1]
        neg_train = val_X[val_y == 0]

        print("Oversampling started for proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))
        p = 0.165
        scale = ((len(pos_train) / (len(pos_train) + len(neg_train))) / p) - 1
        while scale > 1:
            neg_train = np.concatenate((neg_train, neg_train))
            scale -=1
        neg_train = np.concatenate((neg_train, neg_train[:int(scale * len(neg_train))]))
        print("Oversampling done, new proportion: {}".format(len(pos_train) / (len(pos_train) + len(neg_train))))

        Xv = np.concatenate((pos_train, neg_train))
        yv = (np.zeros(len(pos_train)) + 1).tolist() + np.zeros(len(neg_train)).tolist()
        del pos_train, neg_train  

        print dev_X.shape
        print val_X.shape

        d_train = xgb.DMatrix(Xd, label=yd)
        d_valid = xgb.DMatrix(Xv, label=yv)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]

        bst = xgb.train(params, d_train, 5000, watchlist, early_stopping_rounds=25, verbose_eval=100)
        # ntree_limit=model.best_ntree_limit
        preds = bst.predict(d_valid, ntree_limit=bst.best_ntree_limit)
        cv_scores.append(log_loss(yv, preds))
        print(cv_scores)
#         break
        
        d_test = xgb.DMatrix(x_test)
        preds_tr = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)

        a = np.column_stack((a,preds_tr))

        d_valorg = xgb.DMatrix(val_X, label=val_y)
        predsorg = bst.predict(d_valorg, ntree_limit=bst.best_ntree_limit)

#         predictions = preds.reshape(-1,1)
        no=0
        for real_index in val_index:
            for d in range (0,1):
                train_stacker[real_index][d]=(predsorg[no])
            no+=1


In [None]:
'''Train the stacked models then predict the test data'''

stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
    LassoLarsCV()

)


stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)

'''R2 Score on the entire Train data when averaging'''

print('R2 score on train data:')
print(r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145))

'''Average the preditionon test data  of both models then save it on a csv file'''

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models3.csv', index=False)

In [6]:
print('R2 score on train data:')
print(r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.25 + model.predict(dtrain)*0.75))

'''Average the preditionon test data  of both models then save it on a csv file'''

# sub = pd.DataFrame()
# sub['ID'] = id_test
# sub['y'] = y_pred*0.71455 + results*0.2855
# sub.to_csv('stacked-models3.csv', index=False)

R2 score on train data:
0.670214876937


'Average the preditionon test data  of both models then save it on a csv file'