In [90]:
import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from IPython.display import display # Allows the use of display() for DataFrames

import warnings
warnings.filterwarnings('ignore')

In [47]:
train = pd.read_csv('data/train.csv.zip')

test = pd.read_csv('data/test.csv.zip')

In [6]:
train.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [7]:
test.head()

Unnamed: 0,ID,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000137c73,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00021489f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0004d7953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00056a333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00056d8eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4992 entries, ID to 9fc776466
dtypes: float64(4991), object(1)
memory usage: 1.8+ GB


In [16]:
print("Total Train Features with missing Values = " + str(train.columns[train.isnull().sum() != 0].size))
if (test.columns[test.isnull().sum() != 0].size):
    print("Features with missing => {}".format(list(train.columns[train.isnull().sum() != 0])))
    train[train.columns[train.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)

Total Train Features with missing Values = 0


In [17]:
print("Total Test Features with missing Values = " + str(test.columns[test.isnull().sum() != 0].size))
if (test.columns[test.isnull().sum() != 0].size):
    print("Features with missing => {}".format(list(test.columns[test.isnull().sum() != 0])))
    test[test.columns[test.isnull().sum() != 0]].isnull().sum().sort_values(ascending = False)

Total Test Features with missing Values = 0


THere are too many features... let's delete some of them 

In [48]:
# for the numerical values, if the std of a column is == 0 then, simply drop them 
remove = []
for col in train.columns:
    if col != 'ID' and col != 'target':
        if train[col].std() == 0: 
            remove.append(col)
            
print("Removed `{}` Constant Columns\n".format(len(remove)))
print(remove)            

Removed `256` Constant Columns

['d5308d8bc', 'c330f1a67', 'eeac16933', '7df8788e8', '5b91580ee', '6f29fbbc7', '46dafc868', 'ae41a98b6', 'f416800e9', '6d07828ca', '7ac332a1d', '70ee7950a', '833b35a7c', '2f9969eab', '8b1372217', '68322788b', '2288ac1a6', 'dc7f76962', '467044c26', '39ebfbfd9', '9a5ff8c23', 'f6fac27c8', '664e2800e', 'ae28689a2', 'd87dcac58', '4065efbb6', 'f944d9d43', 'c2c4491d5', 'a4346e2e2', '1af366d4f', 'cfff5b7c8', 'da215e99e', '5acd26139', '9be9c6cef', '1210d0271', '21b0a54cb', 'da35e792b', '754c502dd', '0b346adbd', '0f196b049', 'b603ed95d', '2a50e001c', '1e81432e7', '10350ea43', '3c7c7e24c', '7585fce2a', '64d036163', 'f25d9935c', 'd98484125', '95c85e227', '9a5273600', '746cdb817', '6377a6293', '7d944fb0c', '87eb21c50', '5ea313a8c', '0987a65a1', '2fb7c2443', 'f5dde409b', '1ae50d4c3', '2b21cd7d8', '0db8a9272', '804d8b55b', '76f135fa6', '7d7182143', 'f88e61ae6', '378ed28e0', 'ca4ba131e', '1352ddae5', '2b601ad67', '6e42ff7c7', '22196a84c', '0e410eb3d', '992e6d1d3', '90a7

In [51]:
train.drop(remove, axis = 1, inplace = True)
test.drop(remove, axis = 1, inplace = True)

In [54]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4737 entries, ID to 9fc776466
dtypes: float64(1845), int64(2891), object(1)
memory usage: 161.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49342 entries, 0 to 49341
Columns: 4736 entries, ID to 9fc776466
dtypes: float64(4735), object(1)
memory usage: 1.7+ GB


checking for duplicated columns

In [55]:
def duplicate_columns(dat):
    groups = dat.columns.to_series().groupby(dat.dtypes).groups
    dups = []

    for t, v in groups.items():

        cs = dat[v].columns
        vs = dat[v]
        lcs = len(cs)

        for i in range(lcs):
            ia = vs.iloc[:,i].values
            for j in range(i+1, lcs):
                ja = vs.iloc[:,j].values
                if np.array_equal(ia, ja):
                    dups.append(cs[i])
                    break

    return dups

In [70]:
%%time 
remove = duplicate_columns(train)
#print(remove)

train.drop(remove, axis=1, inplace=True) 

# remove duplicate columns in the testing set
test.drop(remove, axis=1, inplace=True)

print("Removed `{}` Duplicate Columns\n".format(len(remove)))
print(remove)

Removed `5` Duplicate Columns

['34ceb0081', '8d57e2749', '168b3e5bc', 'a765da8bc', 'acc5b709d']
Wall time: 5min 38s


In [None]:
#train = train.loc[:, ~train.columns.duplicated()]
#test = test.loc[:, ~test.columns.duplicated()]
# df.T.drop_duplicates().T

Dropping Sparse Data

In [74]:
def drop_sparse(train, test):
    cols = [x for x in train.columns if not x in ['ID','target']]
    for i in cols:
        if len(np.unique(train[i]))<2:
            train.drop(i, axis=1, inplace=True)
            test.drop(i, axis=1, inplace=True)
    return train, test

In [75]:
%%time
train, test = drop_sparse(train, test)

Wall time: 593 ms


In [76]:
gc.collect()
print("Train set size: {}".format(train.shape))
print("Test set size: {}".format(test.shape))

Train set size: (4459, 4732)
Test set size: (49342, 4731)


Preparing for modelling!

In [95]:
X_train = train.drop(["ID", "target"], axis=1)
y_train = np.log1p(train["target"])

X_test = test.drop(["ID"], axis=1)

Kmean

In [None]:
flist = [x for x in X_train.columns if not x in ['ID','target']]

flist_kmeans = []
for ncl in range(2,11):
    cls = KMeans(n_clusters=ncl)
    cls.fit_predict(X_train[flist].values)
    X_train['kmeans_cluster_'+str(ncl)] = cls.predict(X_train[flist].values)
    X_test['kmeans_cluster_'+str(ncl)] = cls.predict(X_test[flist].values)
    flist_kmeans.append('kmeans_cluster_'+str(ncl))
print(flist_kmeans)

PCA

In [None]:
flist = [x for x in X_train.columns if not x in ['ID','target']]

n_components = 20
flist_pca = []
pca = PCA(n_components=n_components)
x_train_projected = pca.fit_transform(normalize(X_train[flist], axis=0))
x_test_projected = pca.transform(normalize(X_test[flist], axis=0))
for npca in range(0, n_components):
    X_train.insert(1, 'PCA_'+str(npca+1), x_train_projected[:, npca])
    X_test.insert(1, 'PCA_'+str(npca+1), x_test_projected[:, npca])
    flist_pca.append('PCA_'+str(npca+1))
print(flist_pca)

In [96]:
dev_X, val_X, dev_y, val_y = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

LIGHT GBM

In [97]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.004,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    #converting to lgbm train 
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    evals_result = {}
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=150, 
                      evals_result=evals_result)
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    return pred_test_y, model, evals_result

In [98]:
%%time

pred_test, model, evals_result = run_lgb(dev_X, dev_y, val_X, val_y, X_test)
print("LightGBM Training Completed...")

Training until validation scores don't improve for 100 rounds
[150]	training's rmse: 1.5082	valid_1's rmse: 1.53919
[300]	training's rmse: 1.34436	valid_1's rmse: 1.46593
[450]	training's rmse: 1.23324	valid_1's rmse: 1.43393
[600]	training's rmse: 1.14931	valid_1's rmse: 1.41848
[750]	training's rmse: 1.08371	valid_1's rmse: 1.41315
[900]	training's rmse: 1.03011	valid_1's rmse: 1.41131
Early stopping, best iteration is:
[934]	training's rmse: 1.01913	valid_1's rmse: 1.41118
LightGBM Training Completed...
Wall time: 1min 5s


In [None]:
# feature importance
print("Features Importance...")
gain = model.feature_importance('gain')
featureimp = pd.DataFrame({'feature':model.feature_name(), 
                   'split':model.feature_importance('split'), 
                   'gain':100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(featureimp[:50])

XGBOOST

In [99]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
   
    params = {'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.001,
          'max_depth': 10, 
          'subsample': 0.6, 
          'colsample_bytree': 0.6,
          'alpha':0.001,
          'random_state': 42, 
          'silent': True}
    
    tr_data = xgb.DMatrix(train_X, train_y)
    va_data = xgb.DMatrix(val_X, val_y)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 100, verbose_eval=100)
    
    dtest = xgb.DMatrix(test_X)
    xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb

In [100]:
%%time 

pred_test_xgb, model_xgb = run_xgb(dev_X, dev_y, val_X, val_y, X_test)
print("XGB Training Completed...")

[0]	train-rmse:14.08765	valid-rmse:14.07678
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[100]	train-rmse:12.76875	valid-rmse:12.75679
[200]	train-rmse:11.57719	valid-rmse:11.56388
[300]	train-rmse:10.50042	valid-rmse:10.48590
[400]	train-rmse:9.52826	valid-rmse:9.51335
[500]	train-rmse:8.65075	valid-rmse:8.63628
[600]	train-rmse:7.85852	valid-rmse:7.84507
[700]	train-rmse:7.14377	valid-rmse:7.13205
[800]	train-rmse:6.49865	valid-rmse:6.48965
[900]	train-rmse:5.91699	valid-rmse:5.91128
[1000]	train-rmse:5.39177	valid-rmse:5.39058
[1100]	train-rmse:4.91908	valid-rmse:4.92352
[1200]	train-rmse:4.49301	valid-rmse:4.50427
[1300]	train-rmse:4.10936	valid-rmse:4.12789
[1400]	train-rmse:3.76422	valid-rmse:3.79155
[1500]	train-rmse:3.45402	valid-rmse:3.49105
[1600]	train-rmse:3.17553	valid-rmse:3.22377
[1700]	train-rmse:2.92593	valid-rmse:2.98597
[1800]	train-rmse:2.70237	valid-rmse:2.77539
[19

Catboost

In [101]:
cb_model = CatBoostRegressor(iterations=500,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50,
                             od_wait=20)

In [103]:
%%time

cb_model.fit(dev_X, dev_y,
             eval_set=(val_X, val_y),
             use_best_model=True,
             verbose=50)



0:	learn: 1.7526762	test: 1.6875093	best: 1.6875093 (0)	total: 2.23s	remaining: 18m 34s
50:	learn: 1.4761562	test: 1.5161235	best: 1.5161235 (50)	total: 2m 28s	remaining: 21m 44s
100:	learn: 1.3754008	test: 1.4772765	best: 1.4772765 (100)	total: 5m	remaining: 19m 45s
150:	learn: 1.3164265	test: 1.4622808	best: 1.4620402 (148)	total: 8m	remaining: 18m 29s
200:	learn: 1.2485595	test: 1.4467180	best: 1.4467180 (200)	total: 10m 56s	remaining: 16m 15s
250:	learn: 1.1708898	test: 1.4352949	best: 1.4352949 (250)	total: 13m 27s	remaining: 13m 20s
300:	learn: 1.1167592	test: 1.4323582	best: 1.4322308 (299)	total: 15m 57s	remaining: 10m 32s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.431742338
bestIteration = 305

Shrink model to first 306 iterations.
Wall time: 17min 14s


<catboost.core.CatBoostRegressor at 0x1fc171a0c50>

Combine the results

In [106]:
sub = pd.read_csv('data/sample_submission.csv.zip')

sub_lgb = pd.DataFrame()
sub_lgb["target"] = pred_test

sub_xgb = pd.DataFrame()
sub_xgb["target"] = pred_test_xgb

sub_cat = pd.DataFrame()
sub_cat["target"] = pred_test_cat

sub["target"] = (sub_lgb["target"] * 0.5 + sub_xgb["target"] * 0.3 + sub_cat["target"] * 0.2)

NameError: name 'pred_test_cat' is not defined

https://www.kaggle.com/samratp/aggregates-sumvalues-sumzeros-k-means-pca