In [1]:
from __future__ import division

# import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.svm import OneClassSVM
from sklearn.model_selection import GridSearchCV



In [2]:
df_train = pd.read_csv('new_train.csv')

In [3]:
print (Counter(df_train['TARGET']))

Counter({0: 57665, 1: 2355})


In [4]:
names = df_train.columns.values.tolist()[:-1]

In [5]:
df_test = pd.read_csv('test.csv', names=names)

In [6]:
# process base
def process_base(train, test):
    train.loc[(train['var38']>117310.979) & (train['var38']<117310.98), 'var38'] = -999.0
    test.loc[(test['var38']>117310.979) & (test['var38']<117310.98), 'var38'] = -999.0

    train.loc[train['var3']==-999999, 'var3'] = -999.0
    test.loc[test['var3']==-999999, 'var3'] = -999.0

    for f in ['imp_op_var40_comer_ult1', 'imp_op_var40_efect_ult3', 'imp_op_var41_comer_ult3', 'imp_sal_var16_ult1']:
        train.loc[train[f]==0.0, f] = -999.0
        test.loc[test[f]==0.0, f] = -999.0
        
    return train, test

In [7]:
df_train, df_test = process_base(df_train, df_test)

In [8]:
# remove constant columns
remove = []
for col in df_train.columns:
    if df_train[col].std() == 0:
        remove.append(col)

df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

In [9]:
# remove duplicated columns
remove = []
c = df_train.columns
for i in range(len(c)-1):
    v = df_train[c[i]].values
    for j in range(i+1,len(c)):
        if np.array_equal(v,df_train[c[j]].values):
            remove.append(c[j])
df_train.drop(remove, axis=1, inplace=True)
df_test.drop(remove, axis=1, inplace=True)

In [10]:
# insert sum zeros feature
flist = [x for x in df_train.columns if not x in ['ID','TARGET']]
df_train.insert(1, 'SumZeros', (df_train[flist] == 0).astype(int).sum(axis=1))
df_test.insert(1, 'SumZeros', (df_test[flist] == 0).astype(int).sum(axis=1))

In [11]:
def normalize_features(train, test):
    flist = [x for x in train.columns if not x in ['ID','TARGET']]
    for f in flist:
        if train[f].max() == 9999999999.0:
            fmax = train.loc[train[f]<9999999999.0, f].max()
            train.loc[train[f]==9999999999.0, f] = fmax + 1

        if len(train.loc[train[f]<0, f].value_counts()) == 1:
            train.loc[train[f]<0, f] = -1.0
            test.loc[test[f]<0, f] = -1.0
            fmax = max(np.max(train[f]), np.max(test[f]))
            if fmax > 0:
                train.loc[train[f]>0, f] = 1.0*train.loc[train[f]>0, f]/fmax
                test.loc[test[f]>0, f] = 1.0*test.loc[test[f]>0, f]/fmax

        if len(train.loc[train[f]<0, f]) == 0:
            fmax = max(np.max(train[f]), np.max(test[f]))
            if fmax > 0:
                train.loc[train[f]>0, f] = 1.0*train.loc[train[f]>0, f]/fmax
                test.loc[test[f]>0, f] = 1.0*test.loc[test[f]>0, f]/fmax

        if len(train.loc[train[f]<0, f].value_counts()) > 1:
            fmax = max(np.max(train[f]), np.max(test[f]))
            if fmax > 0:
                train[f] = 1.0*train[f]/fmax
                test[f] = 1.0*test[f]/fmax

    return train, test


In [12]:
df_train, df_test = normalize_features(df_train, df_test)

In [13]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
x_train_projected = pca.fit_transform(df_train.drop(['TARGET'], axis=1).values)
x_test_projected = pca.transform(df_test.values)
df_train.insert(1, 'PCAOne', x_train_projected[:, 0])
df_train.insert(1, 'PCATwo', x_train_projected[:, 1])
df_test.insert(1, 'PCAOne', x_test_projected[:, 0])
df_test.insert(1, 'PCATwo', x_test_projected[:, 1])

In [14]:
# from sklearn.manifold import TSNE
# from sklearn.decomposition import TruncatedSVD
# from sklearn.preprocessing import StandardScaler

# X = df_train.drop(['TARGET'], axis=1).append(df_test).values
# svd = TruncatedSVD(n_components=30)

# len_train = df_train.values.shape[0]

# X_svd = svd.fit_transform(X)
# X_scaled = StandardScaler().fit_transform(X_svd)
# modelTSNE = TSNE(n_components=2,random_state=0)
# feats_tsne = modelTSNE.fit_transform(X_scaled)
# feats_tsne_train = pd.DataFrame(feats_tsne[:len_train], columns=['tsne1', 'tsne2'])
# feats_tsne_test = pd.DataFrame(feats_tsne[len_train:], columns=['tsne1', 'tsne2'])
# # feats_tsne['ID'] = train[['ID']].append(test[['ID']], ignore_index=True)['ID'].values
# df_train = pd.merge(train, feats_tsne, on='ID', how='left')
# df_test = pd.merge(test, feats_tsne, on='ID', how='left')


# # tsne = TSNE(n_components=2)

In [13]:
y_train = df_train['TARGET'].values
X_train = df_train.drop(['TARGET'], axis=1).values

X_test = df_test.values

# length of dataset
len_train = len(X_train)
len_test  = len(X_test)

In [14]:
# classifier
clf = xgb.XGBClassifier(missing=-999.0, max_depth=5, min_child_weight=5,
                        n_estimators=500, learning_rate=0.02, gamma=1.0, 
                        nthread=-1, subsample=0.8, colsample_bytree=1.0, colsample_bylevel=0.7, seed=123089)

X_fit, X_eval, y_fit, y_eval= train_test_split(X_train, y_train, test_size=0.3)

In [17]:
# param_test1 = {
# #     'max_depth':range(1,10,1),
# #     'min_child_weight':[1,2],
#     'gamma':[0.0, 0.05, 0.25, 0.75],
# #     'subsample':[0.75, 0.95],
# #     'colsample_bytree':[0.75,0.95],
# #     'reg_alpha':[0, 0.01],
#     'learning_rate':[0.003, 0.01, 0.03, 0.1, 0.3]
# }

In [18]:
# gsearch1 = GridSearchCV(estimator=xgb.sklearn.XGBClassifier(missing=np.nan, max_depth=5,
#                         min_child_weight=1,objective= 'binary:logistic', gamma=0,
#                         scale_pos_weight=1, n_estimators=350, learning_rate=0.03, 
#                         nthread=-1, subsample=0.95, colsample_bytree=0.85, seed=4242), 
# param_grid = param_test1, scoring='roc_auc',iid=False, verbose=30, n_jobs=-1, cv=5)

In [19]:
# gsearch1.fit(X_train, y_train)
# # print 'cv_results', gsearch1.cv_results_
# print ('\n')
# print ('best parameters', gsearch1.best_params_ )
# print ('best_score', gsearch1.best_score_)

In [15]:
# fitting
clf.fit(X_train, y_train, early_stopping_rounds=20, eval_metric="auc", eval_set=[(X_eval, y_eval)])

[0]	validation_0-auc:0.807998
Will train until validation_0-auc hasn't improved in 20 rounds.
[1]	validation_0-auc:0.82185
[2]	validation_0-auc:0.82322
[3]	validation_0-auc:0.827017
[4]	validation_0-auc:0.829212
[5]	validation_0-auc:0.832035
[6]	validation_0-auc:0.832015
[7]	validation_0-auc:0.834061
[8]	validation_0-auc:0.832787
[9]	validation_0-auc:0.833811
[10]	validation_0-auc:0.835116
[11]	validation_0-auc:0.836065
[12]	validation_0-auc:0.836788
[13]	validation_0-auc:0.836137
[14]	validation_0-auc:0.83633
[15]	validation_0-auc:0.836607
[16]	validation_0-auc:0.836936
[17]	validation_0-auc:0.836495
[18]	validation_0-auc:0.836795
[19]	validation_0-auc:0.836624
[20]	validation_0-auc:0.836751
[21]	validation_0-auc:0.837105
[22]	validation_0-auc:0.837652
[23]	validation_0-auc:0.838193
[24]	validation_0-auc:0.83851
[25]	validation_0-auc:0.83916
[26]	validation_0-auc:0.839583
[27]	validation_0-auc:0.839763
[28]	validation_0-auc:0.839763
[29]	validation_0-auc:0.839692
[30]	validation_0-auc

[259]	validation_0-auc:0.863645
[260]	validation_0-auc:0.863746
[261]	validation_0-auc:0.863746
[262]	validation_0-auc:0.863907
[263]	validation_0-auc:0.864001
[264]	validation_0-auc:0.864101
[265]	validation_0-auc:0.864146
[266]	validation_0-auc:0.864214
[267]	validation_0-auc:0.864191
[268]	validation_0-auc:0.864357
[269]	validation_0-auc:0.864497
[270]	validation_0-auc:0.864611
[271]	validation_0-auc:0.864665
[272]	validation_0-auc:0.864763
[273]	validation_0-auc:0.864749
[274]	validation_0-auc:0.864831
[275]	validation_0-auc:0.865005
[276]	validation_0-auc:0.865083
[277]	validation_0-auc:0.865179
[278]	validation_0-auc:0.865254
[279]	validation_0-auc:0.865371
[280]	validation_0-auc:0.865484
[281]	validation_0-auc:0.865505
[282]	validation_0-auc:0.865563
[283]	validation_0-auc:0.865564
[284]	validation_0-auc:0.865705
[285]	validation_0-auc:0.865856
[286]	validation_0-auc:0.865885
[287]	validation_0-auc:0.865984
[288]	validation_0-auc:0.866041
[289]	validation_0-auc:0.866124
[290]	va

XGBClassifier(base_score=0.5, colsample_bylevel=0.7, colsample_bytree=1.0,
       gamma=1.0, learning_rate=0.02, max_delta_step=0, max_depth=5,
       min_child_weight=5, missing=-999.0, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=123089, silent=True, subsample=0.8)

In [16]:
print('Overall AUC:', roc_auc_score(y_train, clf.predict_proba(X_train)[:,1]))

('Overall AUC:', 0.87886365774350472)


In [17]:
# predicting
y_pred= clf.predict_proba(X_test)[:,1]

In [18]:
submission = pd.DataFrame(data=y_pred)
submission.to_csv("submission.csv", index=False, header=None)