In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import gc

In [11]:
def missingValCounter(df):
    count = (df == -1).astype(int).sum(axis=1)
    return count

In [13]:
print('loading files...')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
low_variance_col = ["ps_ind_11_bin", "ps_ind_13_bin", "ps_ind_12_bin", "ps_ind_18_bin", "ps_car_10_cat", 
                    "ps_car_11_cat"]
train = train.drop(col_to_drop, axis=1)
test = test.drop(col_to_drop, axis=1)
missValsTrain = missingValCounter(train)
missValsTest = missingValCounter(test)

train["ps_car_12"] = train["ps_car_12"].apply(lambda x: round(x**2, 4) * 10000)
train["ps_car_13"] = train["ps_car_13"].apply(lambda x: round(x**2 * 48400, 2))
train["ps_car_15"] = train["ps_car_15"].apply(lambda x: round(x**2))

test["ps_car_12"] = test["ps_car_12"].apply(lambda x: round(x**2, 4) * 10000)
test["ps_car_13"] = test["ps_car_13"].apply(lambda x: round(x**2 * 48400, 2))
test["ps_car_15"] = test["ps_car_15"].apply(lambda x: round(x**2))

missValsTrain = missValsTrain.to_frame("missingVals")
missValsTest = missValsTest.to_frame("missingVals")

train["missing_values"] = missValsTrain["missingVals"].values
test["missing_values"] = missValsTest["missingVals"].values

#train = train.drop(low_variance_col, axis=1)
#test = test.drop(low_variance_col, axis=1)

loading files...


In [14]:
cat_cols = train.columns[train.columns.str.endswith("_cat")]
cat_col_val = train[cat_cols]
cat_col_val = cat_col_val + 1

cat_col_test = test.columns[test.columns.str.endswith("_cat")] 
cat_col_val_test = test[cat_col_test]
cat_col_val_test = cat_col_val_test + 1

encTrain = OneHotEncoder()
#train.drop(cat_cols)
encTrain.fit(cat_col_val)
oneHotVal = encTrain.transform(cat_col_val).toarray()

encTest = OneHotEncoder()
#test.drop(cat_col_test)
encTest.fit(cat_col_val_test)
oneHotValTest = encTest.transform(cat_col_val_test).toarray()

test = test.drop(cat_col_test, axis=1)
train = train.drop(cat_cols, axis=1)

In [15]:
new_features_train = pd.DataFrame(oneHotVal)
new_features_train.rename(columns=lambda x: x+1, inplace=True)
new_features_train.rename(columns=lambda x: "feature_" + str(x), inplace=True)
train = pd.concat([train, new_features_train], axis=1)

new_features_test = pd.DataFrame(oneHotValTest)
new_features_test.rename(columns=lambda x: x+1, inplace=True)
new_features_test.rename(columns=lambda x: "feature_" + str(x), inplace=True)
test = pd.concat([test, new_features_test], axis=1)

In [16]:
lst = [i for i in range(1,len(new_features_train.columns))]
for i in lst:
    new_features_train[str(i)] = new_features_train["feature_" + str(i)].apply(lambda x: int(x))
for i in lst:
    new_features_test[str(i)] = new_features_test["feature_" + str(i)].apply(lambda x: int(x))

In [17]:
# all zeros
del train["feature_77"]
del test["feature_77"]

In [18]:
for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)    

print(train.shape, test.shape)

(595212, 209) (892816, 208)


In [19]:
def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_xgb(pred, y):
    y = y.get_label()
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(preds, dtrain):
    y = list(dtrain.get_label())
    score = gini(y, preds) / gini(y, y)
    return 'gini', score, True

In [20]:
X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values

In [30]:
# xgb
params = {'eta': 0.01, 'max_depth': 5, 'subsample': 0.8, 'colsample_bytree': 0.9, 
          'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': True}


sub=test['id'].to_frame()
sub['target']=0

nrounds=2000
kfold = 5

skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    
    sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
gc.collect()

# lgb
params = {'metric': 'auc', 'learning_rate' : 0.1, 'max_depth':10, 'max_bin':10,  'objective': 'binary', 
          'feature_fraction': 0.8,'bagging_fraction':0.9,'bagging_freq':10,  'min_data': 500}

skf = StratifiedKFold(n_splits=kfold, random_state=1)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' lgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_eval = X[train_index], X[test_index]
    y_train, y_eval = y[train_index], y[test_index]
    lgb_model = lgb.train(params, lgb.Dataset(X_train, label=y_train), nrounds, 
                  lgb.Dataset(X_eval, label=y_eval), verbose_eval=100, 
                  feval=gini_lgb, early_stopping_rounds=100)
    
    sub['target'] += lgb_model.predict(test[features].values, 
                        num_iteration=lgb_model.best_iteration) / (2*kfold)

log_model = LogisticRegression()
log_model.fit(train[features].values, y)
sub['target'] += log_model.predict_proba(test[features].values)[:,1] / (2*kfold)

sub.to_csv('sub10log.csv', index=False, float_format='%.5f') 
gc.collect()

 xgb kfold: 1  of  5 : 
[0]	train-gini:0.214602	valid-gini:0.214817
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.


KeyboardInterrupt: 

In [25]:
test[features].values.shape

(892816, 207)

In [29]:
sub.max()

id        1.488026e+06
target    1.174328e+00
dtype: float64

In [48]:
 xgb kfold: 1  of  5 : 
[0]	train-gini:0.199908	valid-gini:0.194267
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.260527	valid-gini:0.250359
[200]	train-gini:0.269496	valid-gini:0.253377
[300]	train-gini:0.282427	valid-gini:0.258978
[400]	train-gini:0.297443	valid-gini:0.266783
[500]	train-gini:0.310735	valid-gini:0.271728
[600]	train-gini:0.321808	valid-gini:0.275316
[700]	train-gini:0.331675	valid-gini:0.278089
[800]	train-gini:0.340027	valid-gini:0.279733
[900]	train-gini:0.347624	valid-gini:0.281073
[1000]	train-gini:0.354826	valid-gini:0.281934
[1100]	train-gini:0.361346	valid-gini:0.282563
[1200]	train-gini:0.367924	valid-gini:0.282746

SyntaxError: invalid syntax (<ipython-input-48-62ab42495b2c>, line 1)