In [2]:
import pandas as pd
import numpy as np
np.random.seed(0)

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import xgboost as xgb

# Data Preprocessing

In [3]:
# Read the training table and the test table from the challenge data folder.
Data_set, Test_set = (
    pd.read_csv(csv_path)
    for csv_path in ('challenge_data/Data_set.csv', 'challenge_data/Test_set.csv')
)

In [4]:
# Training table: column 1 is read as the target, columns 2+ as features.
# Test table: column 0 is read as the row id, columns 2+ as features.
# Missing feature values are replaced with 0 in both tables.
X = Data_set.iloc[:, 2:].fillna(0)
y = Data_set.iloc[:, 1]
X_test = Test_set.iloc[:, 2:].fillna(0)
id_test = Test_set.iloc[:, 0]

In [5]:
def str_to_int(train, test):
    """Label-encode one categorical column consistently for train and test.

    Missing values are first replaced with 0 (which then counts as a category
    of its own). Every distinct value of ``train`` gets an integer code in
    order of first appearance, and the same mapping is applied to ``test``.

    Parameters
    ----------
    train, test : pandas.Series
        The same column taken from the training and the test table.

    Returns
    -------
    (pandas.Series, pandas.Series)
        Integer-coded copies of ``train`` and ``test`` (dtype int64). Values
        of ``test`` that never occur in ``train`` are coded as -1.
    """
    train = train.fillna(0)
    test = test.fillna(0)
    # Build the mapping once from the ORIGINAL values. Re-assigning codes in
    # place while looping lets a later category whose raw value equals an
    # already-assigned code (e.g. the 0 fill value) swallow earlier categories.
    codes = {item: idx for idx, item in enumerate(train.unique())}
    train_encoded = train.map(codes).astype('int64')
    # Unseen test categories would otherwise stay as raw strings; flag them.
    test_encoded = test.map(codes).fillna(-1).astype('int64')
    return train_encoded, test_encoded

In [6]:
# Integer-encode every feature column that is not float64/int64 and remember
# the positional index of each column that was encoded.
string_column = []
for position, name in enumerate(X.columns):
    if X[name].dtype == np.float64 or X[name].dtype == np.int64:
        continue  # already numeric, nothing to encode
    string_column.append(position)
    X[name], X_test[name] = str_to_int(X[name], X_test[name])

In [7]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Class counts before and after oversampling the training split with SMOTE.
print(Counter(y_train))
sm = SMOTE(random_state=42)
# Current imbalanced-learn releases expose resampling as `fit_resample` and no
# longer ship the old `fit_sample` alias; fall back to it only when an old
# install lacks the new name.
resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_train, y_train = resample(X_train, y_train)
print(Counter(y_train))

Counter({0: 76747, 1: 3439})
Counter({0: 76747, 1: 76747})


In [11]:
# Convert the four split pieces to plain NumPy arrays.
X_train = np.array(X_train)
X_val = np.array(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

# Train model

In [43]:
### Load the data to do training on.
# The DMatrix is built from the full feature table X / target y, with a unit
# weight for every row.
data = np.array(X)
label = np.array(y)
weight = np.ones(len(y))
dtrain = xgb.DMatrix(data, label=label, weight=weight, missing=-999.0)

# Settings handed to xgboost, and the round count used by the CV cell.
param = {
    'max_depth': 6,
    'eta': 0.1,
    'silent': 1,
    'objective': 'binary:logitraw',
    'nthread': 4,
}
num_round = 120

In [44]:
print('running cross validation, with preprocessing function')
# Define the per-fold preprocessing function.
# It returns the preprocessed training data, test data and parameters,
# which makes it the place to do things such as weight rescaling.
# As an example, it is used here to set scale_pos_weight.
def fpreproc(dtrain, dtest, param):
    """Per-fold preprocessing hook passed to ``xgb.cv``.

    Sets ``param['scale_pos_weight']`` to the negative/positive label ratio of
    the fold's training matrix and rescales the instance weights of both
    matrices so that each one sums to the combined (train + test) weight.

    Parameters
    ----------
    dtrain, dtest : objects exposing ``get_label`` / ``get_weight`` /
        ``set_weight`` (the fold's training and test matrices).
    param : dict
        Booster parameters; updated in place and returned.

    Returns
    -------
    tuple
        ``(dtrain, dtest, param)``.

    Notes
    -----
    * If the training fold has no positive labels, ``scale_pos_weight`` is
      left untouched instead of being set to ``inf``.
    * A matrix whose weights are empty or sum to zero is left unscaled
      instead of raising ``ZeroDivisionError``.
    """
    label = dtrain.get_label()
    n_pos = np.sum(label == 1)
    if n_pos > 0:
        param['scale_pos_weight'] = float(np.sum(label == 0)) / float(n_pos)

    wtrain = dtrain.get_weight()
    wtest = dtest.get_weight()
    # np.sum is vectorised; the built-in sum() iterates element by element.
    train_total = float(np.sum(wtrain))
    test_total = float(np.sum(wtest))
    sum_weight = train_total + test_total
    if train_total > 0:
        dtrain.set_weight(wtrain * (sum_weight / train_total))
    if test_total > 0:
        dtest.set_weight(wtest * (sum_weight / test_total))
    return (dtrain, dtest, param)

running cross validation, with preprocessing function


In [None]:
# Run 5-fold cross validation.
# For every fold, dtrain, dtest and param are handed to fpreproc, and
# whatever fpreproc returns is what that fold's results are computed from.
xgb.cv(
    param,
    dtrain,
    num_round,
    nfold=5,
    seed=0,
    fpreproc=fpreproc,
)

# Evaluate test set

In [47]:
# `bst` is not created by any earlier cell (xgb.cv only reports fold metrics),
# so this cell used to raise NameError on a fresh kernel. Fit the final
# booster here from the `param` / `dtrain` / `num_round` defined in the
# training-setup cell, using the same neg/pos re-weighting the CV folds apply.
final_param = dict(param)
train_label = dtrain.get_label()
n_positive = np.sum(train_label == 1)
if n_positive > 0:
    final_param['scale_pos_weight'] = float(np.sum(train_label == 0)) / float(n_positive)
bst = xgb.train(final_param, dtrain, num_round)

X_test = np.array(X_test)
# Use the same missing-value sentinel as the training DMatrix.
xg_test = xgb.DMatrix(X_test, missing=-999.0)
# With the 'binary:logitraw' objective, predict() yields one raw score per row
# (before the logistic transform), so reshaping to (n, 2) is not valid.
# Convert the score to P(y=1) and build the two-column probability table.
pred_margin = bst.predict(xg_test)
prob_positive = 1.0 / (1.0 + np.exp(-pred_margin))
pred_prob = np.column_stack([1.0 - prob_positive, prob_positive])
pred_label = np.argmax(pred_prob, axis=1)

In [56]:
# Turn the ids and the predicted labels into single-column 2-D arrays.
id_test, pred_label = (
    np.array(values).reshape(-1, 1)
    for values in (id_test, pred_label)
)

In [60]:
# Place the id column and the predicted-label column side by side.
answer_values = np.concatenate((id_test, pred_label), axis=1)
answer = pd.DataFrame(answer_values)

In [65]:
# Preview only the first rows rather than dumping the whole prediction table.
answer.head(10)

Unnamed: 0,0,1
0,100,0
1,188,0
2,269,0
3,323,1
4,397,0
5,453,0
6,513,0
7,678,1
8,724,0
9,728,1


In [66]:
# Write the (id, predicted label) pairs as a bare CSV: no header row, no index column.
answer.to_csv(
    'challenge_data/Answer_sheet.csv',
    header=False,
    index=False,
)