In [2]:
import pandas as pd
import numpy as np
np.random.seed(0)

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import xgboost as xgb

# Data Preprocessing

In [36]:
Data_set = pd.read_csv('challenge_data/Data_set.csv')
Test_set = pd.read_csv('challenge_data/Test_set.csv')

In [43]:
X = Data_set.iloc[:,2:]
y = Data_set.iloc[:,1]
X_test = Test_set.iloc[:,2:]
id_test = Test_set.iloc[:,0]
X = X.fillna(0)
X_test = X_test.fillna(0)

In [44]:
def str_to_int(train, test):
    train = train.fillna(0)
    test = test.fillna(0)
    idx = 0
    for item in train.unique():
        train.loc[train==item] = idx
        test.loc[test==item] = idx
        idx += 1
    return train, test

In [45]:
string_column = []
for i,col in enumerate(X.columns):
    if X[col].dtype != np.float64 and X[col].dtype != np.int64:
        string_column.append(i)
        X[col], X_test[col] = str_to_int(X[col], X_test[col])

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

In [19]:
print(Counter(y_train))
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_sample(X_train, y_train)
print(Counter(y_train))

Counter({0: 76747, 1: 3439})
Counter({0: 76747, 1: 76747})


In [20]:
#X_train, X_val, y_train, y_val = map(np.array, [X_train, X_val, y_train, y_val])

# Train model

In [21]:
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_val = xgb.DMatrix(X_val, label=y_val)
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 2

In [22]:
watchlist = [(xg_train, 'train'), (xg_test, 'test')]
num_round = 5
bst = xgb.train(param, xg_train, num_round, watchlist)
# get prediction
pred = bst.predict(xg_val)
error_rate = np.sum(pred != y_val) / y_val.shape[0]
print('Test error using softmax = {}'.format(error_rate))

# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist)
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
pred_prob = bst.predict(xg_val).reshape(y_val.shape[0], 2)
pred_label = np.argmax(pred_prob, axis=1)
error_rate = np.sum(pred != y_val) / y_val.shape[0]
print('Test error using softprob = {}'.format(error_rate))
print("f1_score", f1_score(pred_label, y_val, average=None))
print(confusion_matrix(pred_label, y_val))

[0]	train-merror:0.08773	test-merror:0.087893
[1]	train-merror:0.067312	test-merror:0.087295
[2]	train-merror:0.069416	test-merror:0.087445
[3]	train-merror:0.067729	test-merror:0.08949
[4]	train-merror:0.063559	test-merror:0.083753
Test error using softmax = 0
[0]	train-merror:0.08773	test-merror:0.087893
[1]	train-merror:0.067312	test-merror:0.087295
[2]	train-merror:0.069416	test-merror:0.087445
[3]	train-merror:0.067729	test-merror:0.08949
[4]	train-merror:0.063559	test-merror:0.083753
Test error using softprob = 0
('f1_score', array([ 0.95520875,  0.35645841]))
[[17903   383]
 [ 1296   465]]


# Evaluate test set

In [47]:
X_test = np.array(X_test)
xg_test = xgb.DMatrix(X_test)
pred_prob = bst.predict(xg_test).reshape(X_test.shape[0], 2)
pred_label = np.argmax(pred_prob, axis=1)

In [56]:
id_test = np.array(id_test).reshape(-1,1)
pred_label = np.array(pred_label).reshape(-1, 1)

In [60]:
answer = pd.DataFrame(np.concatenate([id_test, pred_label], axis=1))

In [65]:
answer

Unnamed: 0,0,1
0,100,0
1,188,0
2,269,0
3,323,1
4,397,0
5,453,0
6,513,0
7,678,1
8,724,0
9,728,1


In [66]:
answer.to_csv('challenge_data/Answer_sheet.csv', index=False, header=False)