In [None]:
import numpy as np
import pandas as pd
from pandas import *
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams

In [None]:
# Load the feature table and the labelled training IDs, then keep only the
# feature rows whose FileID appears in the training list (inner join).
# training-set.csv is read with explicit column names, i.e. it is treated as
# having no header row.
df = pd.read_csv('trend_v1_.csv', encoding='utf-8')
training_set = pd.read_csv(
    'training-set.csv',
    encoding='utf-8',
    names=['FileID', 'Label'],
)
df_training = df.merge(training_set, on='FileID', how='inner')
# TODO: reading the same file through an absolute Windows path did not work
# here; the cause has not been investigated:
# dataset = pd.read_csv(r'C:\Users\David\Desktop\trend_v1_.csv', encoding='utf-8')

In [None]:
# Imputation of variables
# Missing interval means are replaced by 90 days expressed in seconds and
# missing interval standard deviations by 0.
# NOTE(review): presumably 90 days is the length of the observation window and
# NaN means "too few queries to form an interval" -- confirm with the
# feature-extraction step.
total_time = 3600*24*90
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the in-place form operates on an intermediate Series, emits
# a FutureWarning on pandas 2.x and does not update df_training at all when
# Copy-on-Write is enabled.
df_training['QueryTsIntervalMean'] = df_training['QueryTsIntervalMean'].fillna(total_time)
df_training['QueryTsIntervalStd'] = df_training['QueryTsIntervalStd'].fillna(0)
# Persist the imputed training table, including the FileID and Label columns.
df_training.to_csv('trend_v2.csv', index=False)
# Column names of the imputed table (still includes 'FileID' and 'Label').
feature = list(df_training)
#df_training.info()
#df_training.head()

In [None]:
# Define X and y
# y is the target column; X is every remaining column except the FileID join key.
y = df_training['Label']
# Pass the axis by keyword: the positional form drop(labels, 1) was deprecated
# in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
X = df_training.drop(['FileID','Label'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import roc_auc_score, mean_absolute_error, r2_score
from xgboost import XGBClassifier
### http://xgboost.readthedocs.io/en/latest/python/python_api.html

In [None]:
# XGBoost Classifier
# Split the labelled data into a training part and a 33% validation part
# (fixed random_state so the split is reproducible).
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=7, test_size=0.33)
# The validation part is the set monitored for early stopping.
eval_set = [(X_val, y_val)]
model_xgb = XGBClassifier(max_depth=3, learning_rate=0.1, n_estimators=100, reg_alpha=0.1, reg_lambda=1)
# Stop adding trees once validation AUC has not improved for 10 rounds.
# NOTE(review): xgboost >= 2.0 expects early_stopping_rounds / eval_metric on
# the XGBClassifier constructor rather than on fit() -- confirm against the
# installed version before upgrading.
model_xgb.fit(X_train, y_train, early_stopping_rounds=10, eval_metric='auc', eval_set=eval_set, verbose=1)
# For auc, see https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
# model.predict will return the predicted label
y_pred = model_xgb.predict(X_val)
# Column 1 of predict_proba is the score used as the positive-class probability.
y_pred_proba = model_xgb.predict_proba(X_val)[:,1]
# NOTE(review): this AUC is measured on the same rows that drove early
# stopping, so it is likely somewhat optimistic.
roc_auc = metrics.roc_auc_score(y_val, y_pred_proba)
# roc_auc_score returns a fraction in [0, 1]; print it as such. The previous
# '%.2f%%' format appended a percent sign without scaling by 100.
print('roc_auc: %.4f' % roc_auc)

In [None]:
# Show the model statistics
# MAE and R^2 are computed between the validation labels and the predicted
# positive-class probabilities from the training cell.
mae = metrics.mean_absolute_error(y_val, y_pred_proba)
r2 = metrics.r2_score(y_val, y_pred_proba)
# ROC curve points, treating label 1 as the positive class.
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_pred_proba, pos_label=1, drop_intermediate=True)
# Neither metric is a percentage, so print the raw values; the previous
# '%.2f%%' format appended a misleading percent sign.
print('mae: %.4f' % mae)
print('r2: %.4f' % r2)
print('FPR:',fpr)
print('TPR:',tpr)
print('thresholds:',thresholds)
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve (validation set)')
plt.scatter(fpr,tpr)
plt.show()

In [None]:
# Import the plot_importance function to visualize the feature importance
from xgboost import plot_importance
print(model_xgb.feature_importances_)
# Size this figure locally and hand its axes to xgboost. Assigning to the
# global rcParams['figure.figsize'] would silently resize every figure drawn
# afterwards (including earlier cells when they are re-run).
fig, ax = plt.subplots(figsize=(6, 10))
# Show only the 30 highest-ranked features.
plot_importance(model_xgb, max_num_features=30, ax=ax)
plt.show()

In [None]:
# Plot tree
from xgboost import plot_tree
# Use a dedicated, very large figure for the tree diagram instead of changing
# the global rcParams['figure.figsize'], which would leak the 50x80 size into
# every later figure in the session.
fig, ax = plt.subplots(figsize=(50, 80))
# NOTE(review): num_trees=1 selects the tree at index 1 (the second boosted
# tree); confirm that index 0 was not intended.
plot_tree(model_xgb, num_trees=1, ax=ax)
plt.show()

In [None]:
# Create submission
# testing-set.csv is read with explicit column names (no header row); its
# Label column is dropped before prediction and is never used as a target.
testing_set = pd.read_csv('testing-set.csv', encoding='utf-8', 
                          names=['FileID','Label'])
# Feature rows whose FileID appears in the testing list.
dft = pd.merge(df, testing_set, on=['FileID'], how='inner')
# Apply the same imputation that was applied to the training table. dft is
# built from the raw `df`, so without this step the model (trained on filled
# values) would be scored on NaNs in these two columns.
dft['QueryTsIntervalMean'] = dft['QueryTsIntervalMean'].fillna(total_time)
dft['QueryTsIntervalStd'] = dft['QueryTsIntervalStd'].fillna(0)
# Axis passed by keyword: the positional form drop(labels, 1) raises TypeError
# from pandas 2.0 onwards.
dft2 = dft.drop(['FileID','Label'], axis=1)
# Positive-class probability for every test row.
y_predt = model_xgb.predict_proba(dft2)[:,1]
# Reuse dft's index so the column-wise concat below pairs each FileID with its
# own prediction regardless of how dft is indexed.
y_predt = pd.DataFrame(y_predt, columns=['Label'], index=dft.index)
submission = pd.concat([pd.DataFrame(dft['FileID']), y_predt], axis=1)
# Written without index or header: two bare columns, FileID and probability.
submission.to_csv('trend_submission.csv', index=False, header=False)
#submission['Label'].value_counts()

In [None]:
# Grid Search
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
from sklearn.model_selection import GridSearchCV
# 2 depths x 4 reg_alpha x 4 reg_lambda = 32 candidate settings, each scored
# by 5-fold cross-validated ROC AUC on the training split.
parameters = dict(
    max_depth=[3, 4],
    learning_rate=[0.1],
    n_estimators=[100],
    reg_alpha=[0, 0.1, 1, 10],
    reg_lambda=[0, 0.1, 1, 10],
)
model = XGBClassifier()
best_model = GridSearchCV(
    model,
    parameters,
    scoring='roc_auc',
    cv=5,
)
# The extra keyword arguments are handed to fit() together with the training
# data.
# NOTE(review): eval_set is the hold-out split created in the training cell, so
# early stopping inside every CV fit presumably monitors that same external
# set rather than the fold's own held-out part -- confirm this is intended.
best_model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric='auc',
    early_stopping_rounds=10,
    verbose=1,
)

In [None]:
# Show the grid-search results as a table, best-ranked parameter setting first,
# instead of dumping the raw dict of arrays held in cv_results_.
pd.DataFrame(best_model.cv_results_).sort_values('rank_test_score')