In [None]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training and test CSV files from the local data directory.
trainDF, testDF = [
    pd.read_csv(csv_path)
    for csv_path in ('data/Train_psolI3n.csv', 'data/Test_09JmpYa.csv')
]

In [None]:
# Share of each Email_Status value among the training rows.
status_column = trainDF['Email_Status']
status_column.value_counts() / status_column.size

In [None]:
# Set the label column and the test ids aside before they are removed below.
target = trainDF['Email_Status']
emailids = testDF['Email_ID']

# Remove the id/label columns and mark every row with the split it came from,
# so the two frames can be stacked and separated again later.
trainDF = trainDF.drop(['Email_ID', 'Email_Status'], axis=1).assign(tag='train')
testDF = testDF.drop('Email_ID', axis=1).assign(tag='test')

In [None]:
# Stack the train and test rows into one frame so the preprocessing below is
# applied to both; ignore_index=True renumbers the rows 0..n-1.
allDF = pd.concat([trainDF,testDF],ignore_index=True)

In [None]:
# Summary statistics of the combined frame (shown as the cell output).
allDF.describe()

In [None]:
# Column labels of the combined frame (inspection only).
allDF.columns

In [None]:
# Report every column that has at least one missing value, with its count.
missing_per_column = allDF.isnull().sum()
for column_name in allDF.columns:
    n_missing = missing_per_column[column_name]
    if n_missing > 0:
        print(column_name + ' - ' + str(n_missing))

In [None]:
# Fill the missing values, one column at a time.
# Each result is assigned back to its column: calling fillna(..., inplace=True)
# on the Series returned by allDF[col] is chained assignment, which pandas does
# not guarantee will write through to allDF (and never does under Copy-on-Write).
allDF['Customer_Location'] = allDF['Customer_Location'].fillna('unknown')
# The medians are computed over every row currently in allDF.
allDF['Total_Past_Communications'] = allDF['Total_Past_Communications'].fillna(
    allDF['Total_Past_Communications'].median())
allDF['Total_Links'] = allDF['Total_Links'].fillna(allDF['Total_Links'].median())
# A missing image count is replaced with 0.
allDF['Total_Images'] = allDF['Total_Images'].fillna(0)

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# The numeric columns are selected explicitly because allDF still holds string
# columns at this point and DataFrame.corr() no longer skips those silently
# in recent pandas releases.
sns.heatmap(allDF.select_dtypes(include=[np.number]).corr())

In [None]:
# Columns that the encoding cell turns into dummy (one-hot) columns and then
# removes; despite the name they are replaced, not simply discarded.
col_to_drop = ['Email_Type','Email_Source_Type','Customer_Location','Email_Campaign_Type','Time_Email_sent_Category']
# Column removed outright with no replacement.
# NOTE(review): the name suggests it was found to cause overfitting -- confirm.
drop_cols_overfitting = ['Total_Links']

In [None]:
# One-hot encode every column listed in col_to_drop (dummy columns are named
# "<column>_<value>"), append them to the frame, then remove the source
# columns together with the columns listed in drop_cols_overfitting.
dummy_frames = [pd.get_dummies(allDF[source_col], prefix=source_col)
                for source_col in col_to_drop]
allDF = pd.concat([allDF] + dummy_frames, axis=1)
allDF = allDF.drop(col_to_drop + drop_cols_overfitting, axis=1)

In [None]:
# Column labels of allDF at this point in the notebook (inspection only).
allDF.columns

In [None]:
# Separate the combined frame back into its two splits using the 'tag'
# marker, dropping the marker so that only feature columns remain.
tag_column = allDF['tag']
X_train = allDF.loc[tag_column == 'train'].drop('tag', axis=1)
X_test = allDF.loc[tag_column == 'test'].drop('tag', axis=1)
y_train = target

In [None]:
# (rows, columns) of the test feature matrix.
X_test.shape

In [None]:
def modelfit(alg, X_train, y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
             num_class=3):
    """Fit ``alg`` on the training data and print a training-set report.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first with early stopping
    and ``alg``'s ``n_estimators`` is overwritten with the number of rows in
    the CV result before the final fit.

    Parameters
    ----------
    alg : estimator
        Must expose ``get_xgb_params``, ``get_params``, ``set_params``, ``fit``
        and ``predict`` (an ``XGBClassifier`` in this notebook). It is modified
        in place: its parameters may change and it is fitted.
    X_train : pandas.DataFrame
        Feature matrix.
    y_train : pandas.Series
        Class labels. The confusion matrix is laid out for the labels
        ``0 .. num_class - 1``.
    useTrainCV : bool, default True
        Whether to run the cross-validation step described above.
    cv_folds : int, default 5
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``xgb.cv``.
    num_class : int, default 3
        Number of target classes.

    Returns
    -------
    None
        Prints the accuracy and the confusion matrix measured on the training
        data itself (so both are optimistic) and draws a bar chart of the
        booster's feature scores.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        # The class count is set explicitly for the multi-class problem.
        xgb_param['num_class'] = num_class
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='merror',
                          early_stopping_rounds=early_stopping_rounds, verbose_eval=1)
        # The row count of the CV result becomes the number of trees to fit.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data. eval_metric is deliberately not passed:
    # no eval_set is given, so it never influenced training, and recent
    # xgboost releases reject it as a fit() argument.
    alg.fit(X_train, y_train)

    # Predict training set:
    dtrain_predictions = alg.predict(X_train)

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_train.values, dtrain_predictions))
    print("Confusion matrix:")
    print(metrics.confusion_matrix(y_train.values, dtrain_predictions,
                                   labels=list(range(num_class))))

    # The wrapper's booster() accessor was replaced by get_booster(); use the
    # newer name when it exists and fall back to the older one otherwise.
    get_booster = getattr(alg, 'get_booster', None)
    booster = get_booster() if callable(get_booster) else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
# Baseline classifier. `g` holds a few shorthand settings; its 'mf' entry is
# not read when building the estimator below.
g = dict(ne=500, md=10, mf=60, rs=2016)
xgc = XGBClassifier(
    objective='multi:softmax',
    n_estimators=g['ne'],
    max_depth=g['md'],
    min_child_weight=4,
    learning_rate=0.01,
    subsample=0.9,
    colsample_bytree=0.85,
    seed=g['rs'],
)

In [None]:
#xgc.fit(X_train, y_train)

In [None]:
#cv_score = cross_validation.cross_val_score(xgc, X_train, y_train, cv=5, scoring='accuracy')
#cv_score.mean()

In [None]:
# Run modelfit on the training split with its default keyword arguments.
modelfit(xgc,X_train,y_train)

In [None]:
# Show the estimator's current parameter values.
xgc.get_params()

In [None]:
# Grid search over tree depth and minimum child weight
# (7 x 4 candidate combinations, 5-fold CV, scored by accuracy).
param_test1 = dict(
    max_depth=[6, 7, 8, 9, 10, 11, 12],
    min_child_weight=[3, 4, 5, 6],
)
gsearch1 = GridSearchCV(
    estimator=xgc,
    param_grid=param_test1,
    scoring='accuracy',
    cv=5,
    iid=False,
    n_jobs=4,
)
gsearch1.fit(X_train, y_train)
(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)

In [None]:
# Grid search over gamma (five candidates, 5-fold CV, scored by accuracy).
param_test3 = dict(gamma=[0.0, 0.1, 0.2, 0.3, 0.4])
gsearch1 = GridSearchCV(
    estimator=xgc,
    param_grid=param_test3,
    scoring='accuracy',
    cv=5,
    iid=False,
    n_jobs=4,
)
gsearch1.fit(X_train, y_train)
(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)

In [None]:
# Grid search over the row and column sampling ratios
# (4 x 6 candidate combinations, 5-fold CV, scored by accuracy).
param_test4 = dict(
    subsample=[0.55, 0.6, 0.65, 0.7],
    colsample_bytree=[0.7, 0.75, 0.8, 0.85, 0.9, 0.95],
)
gsearch1 = GridSearchCV(
    estimator=xgc,
    param_grid=param_test4,
    scoring='accuracy',
    cv=5,
    iid=False,
    n_jobs=4,
)
gsearch1.fit(X_train, y_train)
(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)

In [None]:
# Classifier used for the final fit and the submission.
# NOTE(review): these values presumably come from the grid searches above -- confirm.
xgc = XGBClassifier(
    objective='multi:softmax',
    n_estimators=500,
    max_depth=7,
    min_child_weight=6,
    learning_rate=0.01,
    subsample=0.6,
    colsample_bytree=0.8,
    seed=2016,
)

In [None]:
# Run modelfit on the training split with its default keyword arguments;
# this fits xgc, which is used for prediction afterwards.
modelfit(xgc,X_train,y_train)

In [None]:
# Predict a label for every row of the test feature matrix.
status = xgc.predict(X_test)

In [None]:
# Start from an empty frame; its columns are attached in a later cell.
submission = pd.DataFrame()

In [None]:
# Display the predicted labels.
status

In [None]:
# Attach the test ids and the predicted labels as the two submission columns.
submission = submission.assign(Email_ID=emailids, Email_Status=status)

In [None]:
# Write the submission to xgb1.csv, leaving out the row index.
submission.to_csv('xgb1.csv', index=False)