In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw train/test CSVs (paths are relative to the notebook's working directory).
trainDF = pd.read_csv('data/Train_psolI3n.csv')
testDF = pd.read_csv('data/Test_09JmpYa.csv')

# Set aside the training labels and the test-set identifiers before those columns are dropped.
# The identifier column is 'Email_ID' -- the same column that is dropped from both frames
# below -- so it must be read under that name (a lookup of 'id' would raise KeyError).
target = trainDF['Email_Status']
emailids = testDF['Email_ID']

# Tag every row with its origin so the two sets can be separated again after being concatenated.
trainDF['tag'] = 'train'
testDF['tag'] = 'test'

# Remove the identifier (and, for train, the label) so only feature columns plus 'tag' remain.
trainDF = trainDF.drop(['Email_ID', 'Email_Status'], axis=1)
testDF = testDF.drop('Email_ID', axis=1)

In [None]:
# Stack the train rows followed by the test rows into one frame with a fresh 0..N-1 index,
# so any feature processing can be applied to both sets in a single pass.
allDF = pd.concat(objs=[trainDF, testDF], axis=0, ignore_index=True)

In [None]:
# TODO: per-column processing to fill the NAs is not implemented yet.

# Check the pairwise correlation between the features.
# Only numeric/bool columns are passed to corr(): the frame also carries the string 'tag'
# column, and recent pandas versions raise on non-numeric columns instead of silently
# skipping them. select_dtypes keeps this working on both old and new pandas.
sns.heatmap(allDF.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# Undo the concat: pick each set back out by its 'tag' marker, then discard the marker column.
X_train = allDF.loc[allDF['tag'].eq('train')].drop('tag', axis=1)
X_test = allDF.loc[allDF['tag'].eq('test')].drop('tag', axis=1)

# The labels are the 'Email_Status' column captured when the training CSV was loaded.
y_train = target

In [None]:
def modelfit(alg, X_train, y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=50, num_class=3):
    """Optionally tune the number of boosting rounds by CV, fit `alg`, and report training-set results.

    Parameters
    ----------
    alg : XGBClassifier-like estimator
        Mutated in place: `n_estimators` may be reset from the CV result, and the
        estimator is fitted on the full training data.
    X_train : pandas.DataFrame
        Feature matrix (`.values` is used to build the DMatrix for CV).
    y_train : pandas.Series
        Class labels (`.values` is used). Assumed to be integers in
        0..num_class-1 -- confirm against the data.
    useTrainCV : bool
        When True, run `xgb.cv` with early stopping and set `n_estimators` to the
        number of rows in the returned result.
    cv_folds : int
        Number of folds passed to `xgb.cv`.
    early_stopping_rounds : int
        Early-stopping patience passed to `xgb.cv`.
    num_class : int
        Number of target classes; used for the `num_class` booster parameter and
        for the confusion-matrix label order. Defaults to 3.

    Returns
    -------
    None. Prints accuracy and a confusion matrix computed on the *training* data
    (not a held-out score) and draws a feature-importance bar chart.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        # The native multi-class objectives need the class count set explicitly.
        xgb_param['num_class'] = num_class
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='merror', early_stopping_rounds=early_stopping_rounds, verbose_eval=1)
        # One row per boosting round in the CV result -> use that as the round count.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # No eval_metric is passed: without an eval_set nothing is evaluated during fit, and
    # recent xgboost releases no longer accept eval_metric as a fit() argument.
    alg.fit(X_train, y_train)

    # Predict on the training set (in-sample predictions).
    dtrain_predictions = alg.predict(X_train)

    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_train.values, dtrain_predictions))
    print("Confusion matrix:")
    print(metrics.confusion_matrix(y_train.values, dtrain_predictions, labels=list(range(num_class))))

    # Newer xgboost exposes the trained Booster via get_booster(); the older booster()
    # method is only used when get_booster() is not available.
    booster = alg.get_booster() if hasattr(alg, 'get_booster') else alg.booster()
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    if feat_imp.empty:
        # Plotting an empty Series raises, so skip the chart when there are no scores.
        print("No feature importances to plot")
        return
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
# Shorthand for the tunable settings: ne -> n_estimators, md -> max_depth, rs -> seed.
# ('mf' is kept in the dict but is not consumed by anything in this cell.)
g = dict(ne=500, md=10, mf=60, rs=2016)

xgc = XGBClassifier(
    objective='multi:softmax',
    n_estimators=g['ne'],
    max_depth=g['md'],
    min_child_weight=4,
    learning_rate=0.01,
    subsample=0.9,
    colsample_bytree=0.85,
    seed=g['rs'],
)

# Run the fit/report helper with its default settings.
modelfit(xgc, X_train, y_train)