In [32]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

In [2]:
trainDF = pd.read_csv('data/Train_psolI3n.csv')
testDF = pd.read_csv('data/Test_09JmpYa.csv')

In [3]:
#Check the percentage of email status
trainDF['Email_Status'].value_counts() /trainDF['Email_Status'].size

0    0.803783
1    0.161500
2    0.034717
Name: Email_Status, dtype: float64

In [4]:
target = trainDF['Email_Status']
emailids = testDF['Email_ID']

#Do the tagging for train and test
trainDF['tag'] = 'train'
testDF['tag'] = 'test'

trainDF = trainDF.drop(['Email_ID','Email_Status'], axis=1)
testDF = testDF.drop('Email_ID',axis=1)

In [5]:
#Merge all the data for processing
allDF = pd.concat([trainDF,testDF],ignore_index=True)

In [6]:
allDF.describe()

Unnamed: 0,Email_Type,Subject_Hotness_Score,Email_Source_Type,Email_Campaign_Type,Total_Past_Communications,Time_Email_sent_Category,Word_Count,Total_Links,Total_Images
count,114331.0,114331.0,114331.0,114331.0,102816.0,114331.0,114331.0,110707.0,111561.0
mean,1.284096,1.095876,1.456928,2.270539,28.892225,1.997945,701.350885,10.41291,3.535178
std,0.450985,0.996247,0.498144,0.468056,12.528849,0.631809,271.875965,6.39044,5.592399
min,1.0,0.0,1.0,1.0,0.0,1.0,40.0,1.0,0.0
25%,1.0,0.2,1.0,2.0,20.0,2.0,521.0,6.0,0.0
50%,1.0,0.8,1.0,2.0,28.0,2.0,694.0,9.0,0.0
75%,2.0,1.8,2.0,3.0,38.0,2.0,880.0,14.0,5.0
max,2.0,5.0,2.0,3.0,67.0,3.0,1316.0,49.0,47.0


In [7]:
allDF.columns

Index(['Email_Type', 'Subject_Hotness_Score', 'Email_Source_Type',
       'Customer_Location', 'Email_Campaign_Type', 'Total_Past_Communications',
       'Time_Email_sent_Category', 'Word_Count', 'Total_Links', 'Total_Images',
       'tag'],
      dtype='object')

In [8]:
#Check all the null values
for col in allDF.columns:
    if(allDF[col].isnull().sum() > 0):
        print(col + ' - ' +str(allDF[col].isnull().sum()))

Customer_Location - 19438
Total_Past_Communications - 11515
Total_Links - 3624
Total_Images - 2770


In [9]:
#Do processing on each coloumn to fill the NA's
allDF['Customer_Location'].fillna('unknown', inplace=True)
allDF['Total_Past_Communications'].fillna(allDF['Total_Past_Communications'].median(), inplace=True)
allDF['Total_Links'].fillna(allDF['Total_Links'].median(), inplace=True)
allDF['Total_Images'].fillna(0, inplace=True)

In [10]:
allDF.dtypes

Email_Type                     int64
Subject_Hotness_Score        float64
Email_Source_Type              int64
Customer_Location             object
Email_Campaign_Type            int64
Total_Past_Communications    float64
Time_Email_sent_Category       int64
Word_Count                     int64
Total_Links                  float64
Total_Images                 float64
tag                           object
dtype: object

In [11]:
col_to_drop = ['Email_Type','Email_Source_Type','Customer_Location','Email_Campaign_Type','Time_Email_sent_Category']

In [12]:
#Convert customer locatation to categorial.
for col in col_to_drop:
    dummyvar = pd.get_dummies(allDF[col],prefix=col)
    allDF = pd.concat([allDF,dummyvar],axis=1)

allDF = allDF.drop(col_to_drop, axis=1)

In [13]:
#Split, train and test data
X_train = allDF[allDF['tag'] == 'train'].drop('tag',axis=1)
y_train = target
X_test = allDF[allDF['tag'] == 'test'].drop('tag',axis=1)

In [14]:
X_test.shape

(45978, 24)

In [38]:
rfc = RandomForestClassifier(n_estimators=100,
                             max_depth=5,
                             max_features=5,
                            max_leaf_nodes=2,
                            min_samples_leaf=2,
                            min_samples_split=2,
                            random_state=10)

In [39]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=5, max_leaf_nodes=2,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

In [50]:
param_test1 = {'n_estimators':list(range(50,221,20)),
              'max_depth':list(range(1,3,1)),
               'min_samples_split':list(range(50,200,25))}

In [51]:
gsearch1 = GridSearchCV(estimator = rfc, param_grid = param_test1, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch1.fit(X_train,y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_, gsearch1.best_estimator_

([mean: 0.80378, std: 0.00005, params: {'n_estimators': 50, 'min_samples_split': 50, 'max_depth': 1},
  mean: 0.80378, std: 0.00005, params: {'n_estimators': 70, 'min_samples_split': 50, 'max_depth': 1},
  mean: 0.80378, std: 0.00005, params: {'n_estimators': 90, 'min_samples_split': 50, 'max_depth': 1},
  mean: 0.80378, std: 0.00005, params: {'n_estimators': 110, 'min_samples_split': 50, 'max_depth': 1},
  mean: 0.80378, std: 0.00005, params: {'n_estimators': 130, 'min_samples_split': 50, 'max_depth': 1},
  mean: 0.80378, std: 0.00005, params: {'n_estimators': 150, 'min_samples_split': 50, 'max_depth': 1},
  mean: 0.80378, std: 0.00005, params: {'n_estimators': 170, 'min_samples_split': 50, 'max_depth': 1},
  mean: 0.80378, std: 0.00005, params: {'n_estimators': 190, 'min_samples_split': 50, 'max_depth': 1},
  mean: 0.80378, std: 0.00005, params: {'n_estimators': 210, 'min_samples_split': 50, 'max_depth': 1},
  mean: 0.80378, std: 0.00005, params: {'n_estimators': 50, 'min_samples_spl

In [40]:
rfc.feature_importances_

array([ 0.06,  0.3 ,  0.04,  0.04,  0.06,  0.03,  0.05,  0.03,  0.01,
        0.  ,  0.01,  0.  ,  0.  ,  0.01,  0.  ,  0.  ,  0.  ,  0.  ,
        0.06,  0.2 ,  0.1 ,  0.  ,  0.  ,  0.  ])

In [41]:
cv_score = cross_validation.cross_val_score(rfc, X_train, y_train, cv=5, scoring='accuracy')
cv_score.mean()

0.80378330436620649

0.80455867684754045

In [22]:
status = rfc.predict(X_test)

In [23]:
submission = pd.DataFrame()

In [24]:
status

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [25]:
submission['Email_ID'] = emailids
submission['Email_Status'] = status

In [26]:
submission.to_csv('rf1.csv', index=False)