<H3>Load Libraries</H3>

In [1]:
import os

mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']

import numpy as np
import scipy as sp
import matplotlib as mpl
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

<H3>Load dataset</H3>

In [2]:
train = pd.read_csv('Train_pjb2QcD.csv')
test = pd.read_csv('Test_wyCirpO.csv')

In [3]:
print train.shape
print test.shape

(9527, 23)
(5045, 22)


<H3>Cleansing</H3>

In [4]:
test_id = test['ID']
train.drop(['ID'], axis = 1, inplace = True)
test.drop(['ID'], axis = 1, inplace = True)

In [5]:
train.drop(['Application_Receipt_Date'], axis = 1, inplace = True)
test.drop(['Application_Receipt_Date'], axis = 1, inplace = True)

In [6]:
app_dob = []
for i in xrange (0,train.shape[0]):
    if str(train.Applicant_BirthDate[i]) == 'nan':
        app_dob.append(0)
    else:
        app_dob.append(int(train.Applicant_BirthDate[i][-4:]))
print max(set(app_dob), key=app_dob.count)

1982


In [7]:
train.Applicant_BirthDate = train.Applicant_BirthDate.fillna('1/1/1982')
test.Applicant_BirthDate = test.Applicant_BirthDate.fillna('1/1/1982')
train['Age'] = train['Applicant_BirthDate'].apply(lambda x: 2016 - int(x[-4:]))
test['Age'] = test['Applicant_BirthDate'].apply(lambda x: 2016 - int(x[-4:]))

train.drop(['Applicant_BirthDate'], axis = 1, inplace = True)
test.drop(['Applicant_BirthDate'], axis = 1, inplace = True)

In [8]:
man_dob = []
for i in xrange (0,train.shape[0]):
    if str(train.Manager_DoB[i]) == 'nan':
        man_dob.append(0)
    else:
        man_dob.append(int(train.Manager_DoB[i][-4:]))
print max(set(man_dob), key=man_dob.count)

1974


In [9]:
train.Manager_DoB = train.Manager_DoB.fillna('1/1/1974')
test.Manager_DoB = test.Manager_DoB.fillna('1/1/1974')
train['Man_age'] = train['Manager_DoB'].apply(lambda x: 2016 - int(x[-4:]))
test['Man_age'] = test['Manager_DoB'].apply(lambda x: 2016 -int(x[-4:]))

train.drop(['Manager_DoB'], axis = 1, inplace = True)
test.drop(['Manager_DoB'], axis = 1, inplace = True)

In [10]:
man_doj = []
for i in xrange (0,train.shape[0]):
    if str(train.Manager_DOJ[i]) == 'nan':
        man_doj.append(0)
    else:
        man_doj.append(int(train.Manager_DOJ[i][-4:]))
print max(set(man_doj), key=man_doj.count)

2007


In [11]:
train.Manager_DOJ = train.Manager_DOJ.fillna('1/1/2007')
test.Manager_DOJ = test.Manager_DOJ.fillna('1/1/2007')
train['Man_ex'] = train['Manager_DOJ'].apply(lambda x: 2016 - int(x[-4:]))
test['Man_ex'] = test['Manager_DOJ'].apply(lambda x: 2016 -int(x[-4:]))

train.drop(['Manager_DOJ'], axis = 1, inplace = True)
test.drop(['Manager_DOJ'], axis = 1, inplace = True)

In [12]:
train.apply(lambda x: sum(x.isnull()))

Office_PIN                        0
Applicant_City_PIN               97
Applicant_Gender                 67
Applicant_Marital_Status         73
Applicant_Occupation           1221
Applicant_Qualification          86
Manager_Joining_Designation     683
Manager_Current_Designation     683
Manager_Grade                   683
Manager_Status                  683
Manager_Gender                  683
Manager_Num_Application         683
Manager_Num_Coded               683
Manager_Business                683
Manager_Num_Products            683
Manager_Business2               683
Manager_Num_Products2           683
Business_Sourced                  0
Age                               0
Man_age                           0
Man_ex                            0
dtype: int64

In [13]:
test.loc[test['Applicant_Qualification']=='Associate/Fellow of Institute of Institute of Costs and Works Accountants of India', 'Applicant_Qualification'] = 'Associate/Fellow of Institute of Company Secretories of India'

In [14]:
data_cols = list(train.columns)
remove_cols = ['Office_PIN', 'Business_Sourced', 'Age', 'Man_age', 'Man_ex']
new_cols = [x for x in data_cols if x not in remove_cols]

In [15]:
for v in new_cols:
    print '\nFrequency count for variable %s'%v
    print train[v].value_counts()


Frequency count for variable Applicant_City_PIN
202001.0    103
492001.0     75
305001.0     64
452001.0     55
476001.0     51
281001.0     49
125001.0     48
285001.0     47
803101.0     46
274001.0     46
277001.0     45
845305.0     44
250001.0     42
431001.0     41
121004.0     41
462001.0     40
121001.0     40
201301.0     39
122001.0     39
474001.0     39
208001.0     37
834001.0     34
444601.0     33
210001.0     33
271201.0     32
277303.0     32
121002.0     30
533101.0     30
444606.0     29
224122.0     27
           ... 
206301.0      1
517583.0      1
271607.0      1
222302.0      1
422610.0      1
124021.0      1
271502.0      1
271503.0      1
173204.0      1
534320.0      1
173209.0      1
173212.0      1
173221.0      1
632001.0      1
124113.0      1
127405.0      1
271821.0      1
690524.0      1
206122.0      1
140604.0      1
281204.0      1
517501.0      1
501143.0      1
452005.0      1
206249.0      1
452011.0      1
206253.0      1
452014.0      1
271801.

In [16]:
train.dtypes

Office_PIN                       int64
Applicant_City_PIN             float64
Applicant_Gender                object
Applicant_Marital_Status        object
Applicant_Occupation            object
Applicant_Qualification         object
Manager_Joining_Designation     object
Manager_Current_Designation     object
Manager_Grade                  float64
Manager_Status                  object
Manager_Gender                  object
Manager_Num_Application        float64
Manager_Num_Coded              float64
Manager_Business               float64
Manager_Num_Products           float64
Manager_Business2              float64
Manager_Num_Products2          float64
Business_Sourced                 int64
Age                              int64
Man_age                          int64
Man_ex                           int64
dtype: object

In [17]:
train.Applicant_City_PIN.fillna(202001.0, inplace = True)
train.Applicant_Gender.fillna('M', inplace = True)
train.Applicant_Marital_Status.fillna('M', inplace = True)
train.Applicant_Occupation.fillna('Salaried', inplace = True)
train.Applicant_Qualification.fillna('Class XII', inplace = True)
train.Manager_Joining_Designation.fillna('Level 1', inplace = True)
train.Manager_Current_Designation.fillna('Level 2', inplace = True)
train.Manager_Grade.fillna(3.0, inplace = True)
train.Manager_Status.fillna('Confirmation', inplace = True)
train.Manager_Gender.fillna('M', inplace = True)
train.Manager_Num_Application.fillna(round(train.Manager_Num_Application.mean()), inplace = True)
train.Manager_Num_Coded.fillna(round(train.Manager_Num_Coded.mean()), inplace = True)
train.Manager_Business.fillna(round(train.Manager_Business.mean()), inplace = True)
train.Manager_Num_Products.fillna(round(train.Manager_Num_Products.mean()), inplace = True)
train.Manager_Business2.fillna(round(train.Manager_Business2.mean()), inplace = True)
train.Manager_Num_Products2.fillna(round(train.Manager_Num_Products2.mean()), inplace = True)

test.Applicant_City_PIN.fillna(202001.0, inplace = True)
test.Applicant_Gender.fillna('M', inplace = True)
test.Applicant_Marital_Status.fillna('M', inplace = True)
test.Applicant_Occupation.fillna('Salaried', inplace = True)
test.Applicant_Qualification.fillna('Class XII', inplace = True)
test.Manager_Joining_Designation.fillna('Level 1', inplace = True)
test.Manager_Current_Designation.fillna('Level 2', inplace = True)
test.Manager_Grade.fillna(3.0, inplace = True)
test.Manager_Status.fillna('Confirmation', inplace = True)
test.Manager_Gender.fillna('M', inplace = True)
test.Manager_Num_Application.fillna(round(train.Manager_Num_Application.mean()), inplace = True)
test.Manager_Num_Coded.fillna(round(train.Manager_Num_Coded.mean()), inplace = True)
test.Manager_Business.fillna(round(train.Manager_Business.mean()), inplace = True)
test.Manager_Num_Products.fillna(round(train.Manager_Num_Products.mean()), inplace = True)
test.Manager_Business2.fillna(round(train.Manager_Business2.mean()), inplace = True)
test.Manager_Num_Products2.fillna(round(train.Manager_Num_Products2.mean()), inplace = True)

In [18]:
app_sex = preprocessing.LabelEncoder()
train.Applicant_Gender = app_sex.fit_transform(train.Applicant_Gender)
test.Applicant_Gender = app_sex.transform(test.Applicant_Gender)

In [19]:
app_sta = preprocessing.LabelEncoder()
train.Applicant_Marital_Status = app_sta.fit_transform(train.Applicant_Marital_Status)
test.Applicant_Marital_Status = app_sta.transform(test.Applicant_Marital_Status)

In [20]:
app_occ = preprocessing.LabelEncoder()
train.Applicant_Occupation = app_occ.fit_transform(train.Applicant_Occupation)
test.Applicant_Occupation = app_occ.transform(test.Applicant_Occupation)

In [21]:
app_qua = preprocessing.LabelEncoder()
train.Applicant_Qualification = app_qua.fit_transform(train.Applicant_Qualification)
test.Applicant_Qualification = app_qua.transform(test.Applicant_Qualification)

In [22]:
man_d0 = preprocessing.LabelEncoder()
train.Manager_Joining_Designation = man_d0.fit_transform(train.Manager_Joining_Designation)
test.Manager_Joining_Designation = man_d0.transform(test.Manager_Joining_Designation)

In [23]:
man_d1 = preprocessing.LabelEncoder()
train.Manager_Current_Designation = man_d1.fit_transform(train.Manager_Current_Designation)
test.Manager_Current_Designation = man_d1.transform(test.Manager_Current_Designation)

In [24]:
man_sta = preprocessing.LabelEncoder()
train.Manager_Status = man_sta.fit_transform(train.Manager_Status)
test.Manager_Status = man_sta.transform(test.Manager_Status)

In [25]:
man_sex = preprocessing.LabelEncoder()
train.Manager_Gender = man_sex.fit_transform(train.Manager_Gender)
test.Manager_Gender = man_sex.transform(test.Manager_Gender)

In [26]:
y = train.pop('Business_Sourced')
X = train

<H3>Split train and validation</H3>

In [335]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2)
xgtrain = xgb.DMatrix(X_train, label=y_train)
xgval = xgb.DMatrix(X_val, label=y_val)
xgtest = xgb.DMatrix(test)

<H3>Run Model</H3>

In [336]:
params = {}
params["objective"] = "binary:logistic"
params["eta"] = 0.03
params["min_child_weight"] = 1
params["subsample"] = 0.6
params["scale_pos_weight"] = 1.0
params["silent"] = 1
params["max_depth"] = 6
params["eval_metric"] = 'auc'
params["colsample_bytree"] = 0.6
params["lambda"] = 50

plst = list(params.items())
num_rounds = 1000
model = xgb.train(plst, xgtrain, num_rounds)

train_pred = model.predict(xgtrain)
val_pred = model.predict(xgval)
test_pred = model.predict(xgtest)

print (roc_auc_score(y_train, train_pred))
print (roc_auc_score(y_val, val_pred))

0.860824303701
0.645575335397


<H3>Create submission file</H3>

In [337]:
submission = pd.DataFrame({"ID":test_id, "Business_Sourced":test_pred})
submission.to_csv("submission_0725.csv", index=False)

In [352]:
X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2)
xgtrain = xgb.DMatrix(X_train, label=y_train)
xgval = xgb.DMatrix(X_val, label=y_val)
xgtest = xgb.DMatrix(test)

In [353]:
params = {}
params["objective"] = "binary:logistic"
params["eta"] = 0.03
params["min_child_weight"] = 1
params["subsample"] = 0.6
params["scale_pos_weight"] = 1.0
params["silent"] = 0
params["max_depth"] = 2
params["eval_metric"] = 'auc'
params["colsample_bytree"] = 0.6
params["lambda"] = 25

plst = list(params.items())
num_rounds = 1000
model = xgb.train(plst, xgtrain, num_rounds)

train_pred = model.predict(xgtrain)
val_pred = model.predict(xgval)
test_pred = model.predict(xgtest)

print (roc_auc_score(y_train, train_pred))
print (roc_auc_score(y_val, val_pred))

0.714349530057
0.625016044629
