[View in Colaboratory](https://colab.research.google.com/github/gowtham91m/Analytics_Vidhya_hackathon/blob/master/WNS_Analytics_Wizard.ipynb)

# **Problem Statement**

Date : Sept 13 2018

Your client is a large MNC and they have 9 broad verticals across the organisation. One of the problems your client is facing is
around identifying the right people for promotion (only for manager position and below) and preparing them in time. Currently, the
process they are following is:
1. They first identify a set of employees based on recommendations/ past performance
2. Selected employees go through the separate training and evaluation program for each vertical. These programs are based
on the required skill of each vertical
3. At the end of the program, based on various factors such as training performance, KPI completion (only employees with
KPIs completed greater than 60% are considered) etc., an employee gets promoted

In the above-mentioned process, the final promotions are only announced after the evaluation, and this leads to a delay in the transition to
their new roles. Hence, the company needs your help in identifying the eligible candidates at a particular checkpoint so that they can
expedite the entire promotion cycle.

**Solution:**

A soft-voting classifier over three models (RandomForest, ExtraTreesClassifier and XGBoost) is used for the final submission.
There is clear overfitting in RandomForest and ExtraTreesClassifier (the train F1 is far above the validation F1), so there is still scope for model tuning.



In [0]:
#!pip install lightgbm
#import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import f1_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import PredefinedSplit
from sklearn.cross_validation import StratifiedKFold,cross_val_score
from time import time
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,VotingClassifier
import xgboost as xgb
from sklearn import svm
from sklearn.neighbors import RadiusNeighborsClassifier,KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

import os
import warnings
warnings.filterwarnings(module='sklearn*', action='ignore', category=DeprecationWarning)
warnings.filterwarnings(module='sklearn*', action='ignore', category=FutureWarning)

In [0]:
%%capture
# The %%capture cell magic above suppresses this cell's output; it has to stay on the first line.
!pip install PyDrive
#!pip install -U imbalanced-learn
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import files
# Authenticate the Colab user, then give PyDrive the application-default
# credentials so that `drive` can be used to fetch files from Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
# Fetch the competition CSVs from Google Drive into the working directory.
# Each Drive handle is created from its file id and downloaded right away.
train = drive.CreateFile({'id': '154jKYkO-NwxrRaOiPSx2unXUAQq2qXgD'})
train.GetContentFile('train.csv')

test = drive.CreateFile({'id': '1k8UPlZ4WDD49En566qeaGqJrvSt5ywbB'})
test.GetContentFile('test.csv')

# Presumably pre-imputed variants of the same data sets (judging by the
# file names); they are downloaded here alongside the raw files.
train_impute = drive.CreateFile({'id': '1jDo4Lduatl3BeujV9AwneJRHm1lCptdE'})
train_impute.GetContentFile('train_impute.csv')

test_impute = drive.CreateFile({'id': '1JcTolhhpF_pTKM9Nv0yfiYOK86wxRhXW'})
test_impute.GetContentFile('test_impute.csv')

In [0]:
def submit_file(model,test,test_id):
  """Predict on `test`, write the submission CSV and hand it to Colab for download.

  Parameters
  ----------
  model : fitted estimator exposing ``predict``.
  test : feature matrix passed unchanged to ``model.predict``.
  test_id : employee ids, written to the ``employee_id`` column.

  Every prediction below 0.5 is written as 0 and anything else as 1. The
  result is saved to ``submit.csv`` and passed to ``files.download``.
  """
  output_path = 'submit.csv'

  # Threshold at 0.5 so score-like outputs also end up as 0/1 labels.
  labels = [0 if score < 0.5 else 1 for score in model.predict(test)]

  submission = pd.DataFrame({'employee_id': test_id, 'is_promoted': labels})
  submission.to_csv(output_path, index=False)
  files.download(output_path)

## grid search cv
def report(results, n_top=3):
    """Print the scores and parameters of every candidate ranked first.

    Parameters
    ----------
    results : mapping shaped like ``cv_results_`` of a scikit-learn search
        run with ``return_train_score=True``; the keys read here are
        ``rank_test_score``, ``mean_train_score``, ``std_train_score``,
        ``mean_test_score``, ``std_test_score`` and ``params``.
    n_top : accepted for call compatibility but not used; only rank 1 is
        reported (ties for rank 1 are all printed).
    """
    best_rank = 1
    is_best = results['rank_test_score'] == best_rank

    for idx in np.flatnonzero(is_best):
        train_mean = results['mean_train_score'][idx]
        train_std = results['std_train_score'][idx]
        print("Mean train score: {0:.3f} (std: {1:.3f})".format(train_mean, train_std))

        val_mean = results['mean_test_score'][idx]
        val_std = results['std_test_score'][idx]
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(val_mean, val_std))

        print("Parameters: {0}".format(results['params'][idx]))
        print("")

def gridsearchcv(xtrain,ytrain,model,params, fit_params = None):
  """Exhaustive grid search with shuffled, stratified 3-fold CV scored by F1.

  Parameters
  ----------
  xtrain, ytrain : training features and binary target.
  model : estimator to tune.
  params : parameter grid passed to ``GridSearchCV``.
  fit_params : optional dict of extra keyword arguments forwarded to the
      estimator's ``fit`` through ``GridSearchCV.fit``.

  Returns
  -------
  The fitted ``GridSearchCV`` (``refit=True``, so the best configuration is
  refit on all of ``xtrain``). The best candidate is also printed via
  ``report``.
  """
  # Imported locally on purpose: the module-level StratifiedKFold comes from
  # the legacy sklearn.cross_validation module (removed in scikit-learn 0.20),
  # whose constructor takes the labels and ``n_folds`` instead of ``n_splits``.
  from sklearn.model_selection import StratifiedKFold

  grid = GridSearchCV(model
                      ,params
                      ,scoring = 'f1'
                      ,cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=3)
                      ,verbose = 1
                      ,refit=True
                      ,return_train_score = True
                     )
  # The constructor no longer accepts ``fit_params``; they are passed to fit().
  grid.fit(xtrain, ytrain, **(fit_params or {}))

  report(grid.cv_results_)
  return grid

def randomgridcv(xtrain,ytrain,model,params,  fit_params = None):
  """Randomised search with unshuffled, stratified 3-fold CV scored by F1.

  Parameters
  ----------
  xtrain, ytrain : training features and binary target.
  model : estimator to tune.
  params : parameter lists/distributions passed to ``RandomizedSearchCV``
      (the number of sampled candidates is left at the library default).
  fit_params : optional dict of extra keyword arguments forwarded to the
      estimator's ``fit`` through ``RandomizedSearchCV.fit``. Previously
      this argument was accepted but never used.

  Returns
  -------
  The fitted ``RandomizedSearchCV`` (``refit=True``). The best candidate is
  also printed via ``report``.
  """
  # Imported locally on purpose: the module-level StratifiedKFold comes from
  # the legacy sklearn.cross_validation module (removed in scikit-learn 0.20),
  # whose constructor takes the labels and ``n_folds`` instead of ``n_splits``.
  from sklearn.model_selection import StratifiedKFold

  grid = RandomizedSearchCV(model
                           ,params
                           ,n_jobs=5
                           # No random_state: it has no effect without shuffling
                           # and current scikit-learn rejects the combination.
                           ,cv=StratifiedKFold(n_splits=3, shuffle=False)
                           ,scoring='f1'
                           ,verbose=1
                           ,refit=True
                           ,return_train_score = True)
  grid.fit(xtrain, ytrain, **(fit_params or {}))

  report(grid.cv_results_)
  return grid

# Grid search on a custom validation split, so that the data preprocessing is fitted on the training data only and then applied to the validation data
def gridsearch(X,y,model,params, scoring = 'f1', fit_params = None):
  """Grid search evaluated on one fixed, stratified 70/30 hold-out split.

  The data is split once, a ``StandardScaler`` is fitted on the training
  part only and applied to the validation part, and the two parts are then
  recombined so that ``PredefinedSplit`` can mark which rows form the single
  validation fold.

  Parameters
  ----------
  X, y : features and target (unscaled).
  model : estimator to tune.
  params : parameter grid passed to ``GridSearchCV``.
  scoring : scorer name, ``'f1'`` by default.
  fit_params : optional dict of extra keyword arguments forwarded to the
      estimator's ``fit`` through ``GridSearchCV.fit``.

  Returns
  -------
  The fitted ``GridSearchCV``. With ``refit=True`` the best configuration is
  refit on the recombined *scaled* data; the scaler itself is local to this
  function and is not returned. The best candidate is printed via ``report``.
  """
  x_train, x_val, y_train, y_val = train_test_split(X, y, stratify = y, test_size = .3,  random_state=12)

  # Scaling statistics come from the training part only.
  ss = StandardScaler()
  x_train = ss.fit_transform(x_train)
  x_val = ss.transform(x_val)

  # StandardScaler returns NumPy arrays, which have no ``.append`` method, so
  # the parts are stacked with NumPy (this also works for array-like targets).
  X = np.concatenate([x_train, x_val], axis=0)
  y = np.concatenate([y_train, y_val], axis=0)

  # -1 keeps a row in the training set for every split; 0 puts it in fold 0.
  my_test_fold = [-1] * len(y_train) + [0] * len(y_val)

  grid = GridSearchCV(model
                      ,params
                      ,scoring = scoring
                      ,cv = PredefinedSplit(test_fold=my_test_fold)
                      ,verbose = 1
                      ,refit=True
                      ,return_train_score = True
                     )
  # The constructor no longer accepts ``fit_params``; they are passed to fit().
  grid.fit(X, y, **(fit_params or {}))

  report(grid.cv_results_)
  return grid

def randomgrid(X,y,model,params, scoring = 'f1', fit_params = None):
  """Randomised search evaluated on one fixed, stratified 70/30 hold-out split.

  The data is split once, a ``StandardScaler`` is fitted on the training
  part only and applied to the validation part, and the two parts are then
  recombined so that ``PredefinedSplit`` can mark which rows form the single
  validation fold.

  Parameters
  ----------
  X, y : features and target (unscaled).
  model : estimator to tune.
  params : parameter lists/distributions passed to ``RandomizedSearchCV``.
  scoring : scorer name, ``'f1'`` by default.
  fit_params : optional dict of extra keyword arguments forwarded to the
      estimator's ``fit`` through ``RandomizedSearchCV.fit``. Previously
      this argument was accepted but never used.

  Returns
  -------
  The fitted ``RandomizedSearchCV``. With ``refit=True`` the best
  configuration is refit on the recombined *scaled* data; the scaler itself
  is local to this function and is not returned. The best candidate is
  printed via ``report``.
  """
  x_train, x_val, y_train, y_val = train_test_split(X, y, stratify = y, test_size = .3,  random_state=12)

  # Scaling statistics come from the training part only.
  ss = StandardScaler()
  x_train = ss.fit_transform(x_train)
  x_val = ss.transform(x_val)

  # Stack with NumPy so the target may be a Series, an array or a list
  # (``Series.append`` no longer exists in current pandas).
  X = np.concatenate([x_train, x_val], axis=0)
  y = np.concatenate([y_train, y_val], axis=0)

  # -1 keeps a row in the training set for every split; 0 puts it in fold 0.
  my_test_fold = [-1] * len(y_train) + [0] * len(y_val)

  grid = RandomizedSearchCV(model, params
                   ,n_jobs=5
                   ,cv = PredefinedSplit(test_fold=my_test_fold)
                   ,scoring=scoring
                   ,verbose=1
                   ,refit=True
                   ,return_train_score = True)
  grid.fit(X, y, **(fit_params or {}))

  report(grid.cv_results_)
  return grid

In [0]:
#path = 'C:\\Users\\gmallikarjuna\\Documents\\Gowtham\\data\\av\\wns'
#os.chdir(path)
#train = pd.read_csv('train_impute.csv')
#test = pd.read_csv('test_impute.csv')

# Load the raw CSVs and one-hot encode the categorical columns.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train = pd.get_dummies(train,drop_first=True)
test = pd.get_dummies(test,drop_first=True)

# Keep the ids for the submission file; they are not a model feature.
test_id = test.employee_id
test = test.drop(['employee_id'],axis=1)

# Fill missing ratings with each data set's own most frequent rating.
# Assign the result back instead of calling fillna(inplace=True) on an
# attribute: that is a chained assignment and may leave the frame unchanged.
train['previous_year_rating'] = train['previous_year_rating'].fillna(
    train['previous_year_rating'].value_counts().index[0])
test['previous_year_rating'] = test['previous_year_rating'].fillna(
    test['previous_year_rating'].value_counts().index[0])

X=train.drop(['is_promoted','employee_id'],axis=1)
y=train['is_promoted']

# Train and test were encoded separately, so their dummy columns can differ.
# Align test to the training columns (missing dummies become 0) so the scaler
# and the models see the same features in the same order.
test = test.reindex(columns=X.columns, fill_value=0)

# Standardise using statistics of the full training set, then carve out a
# stratified 30% validation split with a fixed seed.
ss = StandardScaler()
X= ss.fit_transform(X)
test = ss.transform(test)

x_train, x_val, y_train, y_val = train_test_split(X, y,stratify = y, test_size = .3,  random_state=12)

In [12]:
# Candidate values sampled by the randomised search for the XGBoost classifier.
params = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
    'subsample': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    'n_estimators': [100, 300, 350, 400, 450, 500, 550, 600, 700],
    'booster': ['gbtree', 'gblinear', 'dart'],
    'gamma': [0.001, 0.01, 0.1, 0.3],
    'min_child_weight': [2, 3, 4, 5],
    'reg_alpha': [0.01, 0.02, 0.03, 0.04],
}

xgb_clf = xgb.XGBClassifier(objective='binary:logistic', scale_pos_weight=0.9)

# Run the search on the full scaled training set and print the elapsed
# wall-clock time in seconds.
t = time()
clf = randomgridcv(X, y, xgb_clf, params)
print(time() - t)
#submit_file(clf,test,test_id)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed: 45.1min finished


Mean train score: 0.513 (std: 0.002)
Mean validation score: 0.503 (std: 0.009)
Parameters: {'subsample': 1.0, 'reg_alpha': 0.02, 'n_estimators': 550, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0.001, 'colsample_bytree': 0.8, 'booster': 'dart'}

2939.4818251132965


In [13]:
# Candidate values sampled by the randomised search for ExtraTreesClassifier.
param_dist = {
    'max_depth': [2, 3, None],
    'max_features': ['auto', None],
    'min_samples_split': [4, 6, 8, 9, 10],
    'min_samples_leaf': [2, 3, 4, 5],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [25, 35, 40, 50, 60, 70, 100, 150],
}

# class_weight='balanced' reweights the classes; the seed is fixed.
model = ExtraTreesClassifier(class_weight='balanced', random_state=0)
et_clf = randomgridcv(X, y, model, param_dist)
#submit_file(et_clf,test,test_id)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed:  2.4min finished


Mean train score: 0.814 (std: 0.005)
Mean validation score: 0.481 (std: 0.007)
Parameters: {'n_estimators': 70, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}



In [14]:
# Candidate values sampled by the randomised search for RandomForestClassifier.
param_dist = {
    'max_depth': [2, 3, None],
    'max_features': ['auto', None],
    'min_samples_split': [2, 4, 5, 6, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [25, 35, 50, 60, 70, 80, 100, 150],
}

# class_weight='balanced' reweights the classes; the seed is fixed.
model = RandomForestClassifier(class_weight='balanced', random_state=0)
rf_clf = randomgridcv(X, y, model, param_dist)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=5)]: Done  30 out of  30 | elapsed:  1.5min finished


Mean train score: 0.882 (std: 0.004)
Mean validation score: 0.478 (std: 0.003)
Parameters: {'n_estimators': 80, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': None, 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}



In [0]:
# XGBoost classifier configured with the best parameters reported by the
# randomised search output shown earlier in this notebook.
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=0.9,
    subsample=1.0,
    reg_alpha=0.02,
    n_estimators=550,
    min_child_weight=3,
    max_depth=3,
    learning_rate=0.1,
    gamma=0.001,
    colsample_bytree=0.8,
    booster='dart',
)

In [0]:
# ExtraTrees classifier configured with the best parameters reported by the
# randomised search output shown earlier in this notebook.
et_classifier = ExtraTreesClassifier(
    class_weight='balanced',
    random_state=0,
    n_estimators=70,
    min_samples_split=8,
    min_samples_leaf=4,
    max_features=None,
    max_depth=None,
    criterion='gini',
    bootstrap=True,
)


In [0]:
# RandomForest classifier configured with the best parameters reported by the
# randomised search output shown earlier in this notebook.
rf_classifier = RandomForestClassifier(
    class_weight='balanced',
    random_state=0,
    n_estimators=80,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features=None,
    max_depth=None,
    criterion='gini',
    bootstrap=True,
)

In [21]:
# Soft-voting ensemble of the three tuned classifiers, fitted on the training
# split only so that the validation split stays unseen.
model = VotingClassifier(
    estimators=[
        ('et', et_classifier),
        ('rf', rf_classifier),
        ('xgb', xgb_classifier),
    ],
    voting='soft',
    flatten_transform=True,
)
model.fit(x_train, y_train)
# Score on the validation split; as the last expression it is the cell output.
model.score(x_val, y_val)

0.9368120172717874

In [24]:
# F1 score of the ensemble's predicted labels on the validation split.
pred = model.predict(x_val)
f1_score(y_true=y_val, y_pred=pred)

0.5371937639198219

In [25]:
# Rebuild the same soft-voting ensemble and fit it on the complete scaled
# training set; this is the model used for the submission.
model = VotingClassifier(
    estimators=[
        ('et', et_classifier),
        ('rf', rf_classifier),
        ('xgb', xgb_classifier),
    ],
    voting='soft',
    flatten_transform=True,
)
model.fit(X, y)

VotingClassifier(estimators=[('et', ExtraTreesClassifier(bootstrap=True, class_weight='balanced',
           criterion='gini', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=4,
           min_samples_split=8, min_weig...pha=0.02,
       reg_lambda=1, scale_pos_weight=0.9, seed=None, silent=True,
       subsample=1.0))],
         flatten_transform=True, n_jobs=1, voting='soft', weights=None)

In [0]:
# Predict on the scaled test set, write submit.csv and start the download.
submit_file(model=model, test=test, test_id=test_id)