In [1]:
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import KFold;
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.tools.plotting import scatter_matrix




In [2]:
# Load HR_comma_sep.csv into a DataFrame.
# NOTE(review): this is an absolute Windows path, so the cell only runs on a
# machine that has the file at exactly this location.
hr_csv_path = r'D:\Documents\ML\HR_comma_sep.csv'
data = pd.read_csv(hr_csv_path)

In [3]:
# One-hot encode the two categorical columns; the result replaces `data`.
# NOTE(review): re-running this cell fails because 'salary' and 'sales' no
# longer exist in `data` after the first run.
categorical_columns = ['salary', 'sales']
data = pd.get_dummies(data, columns=categorical_columns)

In [5]:
# Move the `left` column to position 0 (in place), keeping the remaining
# columns in their existing order.
left_column = data.pop('left')
data.insert(0, 'left', left_column)

# Show 10 randomly chosen rows as the cell output.
data.sample(10)

Unnamed: 0,left,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_high,salary_low,...,sales_IT,sales_RandD,sales_accounting,sales_hr,sales_management,sales_marketing,sales_product_mng,sales_sales,sales_support,sales_technical
12787,0,0.88,0.6,4,162,2,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3424,0,0.71,0.56,4,238,4,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9759,0,0.91,0.77,4,167,3,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
10538,0,0.9,1.0,4,218,2,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
13858,0,0.31,0.63,4,104,7,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2056,0,0.81,0.61,5,231,2,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
127,1,0.7,0.89,3,183,5,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
13547,0,0.85,0.67,3,176,2,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2637,0,0.5,0.49,3,214,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8335,0,0.59,0.51,2,126,3,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Feature matrix: every column except the target `left`.
X = data.drop(['left'], axis=1)

# Target as a 1-D NumPy array of shape (n_samples,).
# `Series.reshape` is deprecated/removed in pandas, and scikit-learn
# classifiers expect a 1-D target rather than an (n_samples, 1) column
# vector, so take the underlying values directly instead of reshaping.
y = data['left'].values

In [7]:
# Candidate classifiers to compare, grouped by family. Each estimator is
# instantiated once, in the order listed, and `MLA` is a single flat list.
MLA = (
    # Ensemble methods
    [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
    ]
    # Gaussian processes
    + [gaussian_process.GaussianProcessClassifier()]
    # Generalized linear models
    + [
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
    ]
    # Naive Bayes
    + [
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
    ]
    # Nearest neighbor
    + [neighbors.KNeighborsClassifier()]
    # Support vector machines
    + [
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
    ]
    # Single trees
    + [
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
    ]
    # Discriminant analysis
    + [
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]
    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    + [XGBClassifier()]
)


In [9]:
# Cross-validation scheme: 10 random splits, 60% train / 30% test each,
# intentionally leaving 10% of the rows out of every split.
cv_split = model_selection.ShuffleSplit(n_splits=10, test_size=.3, train_size=.6, random_state=0)

# Columns of the table used to compare the algorithms' metrics.
MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD', 'MLA Time']

# scikit-learn classifiers expect a 1-D target; flatten in case `y` was
# built as an (n_samples, 1) column vector.
y_flat = np.ravel(y)

# Table of per-algorithm predictions. Start from a *copy* of the true labels
# as a one-column DataFrame so each algorithm can add its own column without
# writing into `data`.
MLA_predict = data[['left']].copy()

# Evaluate every algorithm in MLA and collect one result record per algorithm.
compare_rows = []
for alg in MLA:
    MLA_name = alg.__class__.__name__

    # Score the model with cross validation:
    # http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
    # return_train_score=True is required for 'train_score' to be present in
    # the result on scikit-learn versions where it defaults to False.
    cv_results = model_selection.cross_validate(alg, X, y_flat, cv=cv_split, return_train_score=True)

    compare_rows.append({
        'MLA Name': MLA_name,
        'MLA Parameters': str(alg.get_params()),
        'MLA Train Accuracy Mean': cv_results['train_score'].mean(),
        'MLA Test Accuracy Mean': cv_results['test_score'].mean(),
        # If the splits are unbiased random samples, +/- 3 standard deviations
        # around the mean should cover about 99.7% of the subsets, i.e. a
        # rough "worst case" spread.
        'MLA Test Accuracy 3*STD': cv_results['test_score'].std() * 3,
        'MLA Time': cv_results['fit_time'].mean(),
    })

    # Refit on the full data set and store this algorithm's predictions as a
    # new column named after the estimator class.
    alg.fit(X, y_flat)
    MLA_predict[MLA_name] = alg.predict(X)

# Build the comparison table in one go (instead of growing it cell by cell)
# and sort it so the best mean test accuracy comes first:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
MLA_compare = pd.DataFrame(compare_rows, columns=MLA_columns)
MLA_compare.sort_values(by=['MLA Test Accuracy Mean'], ascending=False, inplace=True)
MLA_compare

Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy Mean,MLA Test Accuracy Mean,MLA Test Accuracy 3*STD,MLA Time
0,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",1.0,0.9742,0.00427343,0.0577395
4,XGBClassifier,"{'base_score': 0.5, 'colsample_bylevel': 1, 'c...",0.977097,0.973267,0.00495356,0.415169
1,ExtraTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",1.0,0.954933,0.0157925,0.0138171
2,LinearDiscriminantAnalysis,"{'n_components': None, 'priors': None, 'shrink...",0.780476,0.778,0.022594,0.0418806
3,QuadraticDiscriminantAnalysis,"{'priors': None, 'reg_param': 0.0, 'store_cova...",0.546683,0.544311,0.594327,0.0187104


In [8]:
# Reduced candidate list: two single-tree models, two discriminant-analysis
# models and XGBoost.
# NOTE(review): this cell carries execution count In [8] but is placed after
# the In [9] comparison cell, whose output lists exactly these five models;
# on a top-to-bottom run the comparison would use whatever `MLA` was defined
# earlier instead -- confirm the intended cell order.
MLA = (
    # Single trees
    [
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
    ]
    # Discriminant analysis
    + [
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]
    # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    + [XGBClassifier()]
)