# Machine Learning Capstone

***

## Project Description

In this activity, you will showcase your ability to use Python for model building and data analysis. You will deploy different models to analyze a dataset and generate business insights for your stakeholders. In particular, you will build and evaluate a logistic regression model or the following machine learning models: decision tree, random forest, XGBoost. You will also update your stakeholders through an executive summary, demonstrating your ability to organize and communicate key information. 

As a specialist in data analysis, the leadership team leaves it up to you to choose an approach for building the most effective model to predict employee departure. For example, you could build and evaluate a statistical model such as logistic regression. Or, you could build and evaluate machine learning models such as decision tree, random forest, and XGBoost. Or, you could choose to deploy both statistical and machine learning models. 

For any approach, you’ll need to analyze the key factors driving employee turnover, build an effective model, and share recommendations for next steps with the leadership team. 

## Data Dictionary

| Field          | Description                                                                           |
|----------------|---------------------------------------------------------------------------------------|
| satisfaction_level |	The employee’s self-reported satisfaction level [0-1]|
| last_evaluation|	Score of employee's last performance review [0-1]|
| number_project |	Number of projects employee contributes to |
| average_monthly_hours |	Average number of hours employee worked per month |
| time_spend_company | How long the employee has been with the company (years)	|
| work_accident | Whether or not the employee experienced an accident while at work	|
| promotion_last_5years | Whether or not the employee was promoted in the last 5 years	|
| department |	The employee's department |
| salary |	The employee's salary (low, medium, or high) |
| left | Whether or not the employee left the company |

## Import Libraries

In [None]:
import numpy as np
from numpy import count_nonzero, median, mean
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random


import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
# import researchpy as rp

# import shap
# import eli5
# from IPython.display import display


import datetime
from datetime import datetime, timedelta

#import os
#import zipfile
import scipy.stats
from collections import Counter

#import pandas_profiling
#from pandas_profiling import ProfileReport

#import graphviz
#import sweetviz

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import to_graphviz, plot_importance

import sklearn
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, KFold, RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
#from sklearn.pipeline import Pipeline
#from sklearn.feature_selection import RFE, RFECV, SelectKBest, f_classif, f_regression, chi2
#from sklearn.inspection import permutation_importance

from sklearn.tree import export_graphviz, plot_tree
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, mean_squared_error,r2_score
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve, accuracy_score
from sklearn.metrics import auc, f1_score, precision_score, recall_score, roc_auc_score

#from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier, plot_tree
#from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
#from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
#from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor



%matplotlib inline
#sets the default autosave frequency in seconds
%autosave 60 
# Configure seaborn in a single call. A bare sns.set(font_scale=...) re-applies
# seaborn's default "darkgrid" style, which silently discards a preceding
# sns.set_style('dark').
sns.set(style='dark', font_scale=1.2)

# Font sizes for axis labels and tick labels, applied after the seaborn theme
# so the theme's own rc values do not replace them.
plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)


#from tpot import TPOTClassifier, TPOTRegressor
#from imblearn.under_sampling import RandomUnderSampler
#from imblearn.over_sampling import RandomOverSampler
#from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')

# import pickle
# from pickle import dump, load


# Seed the stdlib and NumPy global generators so random draws are repeatable.
np.random.seed(0)
random.seed(0)

# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

# pandas display: two-decimal floats, a wide console, and no column limit.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows',100)

In [None]:
# Load the training data from train.csv in the working directory.
df = pd.read_csv("train.csv")

In [None]:
# Display the DataFrame.
df

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics for every column, numeric or not.
df.describe(include='all')

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Column labels.
df.columns

### Create a Random Subsample of the Data for Hyperparameter Tuning

In [None]:
# Draw the tuning subsample with an explicit seed so the same rows are chosen
# regardless of how many random draws earlier cells consumed. The size is
# capped so the cell still runs on a dataset with fewer than 3500 rows.
df_random = df.sample(n=min(3500, len(df)), random_state=0)

In [None]:
df_random

***

### Train Test Split

In [None]:
X = df.iloc[:,:18]
y = df.iloc[:,18]

In [None]:
X.values, y.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Feature Scaling

In [None]:
X_train

In [None]:
scaler = StandardScaler()

In [None]:
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
X_test_scaled = scaler.transform(X_test)

In [None]:
X_train_scaled

In [None]:
X_test_scaled

## Model Training

### Logistic Regression

In [None]:
lr = LogisticRegressionCV(cv=5, verbose=1, random_state=0)

In [None]:
lr.fit(X_train_scaled, y_train)

In [None]:
lrpred = logreg.predict(X_test_scaled)

In [None]:
lrpred

### LR Model Evaluation

In [None]:
lrcm = confusion_matrix(y_test,lrpred)
lrcm

In [None]:
fig , ax = plt.subplots(figsize=(10,5))
sns.heatmap(lrcm, annot=True,fmt='.4g',linewidths=2, cmap='viridis')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
print(classification_report(y_test,lrpred))

In [None]:
plot_roc_curve(logreg,X_test_scaled,y_test)
plt.show()

## Model Tuning Using the Random Subsample

### Decision Tree: Using RandomizedSearchCV

In [None]:
X_random = df_random.iloc[:,:18]
y_random = df_random.iloc[:,18]

In [None]:
X_random.values, y_random.values

In [None]:
X_random_train, X_random_test, y_random_train, y_random_test = train_test_split(X_random, y_random, test_size=0.2, random_state=0, stratify=y_random)

In [None]:
X_random_train.shape, X_random_test.shape, y_random_train.shape, y_random_test.shape

***

In [None]:
dt = DecisionTreeClassifier(random_state=0)

In [None]:
parameters = { 'criterion': ['gini', 'entropy', 'log_loss'],
               'max_depth': np.arange(2,10,2),
               'min_samples_split': np.arange(2,20,3),
               'min_samples_leaf':np.arange(1,10,2)
             }

In [None]:
randm = RandomizedSearchCV(estimator=dt, param_distributions = parameters, cv = 5, n_iter = 10, 
                           n_jobs=-1, scoring='accuracy')

In [None]:
randm.fit(X_random, y_random)

In [None]:
randm.best_estimator_

In [None]:
randm.best_score_

In [None]:
randm.best_params_

### Decision Tree Model

In [None]:
dt = DecisionTreeClassifier(criterion='entropy', min_samples_split=14, min_samples_leaf=5, max_depth=6)

In [None]:
dt.fit(X_train_scaled, y_train)

In [None]:
dtpred = dt.predict(X_test_scaled)

In [None]:
dtpred

### DT Model Evaluation

In [None]:
dtcm = confusion_matrix(y_test,dtpred)
dtcm

In [None]:
fig , ax = plt.subplots(figsize=(10,5))
sns.heatmap(dtcm, annot=True,fmt='.4g',linewidths=2, cmap='viridis')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
print(classification_report(y_test,dtpred))

In [None]:
plot_roc_curve(dt,X_test_scaled,y_test)
plt.show()

### Plot Tree

In [None]:
X.columns

In [None]:
plt.figure(figsize=(40,25))
plot_tree(dt, feature_names=X.columns,class_names=['0','1'], fontsize=14, filled=True)
plt.show()

***

### Random Forest: Using RandomizedSearchCV

In [None]:
rf = RandomForestClassifier(random_state=0)

In [None]:
parameters = { 'criterion': ['gini', 'entropy'],
               'n_estimators' : np.arange(50,300,50),
               'max_depth': np.arange(2,10,2),
               'min_samples_split': np.arange(2,20,3),
               'min_samples_leaf':np.arange(1,10,2)
             }

In [None]:
randm = RandomizedSearchCV(estimator=rf, param_distributions = parameters, cv = 5, n_iter = 10, 
                           n_jobs=-1, scoring='accuracy')

In [None]:
randm.fit(X_random, y_random)

In [None]:
randm.best_estimator_

In [None]:
randm.best_score_

In [None]:
randm.best_params_

### Random Forest Model

In [None]:
rf = RandomForestClassifier(criterion='entropy', max_depth=8, min_samples_leaf=3,
                       min_samples_split=8, n_estimators=250, random_state=0)

In [None]:
rf.fit(X_train_scaled, y_train)

In [None]:
rfpred = rf.predict(X_test_scaled)

In [None]:
rfpred

### RF Model Evaluation

In [None]:
rfcm = confusion_matrix(y_test,rfpred)
rfcm

In [None]:
fig , ax = plt.subplots(figsize=(10,5))
sns.heatmap(rfcm, annot=True,fmt='.4g',linewidths=2, cmap='viridis')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
print(classification_report(y_test,rfpred))

In [None]:
plot_roc_curve(rf,X_test_scaled,y_test)
plt.show()

### Cross-Validation

In [None]:
cv = cross_val_score(rf,X,y,cv=5,verbose=1,scoring='accuracy')

In [None]:
cv.mean()

### Table Formatted View

In [None]:
# Work on a copy so the added columns do not end up in X_test itself.
table = X_test.copy()

In [None]:
# Actual labels of the held-out rows (assigned by index alignment).
table["True Value"] = y_test.copy()

In [None]:
# Random-forest predictions for the same rows, assigned positionally.
table["Predicted"] = np.round(rfpred)

In [None]:
table

### Test Data Prediction

In [None]:
testdata = pd.read_csv("test2.csv")

In [None]:
testdata.head()

In [None]:
answer = rf.predict(testdata)

In [None]:
answer[0:10]

In [None]:
answer = pd.Series(answer)

### Comparing Predictions with the Original Test Labels

In [None]:
testdata2 = pd.read_csv("test.csv")

In [None]:
testdata2.head()

In [None]:
original = testdata2["left"]
original

In [None]:
type(original)

In [None]:
table2 = pd.concat([original,answer], axis=1, names=['Original','Predicted'])

In [None]:
table2

### Using XGBoost (Scikit-Learn)

### Using RandomizedSearchCV

In [None]:
xgc = XGBClassifier(random_state=0, n_estimators=100, objective='binary:logistic')

In [None]:
parameters = {'max_depth': np.arange(3,10,1),
              'eta': np.arange(0.05,0.3,0.05),
              'n_estimators':np.arange(50,300,50),
              'min_child_weight': np.arange(1,5,1),
              'gamma':np.arange(0,10,2),
              'subsample':np.arange(0.5,0.9,0.1),
              'colsample_bytree':np.arange(0.5,0.9,0.1),
              'reg_alpha':np.arange(0,1,0.1),
              'reg_lambda':np.arange(0,1,0.1)
             }

In [None]:
randm = RandomizedSearchCV(estimator=xgc, param_distributions = parameters, cv = 5, n_iter = 20, 
                           n_jobs=-1, scoring='accuracy')

In [None]:
randm.fit(X_random, y_random)

In [None]:
randm.best_estimator_

In [None]:
randm.best_score_

In [None]:
randm.best_params_

### Final Model

In [None]:
xgbmodel = XGBClassifier(random_state=0, subsample = 0.5, reg_lambda = 0.4, reg_alpha = 0.5,
                         n_estimators = 250, min_child_weight = 1,  max_depth = 7,
                         gamma = 0, eta = 0.1, colsample_bytree = 0.6, objective='binary:logistic')

In [None]:
xgbmodel.fit(X_train_scaled,y_train,eval_set=[(X_test_scaled,y_test)],eval_metric='error',early_stopping_rounds=10)

In [None]:
xgb_pred = xgbmodel.predict(X_test_scaled)

In [None]:
xgb_pred

### Model Evaluation

In [None]:
xgbcm = confusion_matrix(y_test,xgb_pred)
xgbcm

In [None]:
fig , ax = plt.subplots(figsize=(10,5))
sns.heatmap(xgbcm, annot=True,fmt='.4g',linewidths=2, cmap='viridis')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
print(classification_report(y_test,xgb_pred))

In [None]:
plot_roc_curve(xgbmodel,X_test_scaled,y_test)
plt.show()

### Plot Feature Importances

In [None]:
xgbmodel.feature_importances_

In [None]:
feat_importances = pd.Series(xgbmodel.feature_importances_, index=X.columns)

In [None]:
feat_importances

In [None]:
feat_importances.nlargest(10).plot(kind='barh', figsize=(10,10))
plt.title('Feature Importances')
plt.show()

### Available importance_types = [‘weight’, ‘gain’, ‘cover’, ‘total_gain’, ‘total_cover’]

In [None]:
X.columns

In [None]:
xgbmodel.get_booster().feature_names = ['satisfaction_level', 'last_evaluation', 'number_project', 
                                        'average_monthly_hours', 'time_spend_company', 'work_accident_1', 
                                        'promotion_last_5years_1', 'department_RandD', 'department_accounting', 
                                        'department_hr', 'department_management', 'department_marketing', 
                                        'department_product_mng', 'department_sales', 'department_support', 
                                        'department_technical', 'salary_low', 'salary_medium']

In [None]:
fig, ax = plt.subplots(figsize=(20,10))
xgb.plot_importance(xgbmodel.get_booster(),ax=ax)
plt.show()

### Cross-Validation

In [None]:
cv = cross_val_score(xgbmodel,X,y,cv=5,verbose=1,scoring='accuracy')

In [None]:
cv.mean()

### Save the Model

In [None]:
# pickle is imported in this cell because the notebook's top-level pickle
# imports are commented out, which left `dump` undefined.
import pickle

filename = 'model.sav'
# Persist the fitted XGBoost classifier. This cell previously referenced
# `xgbnew`, which is never defined; `xgbmodel` is the model trained in this
# notebook. The context manager guarantees the file is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(xgbmodel, model_file)

### Load the Model

In [None]:
loaded_model = load(open(filename,'rb'))

In [None]:
loaded_model

#### Python code done by Dennis Lam