## Data Loading

In [None]:
import io
import zipfile
import pandas as pd
import pprint
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt


pp = pprint.PrettyPrinter(indent=4).pprint

data_file_path = '../data/ibm-hr-analytics-attrition-dataset.zip'
encoding = 'utf-8-sig'


data = []
with zipfile.ZipFile(data_file_path) as zfile:
    for name in zfile.namelist():
        with zfile.open(name) as readfile:
            for line in io.TextIOWrapper(readfile, encoding):
                data.append(line.replace('\n', '').split(','))

labels=['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 
       'DistanceFromHome', 'Education', 'EducationField', 'EducationField'
       "EmployeeCount","EmployeeNumber","EnvironmentSatisfaction","Gender","HourlyRate","JobInvolvement",
       "JobLevel","JobRole","JobSatisfaction","MaritalStatus","MonthlyIncome","MonthlyRate","NumCompaniesWorked",
       "Over18","OverTime","PercentSalaryHike","PerformanceRating","RelationshipSatisfaction","StandardHours",
       "StockOptionLevel","TotalWorkingYears","TrainingTimesLastYear","WorkLifeBalance","YearsAtCompany",
       "YearsInCurrentRole","YearsSinceLastPromotion","YearsWithCurrManager"
      ]

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
attrition_df = pd.DataFrame(data, columns=labels)
attrition_df = attrition_df.drop([0])


attrition_df

## Data Discovery

In [None]:
# attrition_df
attrition_df.head()

# len(attrition_df.columns)

not_categorical_data = [
    "Attrition",
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime",
    "Over18"
]

# for data in not_categorical_data:
#     print(data , ':', attrition_df[data].unique())
    
pre_categorized_data = ["Education",
"EnvironmentSatisfaction",
"JobInvolvement",
"JobSatisfaction",
"PerformanceRating",
"RelationshipSatisfaction",
"WorkLifeBalance", 
"Gender",
"JobRole",
"StockOptionLevel"
]

#finding categorized data that has no signficance, has only one data value
# for data in pre_categorized_data:
#     if ( len(attrition_df[data].unique()) <= 1):
#         print(data , ':', attrition_df[data].unique())
        
# #looking at all pre_categorized data unique values
# for data in pre_categorized_data:
#     print(data , ':', attrition_df[data].unique())
    
#finding not_categorical_data that has no signficance, has only one data value
# for data in not_categorical_data:
#     if ( len(attrition_df[data].unique()) <= 1):
#         print(data , ':', attrition_df[data].unique())


#### Pre-Categorized Categorical Data

Education
1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'

EnvironmentSatisfaction
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

JobInvolvement 
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

JobSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'

PerformanceRating 
1 'Low' 2 'Good' 3 'Excellent' 4 'Outstanding'

RelationshipSatisfaction 
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

WorkLifeBalance 1 'Bad' 2 'Good' 3 'Better' 4 'Best'

In [None]:
# NOT CATEGORIZED CATEGORICAL DATA
post_categorical_data = [
    "Attrition",
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime"
]



## CHANGING CATEGORICAL DATA TO NUMBERS
# VERSION 1
# categorical_data_values_count = {}
# for data in not_categorical_data:
#     #get counts for data
#     categorical_data_values_count[data] = attrition_df[data].value_counts()
#     #change data to number
#     attrition_df[data] = attrition_df[data].factorize()[0]
    
# attrition_df


#VERSION 2
# iris['species_num'] = iris.species.map({'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2})

attrition_df["AttritionBool"] = attrition_df.Attrition.map({"Yes": 1, "No": 0})



attrition_df["TravelRarelyBool"] = attrition_df.BusinessTravel[attrition_df["BusinessTravel"] == "Travel_Rarely"]
# attrition_df["Travel_Rarely"] = attrition_df.BusinessTravel.map({"Travel_Rarely": 1, "Travel_Frequently": 2, "Non-Travel": 0})




In [None]:
# attrition_df.columns
#

In [None]:
# attrition_df["TravelRarelyBool"]

In [None]:
# attrition_df.Travel_RarelyBool.map({'Travel_Rarely':0, np.nan:1})



In [None]:
attrition_df["TravelFrequentlyBool"] = attrition_df.BusinessTravel[attrition_df["BusinessTravel"] == "Travel_Frequently"]
attrition_df["NonTravelBool"] = attrition_df.BusinessTravel[attrition_df["BusinessTravel"] == "Non-Travel"]


attrition_df["SalesDepartmentBool"] = attrition_df.Department[attrition_df.Department == "Sales"]
attrition_df["ResearchAndDevelopmentDepartmentBool"] = attrition_df.Department[attrition_df.Department == "Research & Development"]
attrition_df["HumanResourcesDepartmentBool"] = attrition_df.Department[attrition_df.Department == "Human Resources"]

# # attrition_df["Department"] = attrition_df.Department.map({"Sales": 0, "Research & Development": 1, "Human Resources": 2})

attrition_df["LifeScienceEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Life Sciences']
attrition_df["OtherEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Other']
attrition_df["MedicalEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Medical']
attrition_df["MarketingEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Marketing']
attrition_df["TechnicalEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Technical Degree']
attrition_df["HumanResourcesEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Human Resources']
# attrition_df["EducationField"] = attrition_df.EducationField.map({'Life Sciences': 0, 'Other': 1, 'Medical': 2, 'Marketing': 3, 'Technical Degree': 4,
# #  'Human Resources': 5})


# attrition_df["Gender"] = attrition_df.Gender.map({"Male": 0, "Female": 1})
attrition_df["Male"] = attrition_df.Gender[attrition_df.Gender == 'Male']
attrition_df["Female"] = attrition_df.Gender[attrition_df.Gender == 'Female']



attrition_df["SalesExecutiveBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Sales Executive']
attrition_df["ResearchScientistBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Research Scientist']
attrition_df["LaboratoryTechnicianBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Laboratory Technician']
attrition_df["ManufacturingDirectorBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Manufacturing Director']
attrition_df["HealthcareRepresentativeBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Healthcare Representative']
attrition_df["ManagerBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Manager']
attrition_df["SalesRepresentativeBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Sales Representative']
attrition_df["ResearchDirectorBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Research Director']
attrition_df["HumanResourcesBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Human Resources']


# # attrition_df["JobRole"] = attrition_df.JobRole.map({'Sales Executive': 0, 'Research Scientist': 1,
# #     'Laboratory Technician': 2, 'Manufacturing Director': 3, 'Healthcare Representative': 4, 'Manager': 5,
# #  'Sales Representative': 6, 'Research Director': 7, 'Human Resources': 8})



attrition_df["DivorcedBool"] = attrition_df.MaritalStatus[attrition_df.MaritalStatus == 'Divorced']
attrition_df["SingleBool"] = attrition_df.MaritalStatus[attrition_df.MaritalStatus == 'Single']
attrition_df["MarriedBool"] = attrition_df.MaritalStatus[attrition_df.MaritalStatus == 'Married']
# # attrition_df["MaritalStatus"] = attrition_df.MaritalStatus.map({'Single': 0, 'Married': 1, 'Divorced': 2})

attrition_df["OverTimeBool"] = attrition_df.OverTime.map({"Yes": 1, "No": 0})


# attrition_df["OverTime"] = attrition_df.OverTime.map({"Yes": 1, "No": 0})

# # attrition_df["Over18"] = attrition_df.Over18.map({"Y": 1, "N": 0})

# # attrition_df

# # # attrition_df.Education


# # # for cat_data in attrition_df[pre_categorized_data + post_categorical_data]:
# # # #     print(cat_data, ' : ',attrition_df[cat_data].mode)
# # #     print(cat_data)
    
# # # attrition_df[pre_categorized_data + post_categorical_data].Education.mode()
# attrition_df


In [None]:
# attrition_df["TravelRarelyBool"]
attrition_df["TravelRarelyBool"] = attrition_df["TravelRarelyBool"].map({"Travel_Rarely": 1, np.nan: 0})
# attrition_df["TravelRarelyBool"]

attrition_df["NonTravelBool"] = attrition_df["NonTravelBool"].map({"Non-Travel": 1, np.nan: 0})
attrition_df["TravelFrequentlyBool"] = attrition_df["TravelFrequentlyBool"].map({"Travel_Frequently": 1, np.nan: 0})

# # attrition_df["NonTravelBool"]


attrition_df["SalesDepartmentBool"] = attrition_df["SalesDepartmentBool"].map({"Sales": 1, np.nan: 0})
attrition_df["ResearchAndDevelopmentDepartmentBool"] = attrition_df["ResearchAndDevelopmentDepartmentBool"].map({"Research & Development": 1, np.nan: 0})
attrition_df["HumanResourcesDepartmentBool"] = attrition_df["HumanResourcesDepartmentBool"].map({"Human Resources": 1, np.nan: 0})


attrition_df["LifeScienceEducationBool"] = attrition_df["LifeScienceEducationBool"].map({"Life Sciences": 1, np.nan: 0})
attrition_df["OtherEducationBool"] = attrition_df["OtherEducationBool"].map({"Other": 1, np.nan: 0})
attrition_df["MedicalEducationBool"] = attrition_df["MedicalEducationBool"].map({"Medical": 1, np.nan: 0})
attrition_df["MarketingEducationBool"] = attrition_df["MarketingEducationBool"].map({"Marketing": 1, np.nan: 0})
attrition_df["TechnicalEducationBool"] = attrition_df["TechnicalEducationBool"].map({"Technical Degree": 1, np.nan: 0})
attrition_df["HumanResourcesEducationBool"] = attrition_df["HumanResourcesEducationBool"].map({"Human Resources": 1, np.nan: 0})

attrition_df["Male"] = attrition_df.Male.map({"Male": 1, np.nan: 0})
attrition_df["Female"] = attrition_df.Female.map({"Female": 1, np.nan: 0})



# attrition_df["SalesExecutiveBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Sales Executive']
# attrition_df["ResearchScientistBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Research Scientist']
# attrition_df["LaboratoryTechnicianBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Laboratory Technician']
# attrition_df["ManufacturingDirectorBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Manufacturing Director']
# attrition_df["HealthcareRepresentativeBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Healthcare Representative']
# attrition_df["ManagerBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Manager']
# attrition_df["SalesRepresentativeBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Sales Representative']
# attrition_df["ResearchDirectorBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Research Director']
# attrition_df["HumanResourcesBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Human Resources']


attrition_df["SalesExecutiveBool"] = attrition_df["SalesExecutiveBool"].map({"Sales Executive": 1, np.nan: 0})
attrition_df["ResearchScientistBool"] = attrition_df["ResearchScientistBool"].map({"Research Scientist": 1, np.nan: 0})
attrition_df["LaboratoryTechnicianBool"] = attrition_df["LaboratoryTechnicianBool"].map({"Laboratory Technician": 1, np.nan: 0})
attrition_df["ManufacturingDirectorBool"] = attrition_df["ManufacturingDirectorBool"].map({"Manufacturing Director": 1, np.nan: 0})
attrition_df["HealthcareRepresentativeBool"] = attrition_df["HealthcareRepresentativeBool"].map({"Healthcare Representative": 1, np.nan: 0})
attrition_df["ManagerBool"] = attrition_df["ManagerBool"].map({"Manager": 1, np.nan: 0})
attrition_df["SalesRepresentativeBool"] = attrition_df["SalesRepresentativeBool"].map({"Sales Representative": 1, np.nan: 0})
attrition_df["ResearchDirectorBool"] = attrition_df["ResearchDirectorBool"].map({"Research Director": 1, np.nan: 0})
attrition_df["HumanResourcesBool"] = attrition_df["HumanResourcesBool"].map({"Human Resources": 1, np.nan: 0})

# attrition_df



attrition_df["DivorcedBool"] = attrition_df["DivorcedBool"].map({"Divorced": 1, np.nan: 0})
attrition_df["SingleBool"] = attrition_df["SingleBool"].map({"Single": 1, np.nan: 0})
attrition_df["MarriedBool"] = attrition_df["MarriedBool"].map({"Married": 1, np.nan: 0})

# attrition_df.JobRole.unique()
attrition_df

In [None]:
#GET MODE FOR ALL CATEGORICAL DATA
attrition_df[pre_categorized_data + post_categorical_data].mode().iloc[0]

In [None]:
#FIND SERIES WITH NO VARIANCE/SINGLE VALUE -> THESE SERIES DO NOT HOLD PREDICTIVE VALUE AND CAN BE DROPPED
for series in attrition_df.columns:
        if (len(attrition_df[series].unique()) <= 1): print(series, ' : ', attrition_df[series].unique()) 

In [None]:
attrition_df = attrition_df.drop(columns=["EducationFieldEmployeeCount", "Over18", "StandardHours"])


# attrition_df.Over18

# attrition_df



In [None]:
#ALL NONCATEGORICAL DATA
attrition_df.head

len(list(attrition_df))

list(attrition_df)
##mean range median mode(mrm) this is to get mean, range, mode and median for all noncategorical data
series_mrm = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'EnvironmentSatisfaction',
    'HourlyRate',
    "JobSatisfaction",
    "MonthlyIncome",
    "MonthlyRate",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager"
]




In [None]:
attrition_df.applymap(type).eq(str).all()

In [None]:
# #converts all data types to floats

# for series in attrition_df.columns:
#     if series == "DailyRate":
#         print('found series')
#         print(attrition_df[series].dtype)
#     if  attrition_df[series].dtype != 'object':
# #         attrition_df[series] = attrition_df[series].astype(float)
#           if series == 'AttritionBool':
#             print('found ' + 'AttritionBool' )
#           attrition_df[series] = pd.to_numeric(series)
#     if series == "DailyRate":
#         print(type(attrition_df[series]))
# #     print(attrition_df[series].dtype)
# #     if  attrition_df[series].dtype == 'object':
# #         print('is object')

# # attrition_df.head()

# attrition_df.applymap(type).eq(str).all()

In [None]:
#confirms that data types are no longer strings
# attrition_df.applymap(type).eq(str).all()

#assert

# attrition_df.Attrition.head()

In [None]:

# attrition_df[series_mrm].mean()
# attrition_df.Age.mean()

# type(attrition_df.Age.iloc[0])

#convert all data from string type to float


attrition_df[series_mrm].mean()
attrition_df[series_mrm].mode()
attrition_df[series_mrm].median()
# attrition_df[series_mrm].range()

# range = attrition_df.max() - attrition_df.min()

# range = (range ^ 2) / range


# range

### CONFIRM NULL VALUES

### Change Categorical Data To Numeric

### Boolean Data

'Attrition' 

Will be converted from 'yes' & 'no' to 1 & 0 respectively

In [None]:
attrition_df.isnull().sum()


In [None]:
# attrition_df.columns = attrition_df.columns.str.lower()
attrition_df

In [None]:
#CONFIRM NULL VALUES
attrition_df.isnull().values.any().sum()
print("Missing Values, Detail:", '\n', attrition_df.isnull().sum())
print('Total Missing Values:', attrition_df.isnull().sum().sum())

In [None]:
attrition_df.columns

In [None]:
attrition_df.dtypes

In [None]:
attrition_df.shape

In [None]:
attrition_df.values

In [None]:
attrition_df.info()

In [None]:
attrition_df['Education']

In [None]:
attrition_df.head()

In [None]:
attrition_df.Education.describe()

In [None]:
#prints the mean of non categorical data
# for column in attrition_df:
#     if not column in [pre_categorized_data + post_categorical_data]:
#         pp(column)
#         pp(attrition_df[column].mean())

In [None]:
attrition_df.BusinessTravel.unique()

In [None]:
#confirms that data types are no longer strings
attrition_df.applymap(type).eq(str).all()
# attrition_df.YearsWithCurrManager
attrition_df.TravelFrequentlyBool

In [None]:
attrition_df.DailyRate

In [None]:
attrition_df.head(1)

list(attrition_df.columns)


In [None]:
attrition_df.head()

# Hypothesis

### I predict that

- age
- education
- environmentsatisfaction
- monthlyrate
- hourlyrate
- dailyrate

will be factors/features that will be predictive in finding employee attrition

## Visualization

In [None]:
# hypothesized_predictors = [
# "Age",
# "Education",
# "EnvironmentSatisfaction",
# "MonthlyRate",
# "HourlyRate",
# "DailyRate"
# ]

# # EnvironmentSatisfaction
# # 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
# data_env_sat = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
# data_env_sat_names = list(data_env_sat.keys())
# data_env_sat_values = list(data_env_sat.values())

# # Education
# # 1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'
# data_edu = {'Below College': 1, 'College': 2, 'Bachelor': 3, 'Master': 4, 'Doctor': 5}
# data_edu_names = list(data_edu.keys())
# data_edu_values = list(data_edu.values())

In [None]:
# # sns.regplot(y=attrition_df['AttritionBool'], x=attrition_df["DailyRate"], data=attrition_df)
# type(attrition_df["DailyRate"].unique()[0])
# # attrition_df['AttritionBool']

In [None]:
# # sns.set_style('ticks')

# sns.regplot(x=attrition_df['Attrition'], y=attrition_df["DailyRate"], data=attrition_df)

# # sns.set_style('ticks')
# # fig, ax = plt.subplots()
# # fig.set_size_inches(18.5, 10.5)
# # sns.regplot(data[:,0], data[:,1], ax=ax)
# # sns.despine()


In [None]:
# for factor in hypothesized_predictors:
#     print(factor)
#     if factor == "Education":
#         fig, axs = plt.subplots(1, 3, figsize=(9,3), sharey=True)

#         axs[0].bar(data_env_sat_names, data_env_sat_values)
#         for xtick in axs[0].get_xticklabels():
#             xtick.set_rotation(45)
        
#         axs[1].scatter(data_env_sat_names, data_env_sat_values)
#         for xtick in axs[1].get_xticklabels():
#             xtick.set_rotation(45)
        
#         axs[2].plot(data_env_sat_names, data_env_sat_values)
#         for xtick in axs[2].get_xticklabels():
#             xtick.set_rotation(45)
        
#         fig.suptitle('Categorical Plotting of Education')

#     elif factor == "EnvironmentSatisfaction":
#         fig, axs = plt.subplots(1, 3, figsize=(9,3), sharey=True)
        
#         axs[0].bar(data_edu_names, data_edu_values)
#         for xtick in axs[0].get_xticklabels():
#             xtick.set_rotation(45)
        
#         axs[1].scatter(data_edu_names, data_edu_values)
#         for xtick in axs[1].get_xticklabels():
#             xtick.set_rotation(45)
            
#         axs[2].plot(data_edu_names, data_edu_values)
#         axs[2].tick_params(axis='x', which='major', pad=115)
#         for xtick in axs[2].get_xticklabels():
#             xtick.set_rotation(45)


        
#         fig.suptitle('Categorical Plotting of Environment Satisfaction')
#     else:
#         plt.hist((attrition_df[factor]), bins=25, ec='black')
#         plt.xlabel(factor)
#         plt.ylabel('Count')
#         plt.show()

In [None]:
# plt.hist(attrition_df.Age, bins=25, ec='black')
# plt.xlabel('Age')
# plt.ylabel('Count')
# plt.show()

### Models



In [None]:
# NEW
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score, GridSearchCV
# from sklearn import cross_validation, neighbors

# OLD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import grid_search, cross_validation, neighbors

from sklearn import metrics

knn = KNeighborsClassifier(n_neighbors=7)

In [None]:
attrition_df.columns

In [None]:
feature_cols = ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'AttritionBool',
       'TravelRarelyBool', 'TravelFrequentlyBool', 'NonTravelBool',
       'SalesDepartmentBool', 'ResearchAndDevelopmentDepartmentBool',
       'HumanResourcesDepartmentBool', 'LifeScienceEducationBool',
       'OtherEducationBool', 'MedicalEducationBool', 'MarketingEducationBool',
       'TechnicalEducationBool', 'HumanResourcesEducationBool', 'Male',
       'Female', 'SalesExecutiveBool', 'ResearchScientistBool',
       'LaboratoryTechnicianBool', 'ManufacturingDirectorBool',
       'HealthcareRepresentativeBool', 'ManagerBool',
       'SalesRepresentativeBool', 'ResearchDirectorBool', 'HumanResourcesBool',
       'DivorcedBool', 'SingleBool', 'MarriedBool', 'OverTimeBool']

In [None]:
#setting features and predictors for models
X = attrition_df.drop(['Attrition', 'AttritionBool', 'OverTime', 'Department', 'BusinessTravel', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus'], axis=1)
# X = attrition_df.drop(['Attrition', 'AttritionBool', 'Over18', 'Department', 'BusinessTravel', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus'], axis=1)

y = attrition_df.AttritionBool



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
#GET BASELINE

def get_baseline():
    print(y_test.value_counts()[0] / y_test.size if (y_test.value_counts()[0] / y_test.size) > (y_test.value_counts()[1] / y_test.size) else y_test.value_counts()[1] / y_test.size)  

get_baseline()

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred_class = knn.predict(X_test)

In [None]:
y_pred_class

In [None]:
print(metrics.accuracy_score(y_test, y_pred_class))

In [None]:
print(metrics.precision_score(y_test, y_pred_class))

In [None]:
# recall, the ability to predict true positives  tp / tp + fn
metrics.recall_score(y_test,y_pred_class)
# metrics.recall_score

In [None]:
knn.predict_proba(X_test)[:,1]

In [None]:
# # #ATTEMPT AT MANUAL CROSS VALIDATION, COULD BE USEFULE FOR OTHER MODELS

# from sklearn.neighbors import KNeighborsClassifier

# def finding_best_kN_value(value=100):
#     list_of_percentages = []
#     for i in list(range(1, value)):
#         knn = KneighborsClassifier(n_neighbors=i)
#         X_train, X_test, y_train, y_test = train_test_split(X, y)
#         knn.fit(X_train, y_train)
        
#         y_pred_class = knn.predict(X_test)
        
#         if y_pred_class > max(list_of_percentages): 
#             list_of_percentages.append(metrics.accuracy_score(y_test, y_pred_class))

#     return list_of_percentages

# finding_best_kN_value()
    

In [None]:
scores = cross_val_score(knn, X, y, cv=100, scoring="accuracy")
scores

In [None]:
k = list(range(1, 101))
params = {'n_neighbors': k}
kf = cross_validation.KFold(len(attrition_df), n_folds=5)
gs = grid_search.GridSearchCV(
    estimator = neighbors.KNeighborsClassifier(),
    param_grid=params,
    cv=kf
)

#NEW VERSION
# gs = GridSearchCV(
#     estimator = neighbors.KNeighborsClassifier(),
#     param_grid=params,
#     cv=kf,
#     return_train_score=True
# )

gs.fit(X, y)
gs.grid_scores_


# lowest_std = []
# highest_mean = []
# for values in gs.grid_scores_:
# #     print(values[1])
# #     print(type(values[1]))
# #     print(lowest_std[0][1])
# #     print(lowest_std[0])
#     if len(highest_mean) == 0:
#         highest_mean.append(values)
#     else:# values[1] > lowest_std[0][1]:
#         highest_mean.pop()
#         highest_mean.append(values)
        
#     if len(lowest_std) == 0:
#         lowest_std.append(values)
#     else:# values[1] > lowest_std[0][1]:
#         lowest_std.pop()
#         lowest_std.append(values)

In [None]:
gs

In [None]:
gs.best_score_

In [None]:
gs.best_params_

In [None]:
gs.best_estimator_

In [None]:
# pd.DataFrame(gs.grid_scores_

In [None]:
# lowest_std

In [None]:
# highest_mean

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X, y)

In [None]:
print(linreg.intercept_)
print(linreg.coef_)

### Find Best Predictors


In [None]:
attrition_formatted_data

In [102]:

import itertools
from sklearn.model_selection import train_test_split
from sklearn import metrics

def train_test_rmse(feature_cols):
    X = attrition_df[feature_cols]
    y = attrition_df.AttritionBool
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    
    return np.sqrt(metrics.mean_squared_error(y_test, y_pred))


features = [col for col in attrition_df.columns if col in (['OverTimeBool', 'TotalWorkingYears', 'MonthlyIncome'])]
feature_data_list_holder= []


for feature in range(1, len(features)):
    for subset in itertools.combinations(features, feature):
        score = train_test_rmse(list(subset))
        feature_data_list_holder.append([subset, score])

feature_data_list_holder




[[('MonthlyIncome',), 0.3365711649120964],
 [('TotalWorkingYears',), 0.3337660489567392],
 [('OverTimeBool',), 0.3239614255529005],
 [('MonthlyIncome', 'TotalWorkingYears'), 0.33463412506852913],
 [('MonthlyIncome', 'OverTimeBool'), 0.3198337345310833],
 [('TotalWorkingYears', 'OverTimeBool'), 0.31624216504720976]]

### Compare to Null RootMeanSquareError

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

y_null = np.zeros_like(y_test, dtype=float)

y_null.fill(y_test.mean())
y_null

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

array([0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043478,
       0.13043478, 0.13043478, 0.13043478, 0.13043478, 0.13043

In [101]:
np.sqrt(metrics.mean_squared_error(y_test, y_null))

0.33678116053977536

## Logistic Regression - ROC curves and AUC

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1e9)

logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)

y_pred_class


### Evaluation

In [None]:
print('metrics.accuracy_score : ', metrics.accuracy_score(y_test, y_pred_class))
print('metrics.recall_score : ', metrics.recall_score(y_test,y_pred_class))
print('metrics.precision_score : ', metrics.precision_score(y_test, y_pred_class))
print('metrics.f1_score : ', metrics.f1_score(y_test, y_pred_class))
print('metrics.confusion_matrix : ', metrics.confusion_matrix(y_test, y_pred_class))

### PREDICT CONFUSION MATRIX RESULTS

In [None]:
# true negative tn, false positive fp, false negative fn, true positivie = tp
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_class).ravel()

print('TN:{} | FP:{} | FN:{} | TP:{}'.format(tn, fp, fn, tp))

In [None]:
#classification report
print(metrics.classification_report(y_test, y_pred_class))

### ROC curves and AUC

In [None]:
y_pred_prob_for_roc_auc = logreg.predict_proba(X_test)[:, 1]

# y_pred_prob = logreg.predict_proba(X_test)[:, 1]
y_pred_prob_for_roc_auc

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = (8,6)
plt.rcParams['font.size'] = 14

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob_for_roc_auc)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate -> FP/(FP+TN)')
plt.ylabel('True Positivie Rate -> Recall')

In [None]:
print(metrics.roc_auc_score(y_test, y_pred_prob_for_roc_auc))

### Done with KNN for now

In [None]:

# histogram of predicted probabilities grouped by actual response value
df = pd.DataFrame({'probability':y_pred_prob_for_roc_auc, 'actual':y_test})
df.hist(column='probability', by='actual', sharex=True, sharey=True)

### CROSS VALIDATION

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean()

In [None]:
USE ITERMETHOS TO GO THROUGH ALL COMBINATIONS AND FIND THE BEST CROSS VAL SCORE


EXAMPLE:
# # add Fare to the model
# feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Female', 'Embarked_Q', 'Embarked_S', 'Fare']
# X = titanic[feature_cols]

# # recalculate AUC
# cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean()

## Decision Tree -> Classification Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
treeClf = DecisionTreeClassifier(random_state=1)

DTscores = cross_val_score(treeClf, X, y, cv=14, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-DTscores))

In [None]:
DTscores

In [None]:
treeClfDep1 = DecisionTreeClassifier(max_depth=1, random_state=1)

DTscores = cross_val_score(treeClfDep1, X, y, cv=14, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-DTscores))

In [None]:
max_depth_range = range(1,8)

# list to store values
RMSE_scores = []

for depth in max_depth_range:
    treeClf = DecisionTreeClassifier(max_depth=depth, random_state=1)
    MSE_scores = cross_val_score(treeClf, X, y, cv=14, scoring="neg_mean_squared_error")
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

In [None]:
RMSE_scores

In [None]:
plt.plot(max_depth_range, RMSE_scores)
plt.xlabel('max_depth')
plt.ylabel('RMSE (lower is better)')

In [None]:
treeClf = DecisionTreeClassifier(max_depth=2, random_state=1)
treeClf.fit(X, y)

In [None]:
# pd.DataFrame({'feature': X.columns, 'importance': treereg.feature_importances_})
values = pd.DataFrame({'feature': X.columns, 'importance': treeClf.feature_importances_}).sort_values(by='importance', ascending=False)
values.head(3)

### Tree Diagram

In [None]:
from sklearn.externals.six import StringIO
import pydot
from sklearn.tree import export_graphviz


export_graphviz(treeClf, out_file='tree_attrition.dot', feature_names=X.columns)
!dot -Tpng tree_attrition.dot -o tree_attrition.png

In [None]:
from PIL import Image
Image.open('tree_attrition.png')

### Using The Classification Tree

In [None]:
# pd.DataFrame(attrition_df["AttritionBool"])
# pd.DataFrame(y)

In [None]:
X_test = X
y_test = y
y_pred = treeClf.predict(X_test)
y_pred

# X_test.OverTimeBool

In [None]:
np.sqrt(metrics.mean_squared_error(y_test, y_pred))
#i don't know if this is good or bad