## Data Loading

In [223]:
import io
import zipfile
import pandas as pd
import pprint
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt


pp = pprint.PrettyPrinter(indent=4).pprint

data_file_path = '../data/ibm-hr-analytics-attrition-dataset.zip'
encoding = 'utf-8-sig'


data = []
with zipfile.ZipFile(data_file_path) as zfile:
    for name in zfile.namelist():
        with zfile.open(name) as readfile:
            for line in io.TextIOWrapper(readfile, encoding):
                data.append(line.replace('\n', '').split(','))

labels=['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department', 
       'DistanceFromHome', 'Education', 'EducationField', 'EducationField'
       "EmployeeCount","EmployeeNumber","EnvironmentSatisfaction","Gender","HourlyRate","JobInvolvement",
       "JobLevel","JobRole","JobSatisfaction","MaritalStatus","MonthlyIncome","MonthlyRate","NumCompaniesWorked",
       "Over18","OverTime","PercentSalaryHike","PerformanceRating","RelationshipSatisfaction","StandardHours",
       "StockOptionLevel","TotalWorkingYears","TrainingTimesLastYear","WorkLifeBalance","YearsAtCompany",
       "YearsInCurrentRole","YearsSinceLastPromotion","YearsWithCurrManager"
      ]

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
attrition_df = pd.DataFrame(data, columns=labels)
attrition_df = attrition_df.drop([0])


attrition_df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EducationFieldEmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
2,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
3,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
4,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
5,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2
6,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,4,Male,79,3,1,Laboratory Technician,4,Single,3068,11864,0,Y,No,13,3,3,80,0,8,2,2,7,7,3,6
7,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,3,Female,81,4,1,Laboratory Technician,1,Married,2670,9964,4,Y,Yes,20,4,1,80,3,12,3,2,1,0,0,0
8,30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,1,11,4,Male,67,3,1,Laboratory Technician,3,Divorced,2693,13335,1,Y,No,22,4,2,80,1,1,2,3,1,0,0,0
9,38,No,Travel_Frequently,216,Research & Development,23,3,Life Sciences,1,12,4,Male,44,2,3,Manufacturing Director,3,Single,9526,8787,0,Y,No,21,4,2,80,0,10,2,3,9,7,1,8
10,36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,1,13,3,Male,94,3,2,Healthcare Representative,3,Married,5237,16577,6,Y,No,13,3,2,80,2,17,3,2,7,7,7,7


## Data Discovery

In [224]:
not_categorical_data = [
    "Attrition",
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime",
    "Over18"
]
  
pre_categorized_data = ["Education",
"EnvironmentSatisfaction",
"JobInvolvement",
"JobSatisfaction",
"PerformanceRating",
"RelationshipSatisfaction",
"WorkLifeBalance", 
"Gender",
"JobRole",
"StockOptionLevel"
]

In [225]:
attrition_df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EducationFieldEmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
2,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
3,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
4,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
5,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2
6,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,1,8,4,Male,79,3,1,Laboratory Technician,4,Single,3068,11864,0,Y,No,13,3,3,80,0,8,2,2,7,7,3,6
7,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,1,10,3,Female,81,4,1,Laboratory Technician,1,Married,2670,9964,4,Y,Yes,20,4,1,80,3,12,3,2,1,0,0,0
8,30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,1,11,4,Male,67,3,1,Laboratory Technician,3,Divorced,2693,13335,1,Y,No,22,4,2,80,1,1,2,3,1,0,0,0
9,38,No,Travel_Frequently,216,Research & Development,23,3,Life Sciences,1,12,4,Male,44,2,3,Manufacturing Director,3,Single,9526,8787,0,Y,No,21,4,2,80,0,10,2,3,9,7,1,8
10,36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,1,13,3,Male,94,3,2,Healthcare Representative,3,Married,5237,16577,6,Y,No,13,3,2,80,2,17,3,2,7,7,7,7


#### Pre-Categorized Categorical Data

Education
1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'

EnvironmentSatisfaction
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

JobInvolvement 
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

JobSatisfaction 1 'Low' 2 'Medium' 3 'High' 4 'Very High'

PerformanceRating 
1 'Low' 2 'Good' 3 'Excellent' 4 'Outstanding'

RelationshipSatisfaction 
1 'Low' 2 'Medium' 3 'High' 4 'Very High'

WorkLifeBalance 1 'Bad' 2 'Good' 3 'Better' 4 'Best'

In [226]:
# NOT CATEGORIZED CATEGORICAL DATA
post_categorical_data = [
    "Attrition",
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime"
]

### Data Conversion Pt 0 -> Drop Non Predictive Features

In [227]:
#FIND SERIES WITH NO VARIANCE/SINGLE VALUE -> THESE SERIES DO NOT HOLD PREDICTIVE VALUE AND CAN BE DROPPED
for series in attrition_df.columns:
        if (len(attrition_df[series].unique()) <= 1): 
            print(series, ' Will Be Dropped It has only,' ,' : ', attrition_df[series].unique(), ' Values') 
            attrition_df.drop(columns=[series], inplace=True)            

EducationFieldEmployeeCount  Will Be Dropped It has only,  :  ['1']  Values
Over18  Will Be Dropped It has only,  :  ['Y']  Values
StandardHours  Will Be Dropped It has only,  :  ['80']  Values


## Data Conversion Pt I -> Spreading Out Multiple Data Fields

In [228]:
#Setting BusinessTravel Datafields
attrition_df["TravelRarelyBool"] = attrition_df.BusinessTravel[attrition_df["BusinessTravel"] == "Travel_Rarely"]
attrition_df["TravelFrequentlyBool"] = attrition_df.BusinessTravel[attrition_df["BusinessTravel"] == "Travel_Frequently"]
attrition_df["NonTravelBool"] = attrition_df.BusinessTravel[attrition_df["BusinessTravel"] == "Non-Travel"]

#Setting Department Datafields
attrition_df["SalesDepartmentBool"] = attrition_df.Department[attrition_df.Department == "Sales"]
attrition_df["ResearchAndDevelopmentDepartmentBool"] = attrition_df.Department[attrition_df.Department == "Research & Development"]
attrition_df["HumanResourcesDepartmentBool"] = attrition_df.Department[attrition_df.Department == "Human Resources"]


# Education
# 1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'
attrition_df["EducationLevelBelowCollege"] = attrition_df.Education[attrition_df.Education == "1"]
attrition_df["EducationLevelCollege"] = attrition_df.Education[attrition_df.Education == "2"]
attrition_df["EducationLevelBachelor"] = attrition_df.Education[attrition_df.Education == "3"]
attrition_df["EducationLevelMaster"] = attrition_df.Education[attrition_df.Education == "4"]
attrition_df["EducationLevelDoctor"] = attrition_df.Education[attrition_df.Education == "5"]


# EnvironmentSatisfaction
# 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
attrition_df["EnvironmentSatisfactionLow"] = attrition_df.EnvironmentSatisfaction[attrition_df.EnvironmentSatisfaction == "1"]
attrition_df["EnvironmentSatisfactionMedium"] = attrition_df.EnvironmentSatisfaction[attrition_df.EnvironmentSatisfaction == "2"]
attrition_df["EnvironmentSatisfactionHigh"] = attrition_df.EnvironmentSatisfaction[attrition_df.EnvironmentSatisfaction == "3"]
attrition_df["EnvironmentSatisfactionVeryHigh"] = attrition_df.EnvironmentSatisfaction[attrition_df.EnvironmentSatisfaction == "4"]

# JobInvolvement 
# 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
attrition_df["JobInvolvementLow"] = attrition_df.JobInvolvement[attrition_df.JobInvolvement == "1"]
attrition_df["JobInvolvementMedium"] = attrition_df.JobInvolvement[attrition_df.JobInvolvement == "2"]
attrition_df["JobInvolvementHigh"] = attrition_df.JobInvolvement[attrition_df.JobInvolvement == "3"]
attrition_df["JobInvolvementVeryHigh"] = attrition_df.JobInvolvement[attrition_df.JobInvolvement == "4"]

# JobSatisfaction
# 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
attrition_df["JobSatisfactionLow"] = attrition_df.JobSatisfaction[attrition_df.JobSatisfaction == "1"]
attrition_df["JobSatisfactionMedium"] = attrition_df.JobSatisfaction[attrition_df.JobSatisfaction == "2"]
attrition_df["JobSatisfactionHigh"] = attrition_df.JobSatisfaction[attrition_df.JobSatisfaction == "3"]
attrition_df["JobSatisfactionVeryHigh"] = attrition_df.JobSatisfaction[attrition_df.JobSatisfaction == "4"]

# PerformanceRating 
# 1 'Low' 2 'Good' 3 'Excellent' 4 'Outstanding'
attrition_df["PerformanceRatingLow"] = attrition_df.PerformanceRating[attrition_df.PerformanceRating == "1"]
attrition_df["PerformanceRatingGood"] = attrition_df.PerformanceRating[attrition_df.PerformanceRating == "2"]
attrition_df["PerformanceRatingExcellent"] = attrition_df.PerformanceRating[attrition_df.PerformanceRating == "3"]
attrition_df["PerformanceRatingOutstanding"] = attrition_df.PerformanceRating[attrition_df.PerformanceRating == "4"]


# RelationshipSatisfaction 
# 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
attrition_df["RelationshipSatisfactionLow"] = attrition_df.RelationshipSatisfaction[attrition_df.RelationshipSatisfaction == "1"]
attrition_df["RelationshipSatisfactionMedium"] = attrition_df.RelationshipSatisfaction[attrition_df.RelationshipSatisfaction == "2"]
attrition_df["RelationshipSatisfactionHigh"] = attrition_df.RelationshipSatisfaction[attrition_df.RelationshipSatisfaction == "3"]
attrition_df["RelationshipSatisfactionVeryHigh"] = attrition_df.RelationshipSatisfaction[attrition_df.RelationshipSatisfaction == "4"]

# WorkLifeBalance
# 1 'Bad' 2 'Good' 3 'Better' 4 'Best'
attrition_df["WorkLifeBalanceBad"] = attrition_df.WorkLifeBalance[attrition_df.WorkLifeBalance == "1"]
attrition_df["WorkLifeBalanceGood"] = attrition_df.WorkLifeBalance[attrition_df.WorkLifeBalance == "2"]
attrition_df["WorkLifeBalanceBetter"] = attrition_df.WorkLifeBalance[attrition_df.WorkLifeBalance == "3"]
attrition_df["WorkLifeBalanceBest"] = attrition_df.WorkLifeBalance[attrition_df.WorkLifeBalance == "4"]


#Setting EducationField Datafields
attrition_df["LifeScienceEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Life Sciences']
attrition_df["OtherEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Other']
attrition_df["MedicalEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Medical']
attrition_df["MarketingEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Marketing']
attrition_df["TechnicalEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Technical Degree']
attrition_df["HumanResourcesEducationBool"] = attrition_df.EducationField[attrition_df.EducationField == 'Human Resources']

#Setting Gender Datafields
attrition_df["Male"] = attrition_df.Gender[attrition_df.Gender == 'Male']
attrition_df["Female"] = attrition_df.Gender[attrition_df.Gender == 'Female']


#Setting JobRole Datafields
attrition_df["JobRoleSalesExecutiveBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Sales Executive']
attrition_df["JobRoleResearchScientistBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Research Scientist']
attrition_df["JobRoleLaboratoryTechnicianBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Laboratory Technician']
attrition_df["JobRoleManufacturingDirectorBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Manufacturing Director']
attrition_df["JobRoleHealthcareRepresentativeBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Healthcare Representative']
attrition_df["JobRoleManagerBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Manager']
attrition_df["JobRoleSalesRepresentativeBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Sales Representative']
attrition_df["JobRoleResearchDirectorBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Research Director']
attrition_df["JobRoleHumanResourcesBool"] = attrition_df.JobRole[attrition_df.JobRole == 'Human Resources']

#Setting MaritalStatus Datafields
attrition_df["DivorcedBool"] = attrition_df.MaritalStatus[attrition_df.MaritalStatus == 'Divorced']
attrition_df["SingleBool"] = attrition_df.MaritalStatus[attrition_df.MaritalStatus == 'Single']
attrition_df["MarriedBool"] = attrition_df.MaritalStatus[attrition_df.MaritalStatus == 'Married']

#Setting JobLevel Datafields
attrition_df["JobLevel1"] = attrition_df.JobLevel[attrition_df.JobLevel == "1"]
attrition_df["JobLevel2"] = attrition_df.JobLevel[attrition_df.JobLevel == "2"]
attrition_df["JobLevel3"] = attrition_df.JobLevel[attrition_df.JobLevel == "3"]
attrition_df["JobLevel4"] = attrition_df.JobLevel[attrition_df.JobLevel == "4"]
attrition_df["JobLevel5"] = attrition_df.JobLevel[attrition_df.JobLevel == "5"]

#Setting StockOptionLevel Datafields
attrition_df["StockOptionLevel0"] = attrition_df.StockOptionLevel[attrition_df.StockOptionLevel == "0"]
attrition_df["StockOptionLevel1"] = attrition_df.StockOptionLevel[attrition_df.StockOptionLevel == "1"]
attrition_df["StockOptionLevel2"] = attrition_df.StockOptionLevel[attrition_df.StockOptionLevel == "2"]
attrition_df["StockOptionLevel3"] = attrition_df.StockOptionLevel[attrition_df.StockOptionLevel == "3"]
attrition_df["StockOptionLevel4"] = attrition_df.StockOptionLevel[attrition_df.StockOptionLevel == "4"]

# attrition_df["OverTime"] = attrition_df.OverTime.map({"Yes": 1, "No": 0})

# # attrition_df["Over18"] = attrition_df.Over18.map({"Y": 1, "N": 0})

# # attrition_df

# # # attrition_df.Education


# # # for cat_data in attrition_df[pre_categorized_data + post_categorical_data]:
# # # #     print(cat_data, ' : ',attrition_df[cat_data].mode)
# # #     print(cat_data)
    
# # # attrition_df[pre_categorized_data + post_categorical_data].Education.mode()
# attrition_df


In [None]:
attrition_df.iloc[0]

In [None]:
attrition_df["JobLevel2"] = attrition_df["JobLevel2"].map({"2": 1, np.nan: 0})


attrition_df["JobLevel2"].iloc(0)

In [230]:
attrition_df["JobLevel2"].iloc[0]

1

## Mapping Data To Booleans

In [229]:
attrition_df["AttritionBool"] = attrition_df.Attrition.map({"Yes": 1, "No": 0})
attrition_df["OverTimeBool"] = attrition_df.OverTime.map({"Yes": 1, "No": 0})
attrition_df["TravelRarelyBool"] = attrition_df["TravelRarelyBool"].map({"Travel_Rarely": 1, np.nan: 0})
attrition_df["NonTravelBool"] = attrition_df["NonTravelBool"].map({"Non-Travel": 1, np.nan: 0})
attrition_df["TravelFrequentlyBool"] = attrition_df["TravelFrequentlyBool"].map({"Travel_Frequently": 1, np.nan: 0})
attrition_df["SalesDepartmentBool"] = attrition_df["SalesDepartmentBool"].map({"Sales": 1, np.nan: 0})
attrition_df["ResearchAndDevelopmentDepartmentBool"] = attrition_df["ResearchAndDevelopmentDepartmentBool"].map({"Research & Development": 1, np.nan: 0})
attrition_df["HumanResourcesDepartmentBool"] = attrition_df["HumanResourcesDepartmentBool"].map({"Human Resources": 1, np.nan: 0})


attrition_df["LifeScienceEducationBool"] = attrition_df["LifeScienceEducationBool"].map({"Life Sciences": 1, np.nan: 0})
attrition_df["OtherEducationBool"] = attrition_df["OtherEducationBool"].map({"Other": 1, np.nan: 0})
attrition_df["MedicalEducationBool"] = attrition_df["MedicalEducationBool"].map({"Medical": 1, np.nan: 0})
attrition_df["MarketingEducationBool"] = attrition_df["MarketingEducationBool"].map({"Marketing": 1, np.nan: 0})
attrition_df["TechnicalEducationBool"] = attrition_df["TechnicalEducationBool"].map({"Technical Degree": 1, np.nan: 0})
attrition_df["HumanResourcesEducationBool"] = attrition_df["HumanResourcesEducationBool"].map({"Human Resources": 1, np.nan: 0})

attrition_df["Male"] = attrition_df.Male.map({"Male": 1, np.nan: 0})
attrition_df["Female"] = attrition_df.Female.map({"Female": 1, np.nan: 0})


attrition_df["JobRoleSalesExecutiveBool"] = attrition_df["JobRoleSalesExecutiveBool"].map({"Sales Executive": 1, np.nan: 0})
attrition_df["JobRoleResearchScientistBool"] = attrition_df["JobRoleResearchScientistBool"].map({"Research Scientist": 1, np.nan: 0})
attrition_df["JobRoleLaboratoryTechnicianBool"] = attrition_df["JobRoleLaboratoryTechnicianBool"].map({"Laboratory Technician": 1, np.nan: 0})
attrition_df["JobRoleManufacturingDirectorBool"] = attrition_df["JobRoleManufacturingDirectorBool"].map({"Manufacturing Director": 1, np.nan: 0})
attrition_df["JobRoleHealthcareRepresentativeBool"] = attrition_df["JobRoleHealthcareRepresentativeBool"].map({"Healthcare Representative": 1, np.nan: 0})
attrition_df["JobRoleManagerBool"] = attrition_df["JobRoleManagerBool"].map({"Manager": 1, np.nan: 0})
attrition_df["JobRoleSalesRepresentativeBool"] = attrition_df["JobRoleSalesRepresentativeBool"].map({"Sales Representative": 1, np.nan: 0})
attrition_df["JobRoleResearchDirectorBool"] = attrition_df["JobRoleResearchDirectorBool"].map({"Research Director": 1, np.nan: 0})
attrition_df["JobRoleHumanResourcesBool"] = attrition_df["JobRoleHumanResourcesBool"].map({"Human Resources": 1, np.nan: 0})


attrition_df["DivorcedBool"] = attrition_df["DivorcedBool"].map({"Divorced": 1, np.nan: 0})
attrition_df["SingleBool"] = attrition_df["SingleBool"].map({"Single": 1, np.nan: 0})
attrition_df["MarriedBool"] = attrition_df["MarriedBool"].map({"Married": 1, np.nan: 0})


#################
# Re-Mapping Categorical Data
# Education Mapping
# 1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'
attrition_df["EducationLevelBelowCollege"] = attrition_df["EducationLevelBelowCollege"].map({"1": 1, np.nan: 0})
attrition_df["EducationLevelCollege"] = attrition_df["EducationLevelCollege"].map({"2": 1, np.nan: 0})
attrition_df["EducationLevelBachelor"] = attrition_df["EducationLevelBachelor"].map({"3": 1, np.nan: 0})
attrition_df["EducationLevelMaster"] = attrition_df["EducationLevelMaster"].map({"4": 1, np.nan: 0})
attrition_df["EducationLevelDoctor"] = attrition_df["EducationLevelDoctor"].map({"5": 1, np.nan: 0})



# EnvironmentSatisfaction
# 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
attrition_df["EnvironmentSatisfactionLow"] = attrition_df["EnvironmentSatisfactionLow"].map({"1": 1, np.nan: 0})
attrition_df["EnvironmentSatisfactionMedium"] = attrition_df["EnvironmentSatisfactionMedium"].map({"2": 1, np.nan: 0})
attrition_df["EnvironmentSatisfactionHigh"] = attrition_df["EnvironmentSatisfactionHigh"].map({"3": 1, np.nan: 0})
attrition_df["EnvironmentSatisfactionVeryHigh"] = attrition_df["EnvironmentSatisfactionVeryHigh"].map({"4": 1, np.nan: 0})


# JobInvolvement 
# 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
attrition_df["JobInvolvementLow"] = attrition_df["JobInvolvementLow"].map({"1": 1, np.nan: 0})
attrition_df["JobInvolvementMedium"] = attrition_df["JobInvolvementMedium"].map({"2": 1, np.nan: 0})
attrition_df["JobInvolvementHigh"] = attrition_df["JobInvolvementHigh"].map({"3": 1, np.nan: 0})
attrition_df["JobInvolvementVeryHigh"] = attrition_df["JobInvolvementVeryHigh"].map({"4": 1, np.nan: 0})

# JobSatisfaction
# 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
attrition_df["JobSatisfactionLow"] = attrition_df["JobSatisfactionLow"].map({"1": 1, np.nan: 0})
attrition_df["JobSatisfactionMedium"] = attrition_df["JobSatisfactionMedium"].map({"2": 1, np.nan: 0})
attrition_df["JobSatisfactionHigh"] = attrition_df["JobSatisfactionHigh"].map({"3": 1, np.nan: 0})
attrition_df["JobSatisfactionVeryHigh"] = attrition_df["JobSatisfactionVeryHigh"].map({"4": 1, np.nan: 0})

# PerformanceRating 
# 1 'Low' 2 'Good' 3 'Excellent' 4 'Outstanding'
attrition_df["PerformanceRatingLow"] = attrition_df["PerformanceRatingLow"].map({"1": 1, np.nan: 0})
attrition_df["PerformanceRatingGood"] = attrition_df["PerformanceRatingGood"].map({"2": 1, np.nan: 0})
attrition_df["PerformanceRatingExcellent"] = attrition_df["PerformanceRatingExcellent"].map({"3": 1, np.nan: 0})
attrition_df["PerformanceRatingOutstanding"] = attrition_df["PerformanceRatingOutstanding"].map({"4": 1, np.nan: 0})


# RelationshipSatisfaction 
# 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
attrition_df["RelationshipSatisfactionLow"] = attrition_df["RelationshipSatisfactionLow"].map({"1": 1, np.nan: 0})
attrition_df["RelationshipSatisfactionMedium"] = attrition_df["RelationshipSatisfactionMedium"].map({"2": 1, np.nan: 0})
attrition_df["RelationshipSatisfactionHigh"] = attrition_df["RelationshipSatisfactionHigh"].map({"3": 1, np.nan: 0})
attrition_df["RelationshipSatisfactionVeryHigh"] = attrition_df["RelationshipSatisfactionVeryHigh"].map({"4": 1, np.nan: 0})


# WorkLifeBalance
# 1 'Bad' 2 'Good' 3 'Better' 4 'Best'
attrition_df["WorkLifeBalanceBad"] = attrition_df["WorkLifeBalanceBad"].map({"1": 1, np.nan: 0})
attrition_df["WorkLifeBalanceGood"] = attrition_df["WorkLifeBalanceGood"].map({"2": 1, np.nan: 0})
attrition_df["WorkLifeBalanceBetter"] = attrition_df["WorkLifeBalanceBetter"].map({"3": 1, np.nan: 0})
attrition_df["WorkLifeBalanceBest"] = attrition_df["WorkLifeBalanceBest"].map({"4": 1, np.nan: 0})

#JOBLEVEL,STOCKOPTION SERIES IS NOT BEING HANDLED PROPERLY, NEEDS RELABELING AND MAPPING

attrition_df["JobLevel1"] = attrition_df["JobLevel1"].map({"1": 1, np.nan: 0})
attrition_df["JobLevel2"] = attrition_df["JobLevel2"].map({"2": 1, np.nan: 0})
attrition_df["JobLevel3"] = attrition_df["JobLevel3"].map({"3": 1, np.nan: 0})
attrition_df["JobLevel4"] = attrition_df["JobLevel4"].map({"4": 1, np.nan: 0})
attrition_df["JobLevel5"] = attrition_df["JobLevel5"].map({"5": 1, np.nan: 0})


attrition_df["StockOptionLevel0"] = attrition_df["StockOptionLevel0"].map({"0": 1, np.nan: 0})
attrition_df["StockOptionLevel1"] = attrition_df["StockOptionLevel1"].map({"1": 1, np.nan: 0})
attrition_df["StockOptionLevel2"] = attrition_df["StockOptionLevel2"].map({"2": 1, np.nan: 0})
attrition_df["StockOptionLevel3"] = attrition_df["StockOptionLevel3"].map({"3": 1, np.nan: 0})
attrition_df["StockOptionLevel4"] = attrition_df["StockOptionLevel4"].map({"4": 1, np.nan: 0})


In [231]:
attrition_df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,TravelRarelyBool,TravelFrequentlyBool,NonTravelBool,SalesDepartmentBool,ResearchAndDevelopmentDepartmentBool,HumanResourcesDepartmentBool,EducationLevelBelowCollege,EducationLevelCollege,EducationLevelBachelor,EducationLevelMaster,EducationLevelDoctor,EnvironmentSatisfactionLow,EnvironmentSatisfactionMedium,EnvironmentSatisfactionHigh,EnvironmentSatisfactionVeryHigh,JobInvolvementLow,JobInvolvementMedium,JobInvolvementHigh,JobInvolvementVeryHigh,JobSatisfactionLow,JobSatisfactionMedium,JobSatisfactionHigh,JobSatisfactionVeryHigh,PerformanceRatingLow,PerformanceRatingGood,PerformanceRatingExcellent,PerformanceRatingOutstanding,RelationshipSatisfactionLow,RelationshipSatisfactionMedium,RelationshipSatisfactionHigh,RelationshipSatisfactionVeryHigh,WorkLifeBalanceBad,WorkLifeBalanceGood,WorkLifeBalanceBetter,WorkLifeBalanceBest,LifeScienceEducationBool,OtherEducationBool,MedicalEducationBool,MarketingEducationBool,TechnicalEducationBool,HumanResourcesEducationBool,Male,Female,JobRoleSalesExecutiveBool,JobRoleResearchScientistBool,JobRoleLaboratoryTechnicianBool,JobRoleManufacturingDirectorBool,JobRoleHealthcareRepresentativeBool,JobRoleManagerBool,JobRoleSalesRepresentativeBool,JobRoleResearchDirectorBool,JobRoleHumanResourcesBool,DivorcedBool,SingleBool,MarriedBool,JobLevel1,JobLevel2,JobLevel3,JobLevel4,JobLevel5,StockOptionLevel0,StockOptionLevel1,StockOptionLevel2,StockOptionLevel3,StockOptionLevel4,AttritionBool,OverTimeBool
1,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Yes,11,3,1,0,8,0,1,6,4,0,5,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,1
2,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,No,23,4,4,1,10,3,3,10,7,1,7,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0
3,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Yes,15,3,2,0,7,3,3,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,1
4,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Yes,11,3,3,0,8,3,3,8,7,3,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1
5,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,No,12,3,4,1,6,3,3,2,2,2,2,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0
6,32,No,Travel_Frequently,1005,Research & Development,2,2,Life Sciences,8,4,Male,79,3,1,Laboratory Technician,4,Single,3068,11864,0,No,13,3,3,0,8,2,2,7,7,3,6,0,1,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
7,59,No,Travel_Rarely,1324,Research & Development,3,3,Medical,10,3,Female,81,4,1,Laboratory Technician,1,Married,2670,9964,4,Yes,20,4,1,3,12,3,2,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1
8,30,No,Travel_Rarely,1358,Research & Development,24,1,Life Sciences,11,4,Male,67,3,1,Laboratory Technician,3,Divorced,2693,13335,1,No,22,4,2,1,1,2,3,1,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
9,38,No,Travel_Frequently,216,Research & Development,23,3,Life Sciences,12,4,Male,44,2,3,Manufacturing Director,3,Single,9526,8787,0,No,21,4,2,0,10,2,3,9,7,1,8,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0
10,36,No,Travel_Rarely,1299,Research & Development,27,3,Medical,13,3,Male,94,3,2,Healthcare Representative,3,Married,5237,16577,6,No,13,3,2,2,17,3,2,7,7,7,7,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0


In [None]:
#confirms that data types are no longer strings
print(attrition_df.applymap(type).eq(str).all())

#Convert all string numbers to integers
list_of_string_series = ['Age','DailyRate', 'DistanceFromHome',
                         'EmployeeNumber', 'HourlyRate', 'MonthlyIncome',
                         'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
                         'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
                         'YearsInCurrentRole', 'YearsSinceLastPromotion','YearsWithCurrManager'
                        ]

for series in list_of_string_series:
    attrition_df[series] = pd.to_numeric(attrition_df[series], errors='coerce')


print(attrition_df.applymap(type).eq(str).all())




#assert

# attrition_df.Attrition.head()

# pre_categorized_data = [
# "Education",
# "EnvironmentSatisfaction",
# "JobInvolvement",
# "JobSatisfaction",
# "PerformanceRating",
# "RelationshipSatisfaction",
# "WorkLifeBalance", 
# "Gender",
# "JobRole",
# "StockOptionLevel"
# ]



In [None]:
for series in attrition_df.columns:
    if attrition_df[series].dtype !=  np.dtype('int64'):#  attrition_df[series].dtype != np.dtype('O'):
        attrition_df.drop([series], axis=1, inplace=True)

In [None]:
attrition_df

In [None]:
#GET MODE FOR ALL CATEGORICAL DATA
attrition_df[pre_categorized_data + post_categorical_data].mode().iloc[0]

In [None]:
attrition_df.drop(columns=['EducationFieldEmployeeCount'], inplace=True)

In [None]:
attrition_df.columns

In [None]:
attrition_df.columns
# attrition_df



In [None]:
#ALL NONCATEGORICAL DATA
attrition_df.head

len(list(attrition_df))

list(attrition_df)
##mean range median mode(mrm) this is to get mean, range, mode and median for all noncategorical data
series_mrm = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'EnvironmentSatisfaction',
    'HourlyRate',
    "JobSatisfaction",
    "MonthlyIncome",
    "MonthlyRate",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager"
]




In [None]:
attrition_df.applymap(type).eq(str).all()

In [None]:
# #converts all data types to floats

# for series in attrition_df.columns:
#     if series == "DailyRate":
#         print('found series')
#         print(attrition_df[series].dtype)
#     if  attrition_df[series].dtype != 'object':
# #         attrition_df[series] = attrition_df[series].astype(float)
#           if series == 'AttritionBool':
#             print('found ' + 'AttritionBool' )
#           attrition_df[series] = pd.to_numeric(series)
#     if series == "DailyRate":
#         print(type(attrition_df[series]))
# #     print(attrition_df[series].dtype)
# #     if  attrition_df[series].dtype == 'object':
# #         print('is object')

# # attrition_df.head()

# attrition_df.applymap(type).eq(str).all()

In [None]:
attrition_df['Education']

In [None]:

# attrition_df[series_mrm].mean()
# attrition_df.Age.mean()

# type(attrition_df.Age.iloc[0])

#convert all data from string type to float


attrition_df[series_mrm].mean()
attrition_df[series_mrm].mode()
attrition_df[series_mrm].median()
# attrition_df[series_mrm].range()

# range = attrition_df.max() - attrition_df.min()

# range = (range ^ 2) / range


# range

### CONFIRM NULL VALUES

### Change Categorical Data To Numeric

### Boolean Data

'Attrition' 

Will be converted from 'yes' & 'no' to 1 & 0 respectively

In [None]:
attrition_df.isnull().sum()


In [None]:
# attrition_df.columns = attrition_df.columns.str.lower()
attrition_df

In [None]:
#CONFIRM NULL VALUES
attrition_df.isnull().values.any().sum()
print("Missing Values, Detail:", '\n', attrition_df.isnull().sum())
print('Total Missing Values:', attrition_df.isnull().sum().sum())

In [None]:
attrition_df.columns

In [None]:
attrition_df.dtypes

In [None]:
attrition_df.shape

In [None]:
attrition_df.values

In [None]:
attrition_df.info()

In [None]:
attrition_df['Education']

In [None]:
attrition_df.head()

In [None]:
attrition_df.Education.describe()

In [None]:
#prints the mean of non categorical data
# for column in attrition_df:
#     if not column in [pre_categorized_data + post_categorical_data]:
#         pp(column)
#         pp(attrition_df[column].mean())

In [None]:
attrition_df.BusinessTravel.unique()

In [None]:
#confirms that data types are no longer strings
attrition_df.applymap(type).eq(str).all()
# attrition_df.YearsWithCurrManager
attrition_df.TravelFrequentlyBool

In [None]:
attrition_df.DailyRate

In [None]:
attrition_df.head(1)

list(attrition_df.columns)


In [None]:
attrition_df.head()

# Hypothesis

### I predict that

- age
- education
- environmentsatisfaction
- monthlyrate
- hourlyrate
- dailyrate

will be factors/features that will be predictive in finding employee attrition

## Visualization

In [None]:
# hypothesized_predictors = [
# "Age",
# "Education",
# "EnvironmentSatisfaction",
# "MonthlyRate",
# "HourlyRate",
# "DailyRate"
# ]

# # EnvironmentSatisfaction
# # 1 'Low' 2 'Medium' 3 'High' 4 'Very High'
# data_env_sat = {'Low': 1, 'Medium': 2, 'High': 3, 'Very High': 4}
# data_env_sat_names = list(data_env_sat.keys())
# data_env_sat_values = list(data_env_sat.values())

# # Education
# # 1 'Below College' 2 'College' 3 'Bachelor' 4 'Master' 5 'Doctor'
# data_edu = {'Below College': 1, 'College': 2, 'Bachelor': 3, 'Master': 4, 'Doctor': 5}
# data_edu_names = list(data_edu.keys())
# data_edu_values = list(data_edu.values())

In [None]:
# # sns.regplot(y=attrition_df['AttritionBool'], x=attrition_df["DailyRate"], data=attrition_df)
# type(attrition_df["DailyRate"].unique()[0])
# # attrition_df['AttritionBool']

In [None]:
# # sns.set_style('ticks')

# sns.regplot(x=attrition_df['Attrition'], y=attrition_df["DailyRate"], data=attrition_df)

# # sns.set_style('ticks')
# # fig, ax = plt.subplots()
# # fig.set_size_inches(18.5, 10.5)
# # sns.regplot(data[:,0], data[:,1], ax=ax)
# # sns.despine()


In [None]:
# for factor in hypothesized_predictors:
#     print(factor)
#     if factor == "Education":
#         fig, axs = plt.subplots(1, 3, figsize=(9,3), sharey=True)

#         axs[0].bar(data_env_sat_names, data_env_sat_values)
#         for xtick in axs[0].get_xticklabels():
#             xtick.set_rotation(45)
        
#         axs[1].scatter(data_env_sat_names, data_env_sat_values)
#         for xtick in axs[1].get_xticklabels():
#             xtick.set_rotation(45)
        
#         axs[2].plot(data_env_sat_names, data_env_sat_values)
#         for xtick in axs[2].get_xticklabels():
#             xtick.set_rotation(45)
        
#         fig.suptitle('Categorical Plotting of Education')

#     elif factor == "EnvironmentSatisfaction":
#         fig, axs = plt.subplots(1, 3, figsize=(9,3), sharey=True)
        
#         axs[0].bar(data_edu_names, data_edu_values)
#         for xtick in axs[0].get_xticklabels():
#             xtick.set_rotation(45)
        
#         axs[1].scatter(data_edu_names, data_edu_values)
#         for xtick in axs[1].get_xticklabels():
#             xtick.set_rotation(45)
            
#         axs[2].plot(data_edu_names, data_edu_values)
#         axs[2].tick_params(axis='x', which='major', pad=115)
#         for xtick in axs[2].get_xticklabels():
#             xtick.set_rotation(45)


        
#         fig.suptitle('Categorical Plotting of Environment Satisfaction')
#     else:
#         plt.hist((attrition_df[factor]), bins=25, ec='black')
#         plt.xlabel(factor)
#         plt.ylabel('Count')
#         plt.show()

In [None]:
# plt.hist(attrition_df.Age, bins=25, ec='black')
# plt.xlabel('Age')
# plt.ylabel('Count')
# plt.show()

### Models



In [None]:
# NEW
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import cross_val_score, GridSearchCV
# from sklearn import cross_validation, neighbors

# OLD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import grid_search, cross_validation, neighbors

from sklearn import metrics

knn = KNeighborsClassifier(n_neighbors=7)

In [None]:
attrition_df.columns

In [None]:
feature_cols = ['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'AttritionBool',
       'TravelRarelyBool', 'TravelFrequentlyBool', 'NonTravelBool',
       'SalesDepartmentBool', 'ResearchAndDevelopmentDepartmentBool',
       'HumanResourcesDepartmentBool', 'LifeScienceEducationBool',
       'OtherEducationBool', 'MedicalEducationBool', 'MarketingEducationBool',
       'TechnicalEducationBool', 'HumanResourcesEducationBool', 'Male',
       'Female', 'SalesExecutiveBool', 'ResearchScientistBool',
       'LaboratoryTechnicianBool', 'ManufacturingDirectorBool',
       'HealthcareRepresentativeBool', 'ManagerBool',
       'SalesRepresentativeBool', 'ResearchDirectorBool', 'HumanResourcesBool',
       'DivorcedBool', 'SingleBool', 'MarriedBool', 'OverTimeBool']

In [None]:
#setting features and predictors for models
X = attrition_df.drop(['Attrition', 'AttritionBool', 'OverTime', 'Department', 'BusinessTravel', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus'], axis=1)
# X = attrition_df.drop(['Attrition', 'AttritionBool', 'Over18', 'Department', 'BusinessTravel', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus'], axis=1)

y = attrition_df.AttritionBool



In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
#GET BASELINE

def get_baseline():
    print(y_test.value_counts()[0] / y_test.size if (y_test.value_counts()[0] / y_test.size) > (y_test.value_counts()[1] / y_test.size) else y_test.value_counts()[1] / y_test.size)  

get_baseline()

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_pred_class = knn.predict(X_test)

In [None]:
y_pred_class

In [None]:
print(metrics.accuracy_score(y_test, y_pred_class))

In [None]:
print(metrics.precision_score(y_test, y_pred_class))

In [None]:
# recall, the ability to predict true positives  tp / tp + fn
metrics.recall_score(y_test,y_pred_class)
# metrics.recall_score

In [None]:
knn.predict_proba(X_test)[:,1]

In [None]:
# # #ATTEMPT AT MANUAL CROSS VALIDATION, COULD BE USEFULE FOR OTHER MODELS

# from sklearn.neighbors import KNeighborsClassifier

# def finding_best_kN_value(value=100):
#     list_of_percentages = []
#     for i in list(range(1, value)):
#         knn = KneighborsClassifier(n_neighbors=i)
#         X_train, X_test, y_train, y_test = train_test_split(X, y)
#         knn.fit(X_train, y_train)
        
#         y_pred_class = knn.predict(X_test)
        
#         if y_pred_class > max(list_of_percentages): 
#             list_of_percentages.append(metrics.accuracy_score(y_test, y_pred_class))

#     return list_of_percentages

# finding_best_kN_value()
    

In [None]:
scores = cross_val_score(knn, X, y, cv=100, scoring="accuracy")
scores

In [None]:
k = list(range(1, 101))
params = {'n_neighbors': k}
kf = cross_validation.KFold(len(attrition_df), n_folds=5)
gs = grid_search.GridSearchCV(
    estimator = neighbors.KNeighborsClassifier(),
    param_grid=params,
    cv=kf
)

#NEW VERSION
# gs = GridSearchCV(
#     estimator = neighbors.KNeighborsClassifier(),
#     param_grid=params,
#     cv=kf,
#     return_train_score=True
# )

gs.fit(X, y)
gs.grid_scores_


# lowest_std = []
# highest_mean = []
# for values in gs.grid_scores_:
# #     print(values[1])
# #     print(type(values[1]))
# #     print(lowest_std[0][1])
# #     print(lowest_std[0])
#     if len(highest_mean) == 0:
#         highest_mean.append(values)
#     else:# values[1] > lowest_std[0][1]:
#         highest_mean.pop()
#         highest_mean.append(values)
        
#     if len(lowest_std) == 0:
#         lowest_std.append(values)
#     else:# values[1] > lowest_std[0][1]:
#         lowest_std.pop()
#         lowest_std.append(values)

In [None]:
gs

In [None]:
gs.best_score_

In [None]:
gs.best_params_

In [None]:
gs.best_estimator_

In [None]:
# pd.DataFrame(gs.grid_scores_

In [None]:
# lowest_std

In [None]:
# highest_mean

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X, y)

In [None]:
print(linreg.intercept_)
print(linreg.coef_)

### Find Best Predictors


In [None]:
attrition_formatted_data

In [None]:

import itertools
from sklearn.model_selection import train_test_split
from sklearn import metrics

def train_test_rmse(feature_cols):
    X = attrition_df[feature_cols]
    y = attrition_df.AttritionBool
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    
    return np.sqrt(metrics.mean_squared_error(y_test, y_pred))


features = [col for col in attrition_df.columns if col in (['OverTimeBool', 'TotalWorkingYears', 'MonthlyIncome'])]
feature_data_list_holder= []


for feature in range(1, len(features)):
    for subset in itertools.combinations(features, feature):
        score = train_test_rmse(list(subset))
        feature_data_list_holder.append([subset, score])

feature_data_list_holder




### Compare to Null RootMeanSquareError

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

y_null = np.zeros_like(y_test, dtype=float)

y_null.fill(y_test.mean())
y_null

In [None]:
np.sqrt(metrics.mean_squared_error(y_test, y_null))

## Logistic Regression - ROC curves and AUC

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1e9)

logreg.fit(X_train, y_train)
y_pred_class = logreg.predict(X_test)

y_pred_class


### Evaluation

In [None]:
print('metrics.accuracy_score : ', metrics.accuracy_score(y_test, y_pred_class))
print('metrics.recall_score : ', metrics.recall_score(y_test,y_pred_class))
print('metrics.precision_score : ', metrics.precision_score(y_test, y_pred_class))
print('metrics.f1_score : ', metrics.f1_score(y_test, y_pred_class))
print('metrics.confusion_matrix : ', metrics.confusion_matrix(y_test, y_pred_class))

### PREDICT CONFUSION MATRIX RESULTS

In [None]:
# true negative tn, false positive fp, false negative fn, true positivie = tp
tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_class).ravel()

print('TN:{} | FP:{} | FN:{} | TP:{}'.format(tn, fp, fn, tp))

In [None]:
#classification report
print(metrics.classification_report(y_test, y_pred_class))

### ROC curves and AUC

In [None]:
y_pred_prob_for_roc_auc = logreg.predict_proba(X_test)[:, 1]

# y_pred_prob = logreg.predict_proba(X_test)[:, 1]
y_pred_prob_for_roc_auc

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = (8,6)
plt.rcParams['font.size'] = 14

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob_for_roc_auc)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate -> FP/(FP+TN)')
plt.ylabel('True Positivie Rate -> Recall')

In [None]:
print(metrics.roc_auc_score(y_test, y_pred_prob_for_roc_auc))

### Done with KNN for now

In [None]:

# histogram of predicted probabilities grouped by actual response value
df = pd.DataFrame({'probability':y_pred_prob_for_roc_auc, 'actual':y_test})
df.hist(column='probability', by='actual', sharex=True, sharey=True)

### CROSS VALIDATION

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean()

In [None]:
USE ITERMETHOS TO GO THROUGH ALL COMBINATIONS AND FIND THE BEST CROSS VAL SCORE


EXAMPLE:
# # add Fare to the model
# feature_cols = ['Pclass', 'Parch', 'Age', 'Sex_Female', 'Embarked_Q', 'Embarked_S', 'Fare']
# X = titanic[feature_cols]

# # recalculate AUC
# cross_val_score(logreg, X, y, cv=10, scoring='roc_auc').mean()

## Decision Tree -> Classification Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
treeClf = DecisionTreeClassifier(random_state=1)

DTscores = cross_val_score(treeClf, X, y, cv=14, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-DTscores))

In [None]:
DTscores

In [None]:
treeClfDep1 = DecisionTreeClassifier(max_depth=1, random_state=1)

DTscores = cross_val_score(treeClfDep1, X, y, cv=14, scoring='neg_mean_squared_error')
np.mean(np.sqrt(-DTscores))

In [None]:
max_depth_range = range(1,8)

# list to store values
RMSE_scores = []

for depth in max_depth_range:
    treeClf = DecisionTreeClassifier(max_depth=depth, random_state=1)
    MSE_scores = cross_val_score(treeClf, X, y, cv=14, scoring="neg_mean_squared_error")
    RMSE_scores.append(np.mean(np.sqrt(-MSE_scores)))

In [None]:
RMSE_scores

In [None]:
plt.plot(max_depth_range, RMSE_scores)
plt.xlabel('max_depth')
plt.ylabel('RMSE (lower is better)')

In [None]:
treeClf = DecisionTreeClassifier(max_depth=2, random_state=1)
treeClf.fit(X, y)

In [None]:
# pd.DataFrame({'feature': X.columns, 'importance': treereg.feature_importances_})
values = pd.DataFrame({'feature': X.columns, 'importance': treeClf.feature_importances_}).sort_values(by='importance', ascending=False)
values.head(3)

### Tree Diagram

In [None]:
from sklearn.externals.six import StringIO
import pydot
from sklearn.tree import export_graphviz


export_graphviz(treeClf, out_file='tree_attrition.dot', feature_names=X.columns)
!dot -Tpng tree_attrition.dot -o tree_attrition.png

In [None]:
from PIL import Image
Image.open('tree_attrition.png')

### Using The Classification Tree

In [None]:
# pd.DataFrame(attrition_df["AttritionBool"])
# pd.DataFrame(y)

In [None]:
X_test = X
y_test = y
y_pred = treeClf.predict(X_test)
y_pred

# X_test.OverTimeBool

In [None]:
np.sqrt(metrics.mean_squared_error(y_test, y_pred))
#i don't know if this is good or bad