In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os

In [2]:
## Specify file name and directory for the dataset. ##
directory = './data/';
file = 'attrition.xlsx';

In [3]:
## Read file from the directory and transform into pandas dataframe ##
def read_data(file, directory):
    filename = directory + file;
    df = pd.read_excel(filename);
    
    return df;

In [4]:
## Read in the excel file into a pandas dataframe. ##
df = read_data(file, directory);

In [5]:
## Print dataframe. ##
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [6]:
## Instantiate new column names for each header. ##
new_columns = {
    "Age": "age",
    "Attrition": "attrition",
    "BusinessTravel": "travel",
    "DailyRate": "daily_rate",
    "Department": "department",
    "DistanceFromHome": "distance_home",
    "Education": "education",
    "EducationField": "education_field",
    "EmployeeCount": "emp_count",
    "EmployeeNumber": "emp_number",
    "EnvironmentSatisfaction": "env_satisfaction",
    "Gender": "gender",
    "HourlyRate": "hour_rate",
    "JobInvolvement": "job_involvement",
    "JobLevel": "job_level",
    "JobRole": "job_role",
    "JobSatisfaction": "job_satisfaction",
    "MaritalStatus": "marital_status",
    "MonthlyIncome": "monthly_income",
    "MonthlyRate": "month_rate",
    "NumCompaniesWorked": "companied_worked_number",
    "Over18": "over_18",
    "OverTime": "overtime",
    "PercentSalaryHike": "salary_hike",
    "PerformanceRating": "performance_rating",
    "RelationshipSatisfaction": "relationship_satisfaction",
    "StandardHours": "hours",
    "StockOptionLevel": "stock_option",
    "TotalWorkingYears": "total_working_yrs",
    "TrainingTimesLastYear": "training_times",
    "WorkLifeBalance": "wl_balance",
    "YearsAtCompany": "years_at_company",
    "YearsInCurrentRole": "years_current_role",
    "YearsSinceLastPromotion": "years_last_promotion",
    "YearsWithCurrManager": "years_current_manager",
};

In [7]:
## Rename dataframe column names to the better computable forms. ##
def column_renames(df, columns):
    new_df = df.rename(columns=columns);
    
    return new_df;

In [8]:
## Rename column names to the newly instantiated names. ##
df = column_renames(df, new_columns);

In [9]:
## Change String values into Integer ##
def str_to_int(df, column_name, true_value):
    for row, col in df.iterrows():
        df.at[row, column_name] = 1 if df.at[row, column_name] == true_value else 0;
        
    return df;

In [10]:
## Set Boolean dtype for columns with boolean values. Applying columns: Attrition, Over18, OverTime ##
df = str_to_int(df, 'attrition', "Yes");
df = str_to_int(df, 'overtime', "Yes");
df = str_to_int(df, 'over_18', "Y");

In [11]:
df

Unnamed: 0,age,attrition,travel,daily_rate,department,distance_home,education,education_field,emp_count,emp_number,...,relationship_satisfaction,hours,stock_option,total_working_yrs,training_times,wl_balance,years_at_company,years_current_role,years_last_promotion,years_current_manager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,0,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,0,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,0,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [12]:
## Remove non-necessary columns. Applying columns: Employee Count, Employee Number ##
df = df.drop(['emp_count', 'emp_number'], axis = 1);

In [13]:
df

Unnamed: 0,age,attrition,travel,daily_rate,department,distance_home,education,education_field,env_satisfaction,gender,...,relationship_satisfaction,hours,stock_option,total_working_yrs,training_times,wl_balance,years_at_company,years_current_role,years_last_promotion,years_current_manager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,3,80,1,17,3,3,5,2,0,3
1466,39,0,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,1,80,1,9,5,3,7,7,1,7
1467,27,0,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,2,80,1,6,0,3,6,2,0,3
1468,49,0,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,4,80,0,17,3,2,9,6,0,8


In [14]:
## Change String values into integer for better computation. Applying columns: BusinessTravel, Department,
## EducationField, Gender, JobRole, MaritalStatus

## - Find value instances, assign corresponding numbers.

##============= BusinessTravel =============##
travel_resp = list(set(df['travel'])); ## Non-Travel -> 0, Travel_Rarely -> 1, Travel_Frequently -> 2.

##============= Department =============##
department_resp = list(set(df['department'])); ## Sales -> 0, Research & Development -> 1, Human Resources -> 2.

##============= EducationField =============##
education_resp = list(set(df['education_field'])); ## Life Science -> 0, Medical -> 1, Technical Degree -> 2, Marketing -> 3, Human Resources -> 4, Other -> 5.

##============= Gender =============##
gender_resp = list(set(df['gender'])); ## Female -> 0, Male -> 1.

##============= JobRole =============##
job_resp = list(set(df['job_role'])); ## Laboratory Technician -> 0, Healthcare Representative -> 1, Manager -> 2, Sales Representative -> 3, Research Director -> 4, Manufacturing Director -> 5, Human Resources -> 6, Research Scientist -> 7, Sales Executive -> 8.

##============= MaritalStatus =============##
marital_resp = list(set(df['marital_status'])); ## Single -> 0, Married -> 1, Divorced -> 2.


## - Transform string values into integer values as assigned from the above.
for row, col in df.iterrows():
    df.at[row, 'travel'] = 0 if df.at[row, 'travel'] == "Non-Travel" else 1 if df.at[row, 'travel'] == "Travel_Rarely" else 2;
    df.at[row, 'department'] = 0 if df.at[row, 'department'] == "Sales" else 1 if df.at[row, 'department'] == "Research & Development" else 2;
    df.at[row, 'education_field'] = 0 if df.at[row, 'education_field'] == "Life Science" else 1 if df.at[row, 'education_field'] == "Medical" else 2 if df.at[row, 'education_field'] == "Technical Degree" else 3 if df.at[row, 'education_field'] == "Marketing" else 4 if df.at[row, 'education_field'] == "Human Resources" else 5;
    df.at[row, 'gender'] = 0 if df.at[row, 'gender'] == "Female" else 1;
    df.at[row, 'job_role'] = 0 if df.at[row, 'job_role'] == "Laboratory Technician" else 1 if df.at[row, 'job_role'] == "Healthcare Representative" else 2 if df.at[row, 'job_role'] == "Manager" else 3 if df.at[row, 'job_role'] == "Sales Representative" else 4 if df.at[row, 'job_role'] == "Research Director" else 5 if df.at[row, 'job_role'] == "Manufacturing Director" else 6 if df.at[row, 'job_role'] == "Human Resources" else 7 if df.at[row, 'job_role'] == "Research Scientist" else 8;
    df.at[row, 'marital_status'] = 0 if df.at[row, 'marital_status'] == "Single" else 1 if df.at[row, 'marital_status'] == "Married" else 2;

In [15]:
df

Unnamed: 0,age,attrition,travel,daily_rate,department,distance_home,education,education_field,env_satisfaction,gender,...,relationship_satisfaction,hours,stock_option,total_working_yrs,training_times,wl_balance,years_at_company,years_current_role,years_last_promotion,years_current_manager
0,41,1,1,1102,0,1,2,5,2,0,...,1,80,0,8,0,1,6,4,0,5
1,49,0,2,279,1,8,1,5,3,1,...,4,80,1,10,3,3,10,7,1,7
2,37,1,1,1373,1,2,2,5,4,1,...,2,80,0,7,3,3,0,0,0,0
3,33,0,2,1392,1,3,4,5,4,0,...,3,80,0,8,3,3,8,7,3,0
4,27,0,1,591,1,2,1,1,1,1,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,2,884,1,23,2,1,3,1,...,3,80,1,17,3,3,5,2,0,3
1466,39,0,1,613,1,6,1,1,4,1,...,1,80,1,9,5,3,7,7,1,7
1467,27,0,1,155,1,4,3,5,2,1,...,2,80,1,6,0,3,6,2,0,3
1468,49,0,2,1023,0,2,3,1,4,1,...,4,80,0,17,3,2,9,6,0,8


In [16]:
## Dimension of the dataframe. Row -> 1470, Column -> 33. ##
row_count = len(df);
column_count = len(df.columns);

In [17]:
index_resp = list(df.index);
attrition_resp = list(df['attrition']);

In [18]:
## Set up a global model configuration parameter ##
TRAIN_SPLIT = 0.8;

In [19]:
## Create a numpy array for each dependent and independent variable in association to the attrition outcome. ##
month_age = pd.DataFrame({"attrition":df['attrition'], "age":df['age']}).to_numpy();

In [20]:
## Compute the size of the training and testing datasets. ##
training_size = int(TRAIN_SPLIT * row_count);

In [21]:
## Run shuffling and random seed to ensure a reproducibility and split the data into training and validating dataset. ##
np.random.shuffle(month_age);
train_data, test_data = month_age[:training_size], month_age[training_size:];

In [22]:
## Create a model to compute Machine Learning ##
def create_model():
    model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(10, activation='relu', input_shape=(1,1)),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]);
    
    return model;    

In [23]:
model = create_model();
model.summary();

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1, 10)             20        
                                                                 
 dense_1 (Dense)             (None, 1, 1)              11        
                                                                 
Total params: 31 (124.00 Byte)
Trainable params: 31 (124.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [24]:
model.compile(loss='sparse_categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy']);

In [25]:
history = model.fit(train_data,
                   validation_data=test_data,
                   epochs=50,
                   verbose=1);

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()