In [39]:
# Import our dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [40]:
# total number of unique values.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [41]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]

In [42]:
# Create a list of at least 10 column names 
X_columns = ['Education','Age', 'DistanceFromHome', 'JobSatisfaction', 'OverTime', 'StockOptionLevel', 'WorkLifeBalance', 'YearsAtCompany', 'YearsSinceLastPromotion','NumCompaniesWorked']


# Create X_df using your selected columns
X_df = attrition_df[X_columns]

# Show the data types for X_df
X_df.dtypes

Education                   int64
Age                         int64
DistanceFromHome            int64
JobSatisfaction             int64
OverTime                   object
StockOptionLevel            int64
WorkLifeBalance             int64
YearsAtCompany              int64
YearsSinceLastPromotion     int64
NumCompaniesWorked          int64
dtype: object

In [43]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state = 42)

In [44]:
# Convert your X data to numeric data types 
# Add new code cells as necessary
X_train['OverTime'].value_counts()

OverTime
No     780
Yes    322
Name: count, dtype: int64

In [45]:
# Import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# Encode the OverTime column for X_train and X_test
overtime_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
overtime_encoder.fit(np.array(X_train['OverTime']).reshape(-1, 1))

#Create a new column called OverTimeEncoded and drop the original column
X_train['OverTimeEncoded'] = overtime_encoder.transform(np.array(X_train['OverTime']).reshape(-1, 1))
X_test['OverTimeEncoded'] = overtime_encoder.transform(np.array(X_test['OverTime']).reshape(-1, 1))
X_train = X_train.drop(columns='OverTime')
X_test = X_test.drop(columns='OverTime')

print(X_train.dtypes)
print(X_test.dtypes)

Education                    int64
Age                          int64
DistanceFromHome             int64
JobSatisfaction              int64
StockOptionLevel             int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsSinceLastPromotion      int64
NumCompaniesWorked           int64
OverTimeEncoded            float64
dtype: object
Education                    int64
Age                          int64
DistanceFromHome             int64
JobSatisfaction              int64
StockOptionLevel             int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsSinceLastPromotion      int64
NumCompaniesWorked           int64
OverTimeEncoded            float64
dtype: object


In [46]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [47]:
# Create a OneHotEncoder for the Department column
dept_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)

# Fit the encoder to the training data
dept_encoder.fit(np.array(y_train['Department'].values.reshape(-1, 1)))

# Create two new variables by applying the encoder
# to the training and testing data
y_dept_train = dept_encoder.transform(np.array(y_train['Department'].values.reshape(-1, 1)))
y_dept_test = dept_encoder.transform(np.array(y_test['Department'].values.reshape(-1, 1)))
y_dept_train

array([[0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [48]:
# Create a OneHotEncoder for the Attrition column
atr_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)

# Fit the encoder to the training data
atr_encoder.fit(np.array(y_train['Attrition'].values.reshape(-1, 1)))


# Create two new variables by applying the encoder
# to the training and testing data
y_atr_train = atr_encoder.transform(np.array(y_train['Attrition'].values.reshape(-1, 1)))
y_atr_test = atr_encoder.transform(np.array(y_test['Attrition'].values.reshape(-1, 1)))
y_atr_train

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [49]:
# Find the number of columns in the X training data.
number_input_features = len(X_train.columns)
# Create the input layer
input_layer = layers.Input(shape=(number_input_features,))
# Create at least two shared layers
shared1 = layers.Dense(64, activation='relu', name = "shared1")(input_layer)
shared2 = layers.Dense(128, activation='relu', name = "shared2")(shared1)

In [50]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_hidden = layers.Dense(32, activation='relu', name = 'department_hidden')(shared2)

# Create the output layer
department_output = layers.Dense(y_dept_train.shape[1], activation='softmax', name = 'department_output')(department_hidden)

In [51]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden = layers.Dense(32, activation='relu', name = 'attrition_hidden')(shared2)


# Create the output layer
attrition_output = layers.Dense(y_atr_train.shape[1], activation='sigmoid', name = 'attrition_output')(department_hidden)

In [52]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
                loss={'department_output': 'categorical_crossentropy',
                        'attrition_output': 'categorical_crossentropy'},
                metrics={
                    'department_output': 'accuracy',
                    'attrition_output': 'accuracy'
                })

# Summarize the model
model.summary()

In [53]:
# Train the model
fit_model = model.fit(X_train_scaled, {'department_output': y_dept_train, 'attrition_output': y_atr_train}, epochs=100, batch_size=32)

Epoch 1/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 847us/step - attrition_output_accuracy: 0.7478 - department_output_accuracy: 0.6471 - loss: 1.4559
Epoch 2/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 736us/step - attrition_output_accuracy: 0.8288 - department_output_accuracy: 0.6508 - loss: 1.2143
Epoch 3/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - attrition_output_accuracy: 0.8268 - department_output_accuracy: 0.6555 - loss: 1.1539 
Epoch 4/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 756us/step - attrition_output_accuracy: 0.8442 - department_output_accuracy: 0.6380 - loss: 1.1443
Epoch 5/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 754us/step - attrition_output_accuracy: 0.8531 - department_output_accuracy: 0.6432 - loss: 1.1186
Epoch 6/100
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 729us/step - attrition_output_accuracy: 0.8712 - d

In [54]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test_scaled, {'department_output': y_dept_test, 'attrition_output': y_atr_test})

test_results

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 915us/step - attrition_output_accuracy: 0.8393 - department_output_accuracy: 0.4935 - loss: 4.2582


[3.656771183013916, 0.85326087474823, 0.5135869383811951]

In [55]:
# Print the accuracy for both department and attrition
print(f"Department predictions accuracy: {test_results[1]}")
print(f"Attrition predictions accuracy: {test_results[2]}")

Department predictions accuracy: 0.85326087474823
Attrition predictions accuracy: 0.5135869383811951
