## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Unnamed: 0,0
Age,43
Attrition,2
BusinessTravel,3
Department,3
DistanceFromHome,29
Education,5
EducationField,6
EnvironmentSatisfaction,4
HourlyRate,71
JobInvolvement,4


In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df[['Attrition', 'Department']]
y_df.head()


Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [18]:
# Create a list of at least 10 column names to use as X data

#XData = attrition_df.drop(columns=['Attrition', 'Department'])

X_data2 = ['Education',
            'HourlyRate',
            'WorkLifeBalance',
            'YearsSinceLastPromotion',
            'YearsWithCurrManager',
            'EnvironmentSatisfaction',
            'Age',
            'JobInvolvement',
            'RelationshipSatisfaction',
            'YearsInCurrentRole',
            'OverTime']

# Create X_df using your selected columns
X_df = attrition_df[X_data2]

# Show the data types for X_df
X_df.dtypes


Unnamed: 0,0
Education,int64
HourlyRate,int64
WorkLifeBalance,int64
YearsSinceLastPromotion,int64
YearsWithCurrManager,int64
EnvironmentSatisfaction,int64
Age,int64
JobInvolvement,int64
RelationshipSatisfaction,int64
YearsInCurrentRole,int64


In [19]:
#Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df)

In [20]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
# EXAMPLE CODE: Students may choose different columns
X_train['OverTime'].value_counts()

Unnamed: 0_level_0,count
OverTime,Unnamed: 1_level_1
No,780
Yes,322


In [21]:
from sklearn.preprocessing import OneHotEncoder
overtime_encoder = OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')
overtime_encoder.fit(np.array(X_train['OverTime']).reshape(-1, 1))
X_train['OverTimeEncoded'] = overtime_encoder.transform(np.array(X_train['OverTime']).reshape(-1, 1))
X_test['OverTimeEncoded'] = overtime_encoder.transform(np.array(X_test['OverTime']).reshape(-1, 1))

X_train = X_train.drop(columns='OverTime')
X_test = X_test.drop(columns='OverTime')

print(X_train.dtypes)
print(X_test.dtypes)


Education                     int64
HourlyRate                    int64
WorkLifeBalance               int64
YearsSinceLastPromotion       int64
YearsWithCurrManager          int64
EnvironmentSatisfaction       int64
Age                           int64
JobInvolvement                int64
RelationshipSatisfaction      int64
YearsInCurrentRole            int64
OverTimeEncoded             float64
dtype: object
Education                     int64
HourlyRate                    int64
WorkLifeBalance               int64
YearsSinceLastPromotion       int64
YearsWithCurrManager          int64
EnvironmentSatisfaction       int64
Age                           int64
JobInvolvement                int64
RelationshipSatisfaction      int64
YearsInCurrentRole            int64
OverTimeEncoded             float64
dtype: object


In [22]:
# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_scaler = scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [23]:
# Create a OneHotEncoder for the Department column
department_encoder = OneHotEncoder(sparse_output=False, handle_unknown = 'ignore', drop = None)

# Fit the encoder to the training data
ohe_train1 = department_encoder.fit(np.array(y_train['Department']).reshape(-1,1))
# Create two new variables by applying the encoder
# to the training and testing data
y_department_train = department_encoder.transform(np.array(y_train['Department']).reshape(-1,1))
y_department_test = department_encoder.transform(np.array(y_test['Department']).reshape(-1,1))



In [24]:
# Create a OneHotEncoder for the Attrition column
attrition_encoder = OneHotEncoder(sparse=False, handle_unknown = 'ignore', drop = None)

# Fit the encoder to the training data
ohe_train2 = attrition_encoder.fit(np.array(y_train['Attrition']).reshape(-1,1))


# Create two new variables by applying the encoder
# to the training and testing data
y_attrition_train = attrition_encoder.transform(np.array(y_train['Attrition']).reshape(-1,1))
y_attrition_test = attrition_encoder.transform(np.array(y_test['Attrition']).reshape(-1,1))




## Create, Compile, and Train the Model

In [25]:
# Find the number of columns in the X training data
input_columns = X_train.shape[1]  # Get the number of columns
# Create the input layer
input_layer = layers.Input(shape=(input_columns,), name = 'input') # Pass shape as a tuple

# Create at least two shared layers
shared1 = layers.Dense(22, activation = 'relu', name = 'shared1')(input_layer)
shared2 = layers.Dense(11, activation = 'relu', name = 'shared2')(shared1)

In [26]:
print(X_train.shape[1])

11


In [27]:
X_train.shape

(1102, 11)

In [28]:
# Create a branch for Department
# with a hidden layer and an output layer
# Create the hidden layer
department_hidden = layers.Dense(32, activation = 'relu', name = 'department_hidden')(shared2)


# Create the output layer
department_output = layers.Dense(2, activation = 'softmax', name = 'department_output')(department_hidden)



In [29]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden = layers.Dense(32, activation = 'relu', name = 'attrition_hidden')(shared2)

# Create the output layer
attrition_output = layers.Dense(2, activation = 'softmax', name = 'attrition_output')(attrition_hidden)



In [30]:
# Check the shapes of your data and labels
print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Shape of y_department_train:", y_department_train.shape)
print("Shape of y_attrition_train:", y_attrition_train.shape)

# If the shapes don't match, investigate why there's a difference in the number of samples.
# You might need to revisit how you preprocessed your data or split it into training and validation sets.

Shape of X_train_scaled: (1102, 11)
Shape of y_department_train: (1102, 3)
Shape of y_attrition_train: (1102, 2)


In [31]:
X_train_scaled

array([[-1.86556985, -1.7234841 ,  0.35125833, ..., -0.65250778,
        -1.16511598, -0.6425111 ],
       [ 1.03545269,  0.97275448,  0.35125833, ..., -1.58757093,
         2.38701246, -0.6425111 ],
       [ 0.06844518,  1.65906976,  0.35125833, ...,  1.21761854,
         1.56729051, -0.6425111 ],
       ...,
       [ 1.03545269, -0.69401119,  1.80102115, ..., -0.65250778,
        -1.16511598,  1.55639335],
       [ 2.00246021, -1.67446158,  0.35125833, ...,  0.28255538,
        -1.16511598, -0.6425111 ],
       [-0.89856234, -0.79205623,  0.35125833, ..., -1.58757093,
        -0.34539403,  1.55639335]])

In [32]:
# Create the output layer for departments with the correct number of classes
department_output = layers.Dense(3, activation = 'softmax', name = 'department_output')(department_hidden)

# Create the model
model = Model(inputs = input_layer, outputs = [department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={
                  'department_output': 'categorical_crossentropy',
                  'attrition_output': 'binary_crossentropy'
              },
              metrics={
                  'department_output': 'accuracy',
                  'attrition_output': 'accuracy'
              })

In [33]:
# Train the model
New_model = model.fit(X_train_scaled,{'department_output': y_department_train,'attrition_output': y_attrition_train},epochs=10,
    batch_size=32,
    validation_split=0.2)

Epoch 1/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - attrition_output_accuracy: 0.8054 - department_output_accuracy: 0.1664 - loss: 1.7900 - val_attrition_output_accuracy: 0.8597 - val_department_output_accuracy: 0.4163 - val_loss: 1.6195
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - attrition_output_accuracy: 0.8318 - department_output_accuracy: 0.4725 - loss: 1.5783 - val_attrition_output_accuracy: 0.8597 - val_department_output_accuracy: 0.6471 - val_loss: 1.4179
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - attrition_output_accuracy: 0.8424 - department_output_accuracy: 0.6394 - loss: 1.3850 - val_attrition_output_accuracy: 0.8597 - val_department_output_accuracy: 0.6561 - val_loss: 1.2647
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - attrition_output_accuracy: 0.8292 - department_output_accuracy: 0.6621 - loss: 1.2584 - val_attri

In [34]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test, {'department_output': y_department_test, 'attrition_output': y_attrition_test})
test_results

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - attrition_output_accuracy: 0.8730 - department_output_accuracy: 0.6807 - loss: 6.5592 


[7.140815258026123, 0.8478260636329651, 0.6929348111152649]

In [35]:
# Print the accuracy for both department and attrition
print(f"Departmenty Accuracy: {test_results[0]}")
print(f"Attrition Accuracy: {test_results[1]}")

Departmenty Accuracy: 7.140815258026123
Attrition Accuracy: 0.8478260636329651


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

Accuracy is not the best metric to use for this data. Because the factors are not related enough to provide useful results.

2. What activation functions did you choose for your output layers, and why?

I use softmax on my output layers because given the different number of features I thought it would be important to emphasize normalization.

3. Can you name a few ways that this model might be improved?

The first way this model could be improved would be with significantly more data. So that the probabilites could be more accurate.
The second way that it could be improved would be with a more processing power to run a more robust and complext network.

YOUR ANSWERS HERE

1.
2.
3.