## Part 1: Preprocessing

In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
# from tensorflow.keras.models import Model
# from tensorflow.keras import layers



In [3]:
#  Import and read the attrition data
df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [4]:
# Determine the number of unique values in each column.
df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [5]:
df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EnvironmentSatisfaction      int64
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
NumCompaniesWorked           int64
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StockOptionLevel             int64
TotalWorkingYears            int64
TrainingTimesLastYear        int64
WorkLifeBalance              int64
YearsAtCompany               int64
YearsInCurrentRole           int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

In [6]:
# Create y_df with the Attrition and Department columns
y_df = df[['Attrition', 'Department']]
y_df.head()

Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [7]:
# Create a list of at least 10 column names to use as X data
features_list = ['Education', 'Age', 'DistanceFromHome', 'NumCompaniesWorked', 'PerformanceRating', 'StockOptionLevel', 'WorkLifeBalance', 'YearsAtCompany', 'YearsSinceLastPromotion', 'NumCompaniesWorked']


# Create X_df using your selected columns
X_df = df[features_list]

# Show the data types for X_df
X_df.dtypes

Education                  int64
Age                        int64
DistanceFromHome           int64
NumCompaniesWorked         int64
PerformanceRating          int64
StockOptionLevel           int64
WorkLifeBalance            int64
YearsAtCompany             int64
YearsSinceLastPromotion    int64
NumCompaniesWorked         int64
dtype: object

In [8]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=1)

In [9]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary

# already chose all numerics

In [10]:
# X_train['OverTime'] = X_train['OverTime'].map({'yes': 1, 'no': 0})
# X_train['OverTime'].dtypes

In [11]:
# Create a StandardScaler
X_scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns = X_train.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns = X_test.columns)

In [12]:
def EncodeTrain(feature_name: str):
    # Create a OneHotEncoder for the column
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    
    # Fit the encoder to the training data
    encoded = one_hot_encoder.fit(y_train[[feature_name]])
    
    # Create two new variables by applying the encoder
    # to the training and testing data
    encoded_train = one_hot_encoder.transform(y_train[[feature_name]])
    
    columns = one_hot_encoder.get_feature_names_out([feature_name])
    
    encoded_train_df = pd.DataFrame(encoded_train, columns=columns)

    globals()[f"{feature_name}_encoded_train_df"] = encoded_train_df
    

def EncodeTest(feature_name: str):
    # Create a OneHotEncoder for the column
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    
    # Fit the encoder to the training data
    encoded = one_hot_encoder.fit(y_train[[feature_name]])
    
    # Create two new variables by applying the encoder
    # to the training and testing data
    encoded_test = one_hot_encoder.transform(y_test[[feature_name]])
    
    columns = one_hot_encoder.get_feature_names_out([feature_name])
    
    encoded_test_df = pd.DataFrame(encoded_test, columns=columns)

    globals()[f"{feature_name}_encoded_test_df"] = encoded_test_df


In [13]:
# Create a OneHotEncoder for the Department column

EncodeTrain('Department')
EncodeTest('Department')

In [14]:
# Create a OneHotEncoder for the Attrition column

EncodeTrain('Attrition')
EncodeTest('Attrition')

In [15]:
# renaming for later use in PyTorch NN
Y_train_department = Department_encoded_train_df
Y_train_attrition = Attrition_encoded_train_df
Y_test_department = Department_encoded_test_df
Y_test_attrition = Attrition_encoded_test_df

## Create, Compile, and Train the Model

In [17]:
!pip install torch



In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F 

In [19]:
# Find the number of columns in the X training data
num_features = X_train_scaled.shape[1]
print(f"Number of features: {num_features}")

# Create the input layer and shared layers

'''
I took it on myself to use PyTorch since I know it is the most widely used framework by far for deep learning and neural nets

I have no idea if graders are familiar with it, but it should be relatively easy to understand if you understand deep learning or TensorFlow

A class is created that inherits from torch.nn.Module, where the constructor defines layers
Forward method defines activation functions/how data is passed between layers

Of course feedback will be given, would love substantive feedback on the actual qualities of the NN as that is where I am least confident 
for example when to use activation functions etc, my understanding feels rudimentary
'''

class AttritionModel(nn.Module):
    def __init__(self, num_features, num_dept_classes, num_attr_classes): # use constructor in PyTorch to define layers
        super(AttritionModel, self).__init__()
        self.input_layer = nn.Linear(num_features, 64)   # input layer
        self.shared_layer1 = nn.Linear(64, 128)          # first shared layer
        self.shared_layer2 = nn.Linear(128, 256)         # second shared layer
        self.shared_layer3 = nn.Linear(256, 128)         # third shared layer

        # Department branch
        self.dept_hidden = nn.Linear(128, 64)
        self.dept_output = nn.Linear(64, num_dept_classes)

        # Attrition branch
        self.attr_hidden = nn.Linear(128, 64)
        self.attr_output = nn.Linear(64, num_attr_classes)

    def forward(self, x): # PyTorch method for activation functions, separate method within class and activation functions correspond to layers
        # Input layer
        x = torch.relu(self.input_layer(x))
        # Shared layers
        x = torch.relu(self.shared_layer1(x))
        x = torch.relu(self.shared_layer2(x))
        x = torch.relu(self.shared_layer3(x))

        # Department branch
        dept = torch.relu(self.dept_hidden(x))
        dept_output = self.dept_output(dept)

        # Attrition branch
        attr = torch.relu(self.attr_hidden(x))
        attr_output = self.attr_output(attr)

        return dept_output, attr_output

# number of classes for Department and Attrition
num_dept_classes = Y_train_department.shape[1]
num_attr_classes = Y_train_attrition.shape[1]






Number of features: 10


In [20]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer

# Create the output layer

# see # department branch and # attrition branch in the class above for branch layers

In [21]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer


# Create the output layer



In [22]:
# Create the model
model = AttritionModel(num_features, num_dept_classes, num_attr_classes)

# Compile the model
# optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# loss functions
criterion_dept = nn.CrossEntropyLoss()
criterion_attr = nn.CrossEntropyLoss()

# Summarize the model
print(model)




AttritionModel(
  (input_layer): Linear(in_features=10, out_features=64, bias=True)
  (shared_layer1): Linear(in_features=64, out_features=128, bias=True)
  (shared_layer2): Linear(in_features=128, out_features=256, bias=True)
  (shared_layer3): Linear(in_features=256, out_features=128, bias=True)
  (dept_hidden): Linear(in_features=128, out_features=64, bias=True)
  (dept_output): Linear(in_features=64, out_features=3, bias=True)
  (attr_hidden): Linear(in_features=128, out_features=64, bias=True)
  (attr_output): Linear(in_features=64, out_features=2, bias=True)
)


In [23]:
# Train the model
# convert Y_train to class indices if they are one-hot encoded
import numpy as np

Y_train_department_indices = np.argmax(Y_train_department, axis=1)
Y_train_attrition_indices = np.argmax(Y_train_attrition, axis=1)

# convert training data to tensors, must take PyTorch tensors for PyTorch
inputs = torch.tensor(X_train_scaled, dtype=torch.float32)
dept_labels = torch.tensor(Y_train_department_indices, dtype=torch.long)
attr_labels = torch.tensor(Y_train_attrition_indices, dtype=torch.long)

# training loop
num_epochs = 400

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # forward pass
    dept_outputs, attr_outputs = model(inputs)

    # compute losses
    loss_dept = criterion_dept(dept_outputs, dept_labels)
    loss_attr = criterion_attr(attr_outputs, attr_labels)
    loss = loss_dept + loss_attr

    # backward pass and optimization
    loss.backward()
    optimizer.step()

    # print loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [10/400], Loss: 1.4872
Epoch [20/400], Loss: 1.2376
Epoch [30/400], Loss: 1.2121
Epoch [40/400], Loss: 1.1941
Epoch [50/400], Loss: 1.1808
Epoch [60/400], Loss: 1.1628
Epoch [70/400], Loss: 1.1397
Epoch [80/400], Loss: 1.1101
Epoch [90/400], Loss: 1.0751
Epoch [100/400], Loss: 1.0320
Epoch [110/400], Loss: 0.9760
Epoch [120/400], Loss: 0.9012
Epoch [130/400], Loss: 0.8249
Epoch [140/400], Loss: 0.7461
Epoch [150/400], Loss: 0.6790
Epoch [160/400], Loss: 0.6076
Epoch [170/400], Loss: 0.5528
Epoch [180/400], Loss: 0.4818
Epoch [190/400], Loss: 0.4484
Epoch [200/400], Loss: 0.3850
Epoch [210/400], Loss: 0.3659
Epoch [220/400], Loss: 0.3014
Epoch [230/400], Loss: 0.2655
Epoch [240/400], Loss: 0.2323
Epoch [250/400], Loss: 0.2164
Epoch [260/400], Loss: 0.1848
Epoch [270/400], Loss: 0.1586
Epoch [280/400], Loss: 0.1350
Epoch [290/400], Loss: 0.1231
Epoch [300/400], Loss: 0.1155
Epoch [310/400], Loss: 0.0826
Epoch [320/400], Loss: 0.0714
Epoch [330/400], Loss: 0.0572
Epoch [340/400], Lo

In [24]:
# Evaluate the model with the testing data

'''
This part was definitely more work than TensorFlow 

PyTorch does not have a built in evaluate method with the same level of functionality, however offers more customizability in this space 
'''


# Convert Y_test to class indices if they are one-hot encoded
Y_test_department_indices = np.argmax(Y_test_department, axis=1)
Y_test_attrition_indices = np.argmax(Y_test_attrition, axis=1)

# Convert testing data to tensors
inputs = torch.tensor(X_test_scaled, dtype=torch.float32)
dept_labels = torch.tensor(Y_test_department_indices, dtype=torch.long)
attr_labels = torch.tensor(Y_test_attrition_indices, dtype=torch.long)

# Evaluation
model.eval()
with torch.no_grad():
    dept_outputs, attr_outputs = model(inputs)

    # Compute accuracies
    _, dept_predicted = torch.max(dept_outputs, 1)
    _, attr_predicted = torch.max(attr_outputs, 1)

    total = dept_labels.size(0)

    correct_dept = (dept_predicted == dept_labels).sum().item()
    correct_attr = (attr_predicted == attr_labels).sum().item()

    accuracy_dept = 100 * correct_dept / total
    accuracy_attr = 100 * correct_attr / total

    print(f"Test Accuracy for Department: {accuracy_dept:.2f}%")
    print(f"Test Accuracy for Attrition: {accuracy_attr:.2f}%")

Test Accuracy for Department: 48.91%
Test Accuracy for Attrition: 74.73%


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. A classification report would be better than accuracy, considering this is a classification problem. It would also be good to get a confusion matrix too to see the breakdown of false/true positives/negatitives.
2. Softmax - which is implemented via Cross Entropy Loss. CEL implements softmax and negative log likelihood, softmax accounts for relationships between probabilities of outcomes and is non linear, NLL penalizes low probabilities for better results.
3. Having more data is always a plus for Neural Nets/deep learning. It also would be a good idea to include more features for the X data, considering we have way more than just 10. It is likely that there are more viable features than just the 10 I chose.