In [218]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

from keras.layers import Dropout

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split


In [219]:
def preprocess_ohe(df, ohe_column_list):
    # Create a copy of the DataFrame to avoid modifying the original DataFrame
    df_copy = df.copy()
    
    # List to store DataFrames of encoded columns
    encoded_dfs = []

    # Iterate over each column specified for one-hot encoding
    for column in ohe_column_list:
        # Initialize the OneHotEncoder
        ohe_encoder = OneHotEncoder(handle_unknown='error', sparse_output=False) # drop='first'
        
        # Fit and transform the data of the column
        ohe_encoded = ohe_encoder.fit_transform(df_copy[[column]])
        
        # Create a DataFrame from the encoded data with appropriate column names
        ohe_encoded_df = pd.DataFrame(ohe_encoded, columns=ohe_encoder.get_feature_names_out(input_features=[column]))
        
        # Append the newly encoded DataFrame to the list
        encoded_dfs.append(ohe_encoded_df)
        
        # Drop the original column from the copy of the DataFrame
        df_copy.drop(column, axis=1, inplace=True)
    
    # Concatenate all the encoded DataFrames with the modified original DataFrame
    df_copy = pd.concat([df_copy] + encoded_dfs, axis=1)
    
    return df_copy

In [220]:
def preprocess_yn(df, yn_columns):
    # Create a copy of the DataFrame to avoid modifying the original DataFrame
    df_copy = df.copy()
    
    # Replace 'Yes' with 1 and 'No' with 0 in specified columns
    replace_values = {'Yes': 1, 'No': 0}
    df_copy[yn_columns] = df_copy[yn_columns].replace(replace_values)
    
    return df_copy

In [221]:
def test_train_split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    return X_train, X_test, y_train, y_test

In [222]:
def is_boolean_like(column):
    # Normalize the data to lowercase and check if the set of unique values is {'yes', 'no'}
    return set(column.str.lower().unique()) == {'yes', 'no'}

## Part 1: Preprocessing

In [223]:
#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [224]:
# Determine the number of unique values in each column. + some extra stuff
print("Columns with NaN values:", attrition_df.columns[attrition_df.isna().any()].tolist())


Columns with NaN values: []


In [225]:

print ()
print (f'Number of Unique Values:\n{attrition_df.nunique()}\n')
print (f'Object Types\n{attrition_df.dtypes}\n')
print(f'Rows: {attrition_df.shape[0]}\nColumns: {attrition_df.shape[1]}')


Number of Unique Values:
Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

Object Types
Age                          int64
Attrition                   object
BusinessTravel              object
Depar

In [226]:
# Convert your data to numeric data types however you see fit
# Add new code cells as necessary
object_df = attrition_df.select_dtypes(include=['object'])
# Filter out columns where the unique values are only 'Yes' and 'No'
non_boolean_object_col = object_df.loc[:, ~object_df.apply(is_boolean_like, axis=0)].columns.tolist()
boolean_object_col = object_df.loc[:, object_df.apply(is_boolean_like, axis=0)].columns.tolist()

# Loop through each column in object_df and print the unique values
for column in non_boolean_object_col:
    print(f"Unique values in '{column}': {object_df[column].unique()}")
for column in boolean_object_col:
    print(f"Unique values in '{column}': {object_df[column].unique()}")

Unique values in 'BusinessTravel': ['Travel_Rarely' 'Travel_Frequently' 'Non-Travel']
Unique values in 'Department': ['Sales' 'Research & Development' 'Human Resources']
Unique values in 'EducationField': ['Life Sciences' 'Other' 'Medical' 'Marketing' 'Technical Degree'
 'Human Resources']
Unique values in 'JobRole': ['Sales Executive' 'Research Scientist' 'Laboratory Technician'
 'Manufacturing Director' 'Healthcare Representative' 'Manager'
 'Sales Representative' 'Research Director' 'Human Resources']
Unique values in 'MaritalStatus': ['Single' 'Married' 'Divorced']
Unique values in 'Attrition': ['Yes' 'No']
Unique values in 'OverTime': ['Yes' 'No']


In [227]:
# Encode Features

attrition_encoded_df1 = preprocess_ohe(attrition_df,non_boolean_object_col)
attrition_encoded_df2  = preprocess_yn(attrition_encoded_df1,boolean_object_col)

print (attrition_encoded_df2)
print (attrition_encoded_df2.dtypes)

      Age  Attrition  DistanceFromHome  Education  EnvironmentSatisfaction  \
0      41          1                 1          2                        2   
1      49          0                 8          1                        3   
2      37          1                 2          2                        4   
3      33          0                 3          4                        4   
4      27          0                 2          1                        1   
...   ...        ...               ...        ...                      ...   
1465   36          0                23          2                        3   
1466   39          0                 6          1                        4   
1467   27          0                 4          3                        2   
1468   49          0                 2          3                        4   
1469   34          0                 8          3                        2   

      HourlyRate  JobInvolvement  JobLevel  JobSatisfaction  \


  df_copy[yn_columns] = df_copy[yn_columns].replace(replace_values)


In [228]:
# Create y_df with the Attrition and Department columns
y_df = attrition_encoded_df2.filter(regex='^Depart|Attrition')
y_department = attrition_encoded_df2.filter(regex='^Depart')
y_attrition = attrition_encoded_df2.filter(regex='^Attrition')
print (f'y_df before the split:\n {y_df}')
print (f'y_department:\n{y_department}')
print (f'y_attrition:\n{y_attrition}')

y_df before the split:
       Attrition  Department_Human Resources  \
0             1                         0.0   
1             0                         0.0   
2             1                         0.0   
3             0                         0.0   
4             0                         0.0   
...         ...                         ...   
1465          0                         0.0   
1466          0                         0.0   
1467          0                         0.0   
1468          0                         0.0   
1469          0                         0.0   

      Department_Research & Development  Department_Sales  
0                                   0.0               1.0  
1                                   1.0               0.0  
2                                   1.0               0.0  
3                                   1.0               0.0  
4                                   1.0               0.0  
...                                 ...            

In [229]:
# Create a list of at least 10 column names to use as X data
all_columns = attrition_encoded_df2.columns.tolist()
y_columns = y_df.columns.tolist()
feature_columns = [col for col in all_columns if col not in y_columns]

# Create X_df using your selected columns

X_df = attrition_encoded_df2[feature_columns]

print (feature_columns)
print (X_df.head())

['Age', 'DistanceFromHome', 'Education', 'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely', 'EducationField_Human Resources', 'EducationField_Life Sciences', 'EducationField_Marketing', 'EducationField_Medical', 'EducationField_Other', 'EducationField_Technical Degree', 'JobRole_Healthcare Representative', 'JobRole_Human Resources', 'JobRole_Laboratory Technician', 'JobRole_Manager', 'JobRole_Manufacturing Director', 'JobRole_Research Director', 'JobRole_Research Scientist', 'JobRole_Sales Executive', 'JobRole_Sales Representative', 'MaritalStatus_Divorced', 'MaritalStatus_Marr

In [230]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test  =  train_test_split(X_df,y_df,random_state=42,test_size=.20)


In [231]:
# Step 1: Create a StandardScaler
scaler = StandardScaler()

# Step 2: Fit the StandardScaler to the training data
scaler.fit(X_train)

# Step 3: Transform the features training dataset
X_train_scaled = scaler.transform(X_train)

# Step 4: Transform the features testing dataset
X_test_scaled = scaler.transform(X_test)


In [232]:
# Create y_df with the Attrition and Department columns
y_department_train = y_train.filter(regex='^Depart')
y_attrition_train = y_train.filter(regex='^Attrition')
y_department_test = y_test.filter(regex='^Depart')
y_attrition_test = y_test.filter(regex='^Attrition')


## Create, Compile, and Train the Model

In [233]:
# Find the number of columns in the X training data
input_nodes  = X_train_scaled.shape[1]

num_nodes_first_hidden_layer = ((input_nodes+1) //2  )

# Define the number of hidden nodes for the second hidden layer

num_nodes_second_hidden_layer = ((num_nodes_first_hidden_layer+1) //2  )

# Create the input layer

input_layer = Input(shape=(42,), name='Input')

# Create at least two shared layers
x = Dense(64, activation='relu',name='shared1')(input_layer)
x = Dense(128, activation='relu',name='shared2')(x)



In [234]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
department_hidden = Dense(32, activation='relu',name='department_hidden')(x)
  
# Create the output layer

department_output = Dense(3, activation='softmax', name='department_output')(department_hidden)

In [235]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer
attrition_hidden = Dense(32, activation='relu',name='attrition_hidden')(x)

# Create the output layer
attrition_output = Dense(1, activation='sigmoid', name='attrition_output')(attrition_hidden)


In [236]:
# Create the model
model = Model(inputs=input_layer, outputs=[department_output, attrition_output])

# Compile the model
model.compile(optimizer='adam',
              loss={'department_output': 'categorical_crossentropy', 'attrition_output': 'binary_crossentropy'},
              metrics={'department_output': 'accuracy', 'attrition_output': 'accuracy'})

# Summarize the model
model.summary()

Model: "model_14"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Input (InputLayer)             [(None, 42)]         0           []                               
                                                                                                  
 shared1 (Dense)                (None, 64)           2752        ['Input[0][0]']                  
                                                                                                  
 shared2 (Dense)                (None, 128)          8320        ['shared1[0][0]']                
                                                                                                  
 department_hidden (Dense)      (None, 32)           4128        ['shared2[0][0]']                
                                                                                           

In [237]:
# Train the model

# Fit the model
history = model.fit(
    X_train_scaled,
    {'department_output': y_department_train, 'attrition_output': y_attrition_train},
    epochs=50,
    batch_size=25,
    validation_split=0.30
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [238]:
# Evaluate the model with the testing data
test_results = model.evaluate(X_test_scaled, {'department_output': y_department_test, 'attrition_output': y_attrition_test})
test_results



[1.308653473854065,
 0.14732182025909424,
 1.1613315343856812,
 0.9591836929321289,
 0.8503401279449463]

In [239]:
# Print the accuracy for both department and attrition from the training history
print("Training Accuracy for Department Output: {:.2f}%".format(100 * history.history['department_output_accuracy'][-1]))
print("Training Accuracy for Attrition Output: {:.2f}%".format(100 * history.history['attrition_output_accuracy'][-1]))

print("Validation Accuracy for Department Output: {:.2f}%".format(100 * history.history['val_department_output_accuracy'][-1]))
print("Validation Accuracy for Attrition Output: {:.2f}%".format(100 * history.history['val_attrition_output_accuracy'][-1]))


Training Accuracy for Department Output: 100.00%
Training Accuracy for Attrition Output: 100.00%
Validation Accuracy for Department Output: 95.47%
Validation Accuracy for Attrition Output: 80.45%


# Summary

In the provided space below, briefly answer the following questions.

## 1. Is accuracy the best metric to use on this data? Why or why not?

**Answer:**
Accuracy is a common metric for classification problems, but it might not always be the best choice, especially in the context of imbalanced datasets. For the attrition data, which likely involves predicting employee turnover (a binary classification problem), accuracy might not be the best metric if one class (e.g., "No Attrition") significantly outweighs the other (e.g., "Attrition"). In such cases, the model could achieve high accuracy by simply predicting the majority class, but it would fail to correctly identify the minority class, which is often more important.

**Better Metrics:**
- **Precision and Recall:** These metrics provide a better understanding of the model's performance on the minority class.
  - **Precision:** The ratio of true positive predictions to the total predicted positives.
  - **Recall (Sensitivity):** The ratio of true positive predictions to the actual positives.
- **F1 Score:** The harmonic mean of precision and recall, providing a single metric that balances both concerns.
- **AUC-ROC (Area Under the Receiver Operating Characteristic Curve):** Measures the ability of the model to distinguish between classes, providing a good indication of performance across all classification thresholds.

## 2. What activation functions did you choose for your output layers, and why?

**Answer:**
For this model, I chose the following activation functions for the output layers:

- **Department Output (Categorical Output):** `Softmax`
  - **Reason:** The department output is a multi-class classification problem where each sample belongs to one of several departments. The softmax activation function is ideal for such problems as it converts the logits into probabilities that sum to 1, allowing for a clear determination of the predicted class.

- **Attrition Output (Binary Output):** `Sigmoid`
  - **Reason:** The attrition output is a binary classification problem where each sample either belongs to the "Attrition" class or the "No Attrition" class. The sigmoid activation function is suitable for binary classification because it maps the input to a value between 0 and 1, which can be interpreted as a probability for the positive class.

## 3. Can you name a few ways that this model might be improved?

**Answer:**
There are several ways to potentially improve this model:

1. **Feature Engineering:**
   - **Creating New Features:** Derive new features from existing ones to capture more relevant information.
   - **Handling Categorical Variables:** Use techniques like one-hot encoding, target encoding, or embeddings to better represent categorical variables.

2. **Hyperparameter Tuning:**
   - Use techniques such as Grid Search or Random Search to find the optimal hyperparameters for the model (e.g., learning rate, batch size, number of layers, number of neurons in each layer).

3. **Model Architecture:**
   - Experiment with different neural network architectures, such as deeper networks or adding regularization layers (Drop
