## Part 1: Preprocessing

In [363]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers

#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [364]:
# Determine the number of unique values in each column.
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [365]:
# Create y_df with the Attrition and Department columns

y_df = attrition_df[['Attrition', 'Department']]

# Display the first few rows of y_df to ensure it is created correctly
y_df.head()



Unnamed: 0,Attrition,Department
0,Yes,Sales
1,No,Research & Development
2,Yes,Research & Development
3,No,Research & Development
4,No,Research & Development


In [366]:
#
X_columns = ['Education', 'Age', 'DistanceFromHome', 'JobSatisfaction', 
                     'OverTime', 'StockOptionLevel', 'WorkLifeBalance', 
                     'YearsAtCompany', 'YearsSinceLastPromotion', 'NumCompaniesWorked']
# Create X_df using the selected columns
X_df = attrition_df[X_columns]

# Show the data types for X_df
print(X_df.dtypes)



Education                   int64
Age                         int64
DistanceFromHome            int64
JobSatisfaction             int64
OverTime                   object
StockOptionLevel            int64
WorkLifeBalance             int64
YearsAtCompany              int64
YearsSinceLastPromotion     int64
NumCompaniesWorked          int64
dtype: object


In [367]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split



In [368]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Convert 'OverTime' column to numeric using Label Encoding
X_df['OverTime'] = label_encoder.fit_transform(X_df['OverTime'])

# Ensure all columns are numeric
X_df = X_df.apply(pd.to_numeric, errors='coerce')

# Check the data types after conversion
print(X_df.dtypes)



Education                  int64
Age                        int64
DistanceFromHome           int64
JobSatisfaction            int64
OverTime                   int64
StockOptionLevel           int64
WorkLifeBalance            int64
YearsAtCompany             int64
YearsSinceLastPromotion    int64
NumCompaniesWorked         int64
dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df['OverTime'] = label_encoder.fit_transform(X_df['OverTime'])


In [369]:


# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=42)

# Create a StandardScaler
scaler = StandardScaler()

# Fit the StandardScaler to the training data
scaler.fit(X_train)

# Scale the training and testing data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)




In [370]:
# Define y as the target variable (Attrition)
y = attrition_df['Attrition']

# Ensure 'Department' is part of the feature set
X_columns = ['Education', 'Age', 'DistanceFromHome', 'JobSatisfaction', 
             'OverTime', 'StockOptionLevel', 'WorkLifeBalance', 
             'YearsAtCompany', 'YearsSinceLastPromotion', 
             'NumCompaniesWorked', 'Department']  # Include 'Department' here

# Create X_df using the selected columns
X_df = attrition_df[X_columns]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.3, random_state=42)

# Create the OneHotEncoder for the 'Department' column
encoder = OneHotEncoder(sparse=False)

# Fit the encoder to the training data
encoder.fit(X_train[['Department']])

# Create two new variables by applying the encoder to the training and testing data
X_train_encoded = encoder.transform(X_train[['Department']])
X_test_encoded = encoder.transform(X_test[['Department']])

# Convert the encoded results back into DataFrame for readability
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=encoder.get_feature_names_out(['Department']))
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=encoder.get_feature_names_out(['Department']))

# Add the encoded department columns back to the original data
X_train_final = X_train.drop('Department', axis=1).join(X_train_encoded_df)
X_test_final = X_test.drop('Department', axis=1).join(X_test_encoded_df)




In [371]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder for the Attrition column
encoder = OneHotEncoder(sparse=False)

# Fit the encoder to the training data
encoder.fit(y_train.values.reshape(-1, 1))

# Create two new variables by applying the encoder to the training and testing data
y_train_encoded = encoder.transform(y_train.values.reshape(-1, 1))
y_test_encoded = encoder.transform(y_test.values.reshape(-1, 1))

# Convert the encoded results back into DataFrame for readability
y_train_encoded_df = pd.DataFrame(y_train_encoded, columns=encoder.get_feature_names_out(['Attrition']))
y_test_encoded_df = pd.DataFrame(y_test_encoded, columns=encoder.get_feature_names_out(['Attrition']))




## Create, Compile, and Train the Model

In [373]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Find the number of columns in the X training data
num_columns = X_train.shape[1]

# Create the input layer
model = Sequential()
model.add(Dense(64, input_dim=num_columns, activation='relu'))  # Input layer with 64 units and ReLU activation

# Create at least two shared layers
model.add(Dense(64, activation='relu'))  # First shared layer
model.add(Dense(32, activation='relu'))  # Second shared layer


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [374]:



# Create the output layer
from tensorflow.keras.layers import Input, Dense

# Create the branch for Department
department_input = Input(shape=(1,), name='Department')  # Input layer for the Department feature

# Create the hidden layer for the Department branch
department_hidden = Dense(32, activation='relu')(department_input)  # Hidden layer with ReLU activation

# Create the output layer for the Department branch
department_output = Dense(1, activation='sigmoid')(department_hidden)  # Output layer with sigmoid activation


In [375]:


from tensorflow.keras.layers import Input, Dense

# Create the branch for Attrition
attrition_input = Input(shape=(1,), name='Attrition')  # Input layer for the Attrition feature

# Create the hidden layer for the Attrition branch
attrition_hidden = Dense(32, activation='relu')(attrition_input)  # Hidden layer with ReLU activation

# Create the output layer for the Attrition branch
attrition_output = Dense(1, activation='sigmoid')(attrition_hidden)  # Output layer with sigmoid activation


In [376]:
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Create the model by specifying inputs and outputs
model = Model(inputs=[attrition_input], outputs=[attrition_output])

# Compile the model with Adam optimizer and binary crossentropy loss
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Summarize the model
model.summary()


In [377]:
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)


Shape of X_train: (1029, 11)
Shape of y_train: (1029,)
Shape of X_test: (441, 11)
Shape of y_test: (441,)


In [378]:
# Check the data types of the columns in X_train
print(X_train.dtypes)


Education                   int64
Age                         int64
DistanceFromHome            int64
JobSatisfaction             int64
OverTime                   object
StockOptionLevel            int64
WorkLifeBalance             int64
YearsAtCompany              int64
YearsSinceLastPromotion     int64
NumCompaniesWorked          int64
Department                 object
dtype: object


In [379]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# List of categorical columns that need encoding
categorical_columns = ['OverTime', 'Department']

# Create a ColumnTransformer to apply OneHotEncoding to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ], remainder='passthrough'  # Leave other columns unchanged
)

# Apply the transformation to X_train and X_test
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)


In [380]:
# Ensure all data is numeric (converting to float32)
X_train_encoded = X_train_encoded.astype('float32')
X_test_encoded = X_test_encoded.astype('float32')


In [381]:
from sklearn.preprocessing import LabelEncoder

# Encode the target variable if needed (e.g., 'Yes' -> 1, 'No' -> 0)
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)


In [382]:
# Create and compile the model
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(X_train_encoded.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # For binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])






In [383]:
# Summarize the model
model.summary()

In [384]:
# Train the model
history = model.fit(X_train_encoded, y_train_encoded, epochs=10, batch_size=32, validation_data=(X_test_encoded, y_test_encoded))



Epoch 1/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8446 - loss: 0.4653 - val_accuracy: 0.8254 - val_loss: 0.4618
Epoch 2/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8250 - loss: 0.4547 - val_accuracy: 0.8549 - val_loss: 0.3886
Epoch 3/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8380 - loss: 0.4128 - val_accuracy: 0.8526 - val_loss: 0.3910
Epoch 4/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8286 - loss: 0.4215 - val_accuracy: 0.8526 - val_loss: 0.3874
Epoch 5/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8385 - loss: 0.3938 - val_accuracy: 0.8526 - val_loss: 0.3903
Epoch 6/10
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8532 - loss: 0.3836 - val_accuracy: 0.8571 - val_loss: 0.3753
Epoch 7/10
[1m33/33[0m [32m━━━━━━━━━━

In [385]:
# Evaluate the model with the testing data
loss, accuracy = model.evaluate(X_test_encoded, y_test_encoded)



[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 879us/step - accuracy: 0.8544 - loss: 0.3692


In [386]:
# Print the accuracy
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 86.17%


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. Ans: 

    No, accuracy might not always be the best metric, especially if the classes are imbalanced (e.g., if most employees don’t leave, but some do). Accuracy can be misleading in such cases. It’s better to use metrics like precision, recall, or F1-score because they give a better idea of how well the model handles both classes (attrition and no attrition).

2. Ans:

    Softmax for the department output because it’s used for multi-class classification, where each input belongs to one of many possible classes (departments).
    Sigmoid for the attrition output because it’s used for binary classification, where the output is either 0 (no attrition) or 1 (attrition).

3. Ans:

    Handle imbalanced data: Use techniques like class weights or oversampling to make the model pay more attention to the minority class.
    Add dropout layers: To prevent overfitting and help the model generalize better.
    Tuning hyperparameters: Adjust the model’s settings like the number of neurons, learning rate, etc., to make it perform better.
    Try different models: Use models like LSTM or GRU if the data has a sequence or time component
 