## Part 1: Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras import layers, Input
from tensorflow.keras.layers import LSTM


#  Import and read the attrition data
attrition_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m19/lms/datasets/attrition.csv')
attrition_df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,2,Life Sciences,2,94,3,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,1,Life Sciences,3,61,2,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,2,Other,4,92,2,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,4,Life Sciences,4,56,3,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,1,Medical,1,40,3,...,3,4,1,6,3,3,2,2,2,2


In [2]:
# Determine the number of unique values in each column
attrition_df.nunique()

Age                         43
Attrition                    2
BusinessTravel               3
Department                   3
DistanceFromHome            29
Education                    5
EducationField               6
EnvironmentSatisfaction      4
HourlyRate                  71
JobInvolvement               4
JobLevel                     5
JobRole                      9
JobSatisfaction              4
MaritalStatus                3
NumCompaniesWorked          10
OverTime                     2
PercentSalaryHike           15
PerformanceRating            2
RelationshipSatisfaction     4
StockOptionLevel             4
TotalWorkingYears           40
TrainingTimesLastYear        7
WorkLifeBalance              4
YearsAtCompany              37
YearsInCurrentRole          19
YearsSinceLastPromotion     16
YearsWithCurrManager        18
dtype: int64

In [3]:
# Create y_df with the Attrition and Department columns
y_df = attrition_df [['Attrition','Department']]
y_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Attrition   1470 non-null   object
 1   Department  1470 non-null   object
dtypes: object(2)
memory usage: 23.1+ KB


In [4]:
# Create a list of at least 10 column names to use as X data
x_columns = ['Education', 'Age', 'DistanceFromHome', 'JobSatisfaction', 'OverTime', 
             'StockOptionLevel', 'WorkLifeBalance', 'YearsAtCompany', 
             'YearsSinceLastPromotion', 'NumCompaniesWorked']

# Create X_df using your selected columns
X_df = attrition_df[x_columns]

# Show the data types for X_df
X_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Education                1470 non-null   int64 
 1   Age                      1470 non-null   int64 
 2   DistanceFromHome         1470 non-null   int64 
 3   JobSatisfaction          1470 non-null   int64 
 4   OverTime                 1470 non-null   object
 5   StockOptionLevel         1470 non-null   int64 
 6   WorkLifeBalance          1470 non-null   int64 
 7   YearsAtCompany           1470 non-null   int64 
 8   YearsSinceLastPromotion  1470 non-null   int64 
 9   NumCompaniesWorked       1470 non-null   int64 
dtypes: int64(9), object(1)
memory usage: 115.0+ KB


In [5]:
#Defining y and X
y = y_df
print (y)

X = X_df
X

     Attrition              Department
0          Yes                   Sales
1           No  Research & Development
2          Yes  Research & Development
3           No  Research & Development
4           No  Research & Development
...        ...                     ...
1465        No  Research & Development
1466        No  Research & Development
1467        No  Research & Development
1468        No                   Sales
1469        No  Research & Development

[1470 rows x 2 columns]


Unnamed: 0,Education,Age,DistanceFromHome,JobSatisfaction,OverTime,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked
0,2,41,1,4,Yes,0,1,6,0,8
1,1,49,8,2,No,1,3,10,1,1
2,2,37,2,3,Yes,0,3,0,0,6
3,4,33,3,3,Yes,0,3,8,3,1
4,1,27,2,2,No,1,3,2,2,9
...,...,...,...,...,...,...,...,...,...,...
1465,2,36,23,4,No,1,3,5,0,4
1466,1,39,6,1,No,1,3,7,1,4
1467,3,27,4,2,Yes,1,3,6,0,1
1468,3,49,2,2,No,0,2,9,0,2


In [6]:
y1 = y_df[['Attrition']]
print (y1)
y2 = y_df[['Department']]
print (y2)

     Attrition
0          Yes
1           No
2          Yes
3           No
4           No
...        ...
1465        No
1466        No
1467        No
1468        No
1469        No

[1470 rows x 1 columns]
                  Department
0                      Sales
1     Research & Development
2     Research & Development
3     Research & Development
4     Research & Development
...                      ...
1465  Research & Development
1466  Research & Development
1467  Research & Development
1468                   Sales
1469  Research & Development

[1470 rows x 1 columns]


In [7]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split


X_train,X_test,y_Attrition_train,y_Attrition_test, y_Department_train,y_Department_test = train_test_split(X,y1,y2,random_state=42)

In [8]:
y_Department_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1102 entries, 1343 to 1126
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Department  1102 non-null   object
dtypes: object(1)
memory usage: 17.2+ KB


In [9]:
y_Attrition_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1102 entries, 1343 to 1126
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Attrition  1102 non-null   object
dtypes: object(1)
memory usage: 17.2+ KB


In [10]:
# Convert your X data to numeric data types however you see fit
# Add new code cells as necessary
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
OverTime_encoder = LabelEncoder()
X_train['OverTime_encoded'] = OverTime_encoder.fit_transform(X_train['OverTime'])
X_train['OverTime_encoded'].value_counts()


OverTime_encoded
0    780
1    322
Name: count, dtype: int64

In [11]:
X_train =  X_train.drop(columns='OverTime')
X_train.head()

Unnamed: 0,Education,Age,DistanceFromHome,JobSatisfaction,StockOptionLevel,WorkLifeBalance,YearsAtCompany,YearsSinceLastPromotion,NumCompaniesWorked,OverTime_encoded
1343,3,29,7,1,0,3,3,1,3,0
1121,4,36,1,3,0,3,1,0,6,0
1048,3,34,3,1,0,3,13,3,3,0
1393,3,27,9,4,0,3,7,0,1,0
527,3,32,10,4,0,2,10,0,1,0


In [12]:
OverTime_encoder = LabelEncoder()
X_test['OverTime_encoded'] = OverTime_encoder.fit_transform(X_test['OverTime'])
X_test =  X_test.drop(columns='OverTime')
X_test['OverTime_encoded'].value_counts()

OverTime_encoded
0    274
1     94
Name: count, dtype: int64

In [13]:
# Create a StandardScaler
X_scaler = StandardScaler()

# Fit the StandardScaler to the training data
X_scaler.fit(X_train)

# Scale the training and testing data
X_test_scaled = X_scaler.transform(X_test)

In [None]:
y_d

In [14]:
Department_encoder = OneHotEncoder(sparse_output=False)
Department_encoded = Department_encoder.fit_transform(y_Department_train[['Department']])  
Department_columns = Department_encoder.get_feature_names_out(['Department'])
y_train_department_encoded = pd.DataFrame(Department_encoded, columns=Department_columns)
y_train_department_encoded
#Department_encoded = Department_encoder.fit_transform(y_df[['Department']])  # Corrected this line
#Department_columns = Department_encoder.get_feature_names_out(['Department'])



#y_quality_df = pd.DataFrame(y_quality, columns=ohe_quality.get_feature_names_out(['quality']))

Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales
0,0.0,1.0,0.0
1,0.0,0.0,1.0
2,0.0,0.0,1.0
3,0.0,0.0,1.0
4,0.0,0.0,1.0
...,...,...,...
1097,0.0,1.0,0.0
1098,0.0,1.0,0.0
1099,0.0,1.0,0.0
1100,0.0,1.0,0.0


In [15]:
Department_encoder = OneHotEncoder(sparse_output=False)
Department_encoded = Department_encoder.fit_transform(y_Department_test[['Department']])  
Department_columns = Department_encoder.get_feature_names_out(['Department'])
y_test_department_encoded = pd.DataFrame(Department_encoded, columns=Department_columns)
y_test_department_encoded

Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
363,0.0,1.0,0.0
364,0.0,1.0,0.0
365,0.0,1.0,0.0
366,1.0,0.0,0.0


In [17]:
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder for the Department column
Department_encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the training data
Department_encoded = Department_encoder.fit_transform(y_df[['Department']])  # Corrected this line
Department_columns = Department_encoder.get_feature_names_out(['Department'])

# Create two new variables by applying the encoder
# to the training and testing data
X_train_department = pd.DataFrame(Department_encoded, columns=Department_columns)
#X_test = pd.DataFrame(Department_encoded, columns=Department_columns)
y_test_department_encoded = pd.DataFrame(Department_encoded, columns=Department_columns)
y_test_department_encoded

Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
1465,0.0,1.0,0.0
1466,0.0,1.0,0.0
1467,0.0,1.0,0.0
1468,0.0,0.0,1.0


In [18]:
# Create a OneHotEncoder for the Attrition column
Department_encoder = OneHotEncoder(sparse_output=False)



# Fit the encoder to the training data
Attrition_encoded = Department_encoder.fit_transform(y_Attrition_train[['Attrition']])  
Attrition_columns = Department_encoder.get_feature_names_out(['Attrition'])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_Attrition_encoded = pd.DataFrame(Department_encoded, columns=Department_columns)
y_train_Attrition_encoded

Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0
...,...,...,...
1465,0.0,1.0,0.0
1466,0.0,1.0,0.0
1467,0.0,1.0,0.0
1468,0.0,0.0,1.0


In [19]:
# Create a OneHotEncoder for the Attrition column
Attrition_encoder = OneHotEncoder(sparse_output=False)



# Fit the encoder to the training data
Attrition_encoded = Attrition_encoder.fit_transform(y_Attrition_train[['Attrition']])  
Attrition_columns = Attrition_encoder.get_feature_names_out(['Attrition'])

# Create two new variables by applying the encoder
# to the training and testing data
y_train_Attrition_encoded = pd.DataFrame(Attrition_encoded, columns=Attrition_columns)
y_train_Attrition_encoded

Unnamed: 0,Attrition_No,Attrition_Yes
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0
...,...,...
1097,1.0,0.0
1098,1.0,0.0
1099,0.0,1.0
1100,1.0,0.0


In [20]:
# Create a OneHotEncoder for the Attrition column
Attrition_encoder = OneHotEncoder(sparse_output=False)



# Fit the encoder to the training data
Attrition_encoded = Attrition_encoder.fit_transform(y_Attrition_test[['Attrition']])  
Attrition_columns = Attrition_encoder.get_feature_names_out(['Attrition'])

# Create two new variables by applying the encoder
# to the training and testing data
y_test_Attrition_encoded = pd.DataFrame(Attrition_encoded,columns=Attrition_columns)
y_test_Attrition_encoded

Unnamed: 0,Attrition_No,Attrition_Yes
0,1.0,0.0
1,1.0,0.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0
...,...,...
363,1.0,0.0
364,1.0,0.0
365,1.0,0.0
366,1.0,0.0


## Part 2: Create, Compile, and Train the Model

In [21]:
# Find the number of columns in the X training data.
#col_len = len(x_columns)
col_len =X.shape[1]
timestep = 10


# Create the input layer
input_layer = layers.Input(shape=(timestep, col_len,),name='input_layer')

# Create at least two shared layers
shared_layer_1 = LSTM(32, return_sequences=True, name='shared_layer_1')(input_layer)
shared_layer_2 = LSTM(16, name='shared_layer_2')(shared_layer_1)  # final LSTM without return_sequences

In [22]:
# Create a branch for Department
# with a hidden layer and an output layer

# Create the hidden layer
hidden_layer_1 = layers.Dense(64, activation='relu', name='hidden_layer_1')(shared_layer_2)

# Create the output layer
dept_output = layers.Dense(3, activation='softmax', name='dept_output')(hidden_layer_1)

In [23]:
# Create a branch for Attrition
# with a hidden layer and an output layer

# Create the hidden layer

hidden_layer_2 = layers.Dense(64, activation='relu', name='hidden_layer_2')(shared_layer_2)
# Create the output layer
Attrition_output = layers.Dense(2, activation='sigmoid', name='Attrition_output')(hidden_layer_2)

In [24]:
# Create the model
nn_model = Model(inputs=input_layer, outputs={'Department': dept_output, 'Attrition_output': Attrition_output})

# Compile the model
nn_model.compile(optimizer='adam',
                 loss={'Department': 'categorical_crossentropy', 'Attrition_output': 'binary_crossentropy'},
                 metrics={'Department': 'mae', 'Attrition_output': 'accuracy'})

# Summarize the model
nn_model.summary()

In [25]:
# Train the model
nn_model.fit(
    X_train,
    {'Department': y_train_department_encoded, 'Attrition_output': y_train_Attrition_encoded},
    epochs=10)


Epoch 1/10


ValueError: Exception encountered when calling Functional.call().

[1mInvalid input shape for input Tensor("functional_1/Cast:0", shape=(None, 10), dtype=float32). Expected shape (None, 10, 10), but input has incompatible shape (None, 10)[0m

Arguments received by Functional.call():
  • inputs=tf.Tensor(shape=(None, 10), dtype=int64)
  • training=True
  • mask=None

In [None]:
print(X_train.dtypes)
#print(y_train_department_encoded.dtype)
#print(y_train_Attrition_encoded.dtype)

In [None]:
# Evaluate the model with the testing data


In [None]:
# Print the accuracy for both department and attrition


# Summary

In the provided space below, briefly answer the following questions.

1. Is accuracy the best metric to use on this data? Why or why not?

2. What activation functions did you choose for your output layers, and why?

3. Can you name a few ways that this model might be improved?

YOUR ANSWERS HERE

1. 
2. 
3. 