In [1]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
from keras.utils import to_categorical
from keras.optimizers import SGD
from sklearn.model_selection import StratifiedKFold, train_test_split
from keras.layers.advanced_activations import LeakyReLU
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score
from keras import regularizers

Using TensorFlow backend.


In [2]:
# Load the hold-out inputs and their targets from the two CSV files.
features, labels = (
    pd.read_csv(csv_path)
    for csv_path in ('hr_test_features.csv', 'hr_test_labels.csv')
)

In [44]:
# Rebuild the human-readable hold-out rows: read the raw HR table and keep
# everything from the 80% mark onward.
# NOTE(review): presumably these rows line up one-to-one with
# hr_test_features.csv / hr_test_labels.csv -- confirm against whatever
# produced those files.
df = pd.read_csv('hr_attrition.csv')
split_num = int(len(df)*.8)
# Take an explicit copy of the positional tail slice so that the later
# `df['predictions'] = ...` assignment writes to an independent frame instead
# of a slice of the full table (which raises SettingWithCopyWarning).
df = df.iloc[split_num:].copy()
len(df)

294

In [3]:
# Sanity check: there must be exactly one label row per feature row.
n_feature_rows, n_label_rows = len(features), len(labels)
assert n_feature_rows == n_label_rows

In [4]:
# Number of rows in the feature table.
features.shape[0]

294

In [6]:
# Split the loaded rows 80/20 with a fixed seed so the partition is repeatable.
# NOTE(review): `features`/`labels` are read from the hr_test_* files and the
# later cells score the model on all of `features` -- confirm that fitting on
# a subset of these rows is intended.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Make sure the training targets are held in a DataFrame.
# NOTE(review): `labels` comes from pd.read_csv, so y_train is presumably a
# DataFrame already and this wrap changes nothing -- confirm.
y_train = pd.DataFrame(data=y_train)

In [6]:
num_classes = 1
input_shape = features.shape[1]


def _regularized_dense(units, activation):
    """Return a Dense layer with truncated-normal init and an L2(0.001) kernel penalty."""
    return Dense(units, activation = activation, kernel_initializer = 'truncated_normal',
                 kernel_regularizer = regularizers.l2(.001))


# Fully connected binary classifier: 512 -> 1024 -> 512 -> 32 -> 1 (sigmoid).
# Only the first hidden layer is created without an explicit initializer or
# L2 term; there is no dropout between the 1024-unit layer and the next one.
model = Sequential([
    Dense(512, input_shape = (input_shape,), activation = 'tanh'),
    Dropout(0.4),
    _regularized_dense(1024, 'tanh'),
    _regularized_dense(512, 'relu'),
    Dropout(0.5),
    _regularized_dense(32, 'relu'),
    Dropout(0.3),
    Dense(1, activation = 'sigmoid'),
])

In [7]:
# All three callbacks watch the validation loss.
watched_metric = 'val_loss'

# Overwrite the checkpoint file only when the monitored value improves.
checkpoint = ModelCheckpoint(
    filepath = 'hr_weights.hdf5',
    monitor = watched_metric,
    save_best_only = True,
    verbose = False,
)
# Halve the learning rate after 20 epochs without improvement (floor 1e-5).
reduce_lr = ReduceLROnPlateau(
    monitor = watched_metric,
    factor = 0.5,
    patience = 20,
    min_lr = 0.00001,
    verbose = 0,
)
# Stop training after 60 epochs without improvement.
early_stop = EarlyStopping(monitor = watched_metric, min_delta = 0, patience = 60)

In [8]:
# Binary cross-entropy for the single sigmoid output; also report accuracy.
model.compile(
    optimizer = 'adam',
    loss = 'binary_crossentropy',
    metrics = ['accuracy'],
)

In [9]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               25088     
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              525312    
_________________________________________________________________
dense_3 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                16416     
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
__________

In [84]:
# Train for up to 75 epochs in mini-batches of 16, holding out 25% of the
# supplied rows for validation; the callbacks checkpoint the model, decay the
# learning rate and stop early based on the validation loss.
fit_options = dict(batch_size = 16, epochs = 75, validation_split = 0.25, verbose = 1)
history = model.fit(
    x_train,
    y_train,
    callbacks = [checkpoint, reduce_lr, early_stop],
    **fit_options
)

Train on 882 samples, validate on 294 samples
Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


In [15]:
# Restore the checkpointed weights *before* predicting, so that `preds` (used
# for the ROC score and confusion matrix) comes from the same weights that
# model.evaluate() reports on. Previously a top-to-bottom run predicted with
# whatever weights were in memory and only loaded the checkpoint afterwards.
model.load_weights('hr_weights.hdf5')
preds = model.predict(features)

In [16]:
# Returns [loss, accuracy], matching the loss/metrics passed to compile().
model.evaluate(x = features, y = labels)



[0.31770410038986985, 0.891156462585034]

In [17]:
# Area under the ROC curve of the raw model scores against the true labels.
roc = roc_auc_score(y_true = labels, y_score = preds)
roc

0.88474010933012137

In [52]:
# Score cut-off above which a row is flagged as the positive class.
DECISION_THRESHOLD = .25

# The network ends in Dense(1), so `preds` holds one score per row. Flatten it
# and threshold in a single vectorized step rather than looping in Python and
# relying on the truthiness of 1-element arrays. The result stays a plain list
# of 0/1 ints, as before.
evals = (np.ravel(preds) > DECISION_THRESHOLD).astype(int).tolist()

# Confusion matrix of the thresholded predictions against the true labels.
cf = confusion_matrix(labels, evals)
cf

array([[226,  25],
       [ 14,  29]])

In [53]:
# Number of rows in the label table.
labels.shape[0]

294

In [54]:
# Attach the thresholded predictions as a new column. `evals` is a plain
# sequence, so values are matched to rows by position, not by index label.
# `assign` builds a fresh frame instead of writing into `df` in place, which
# avoids SettingWithCopyWarning when `df` is a slice of a larger table.
df = df.assign(predictions = evals)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,predictions
1176,49,No,Travel_Rarely,301,Research & Development,22,4,Other,1,1655,...,80,2,27,2,3,4,2,1,2,0
1177,50,No,Travel_Rarely,813,Research & Development,17,5,Life Sciences,1,1656,...,80,3,19,3,3,14,11,1,11,0
1178,20,No,Travel_Rarely,1141,Sales,2,3,Medical,1,1657,...,80,0,2,3,3,2,2,2,2,0
1179,34,No,Travel_Rarely,1130,Research & Development,3,3,Life Sciences,1,1658,...,80,1,11,2,3,11,8,7,9,0
1180,36,No,Travel_Rarely,311,Research & Development,7,3,Life Sciences,1,1659,...,80,0,15,4,3,4,3,1,3,1


In [55]:
# Persist the annotated rows to disk; the index column is not written.
df.to_csv(path_or_buf = 'hr_with_preds.csv', index = False)

In [57]:
# How many rows were flagged as positive by the thresholded predictions.
df.query('predictions == 1').shape[0]

54