In [18]:
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
from keras.utils import to_categorical
from keras.optimizers import SGD
from sklearn.model_selection import StratifiedKFold, train_test_split
from keras.layers.advanced_activations import LeakyReLU
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_auc_score
from keras import regularizers

In [25]:
# Load the HR attrition table from a CSV in the working directory into `df`.
df = pd.read_csv('hr_attrition.csv')
# Alternative input file, currently disabled:
#ibm_df = pd.read_csv('IBM_HR.csv')

In [26]:
# Summary statistics for the TrainingTimesLastYear column.
training = df['TrainingTimesLastYear']
training.describe()

count    1470.000000
mean        2.799320
std         1.289271
min         0.000000
25%         2.000000
50%         3.000000
75%         3.000000
max         6.000000
Name: TrainingTimesLastYear, dtype: float64

In [27]:
# Columns to exclude from the feature frame. This list is reassigned in a
# later cell before any drop is performed.
to_drop = [
    'Attrition',
    'EmployeeCount',
    'Over18',
    'StandardHours',
    'EmployeeNumber',
]
# Name of the target column. `labels` is rebound to the encoded target later.
labels = ['Attrition']

# Pay-related columns.
wages = ['DailyRate', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate']

# Columns expanded into indicator (dummy) columns -- 7 entries.
to_dummy = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'OverTime',
]

# Columns treated as categorical -- 9 entries. They are also appended to
# `to_continous` below, so they get scaled together with the numeric columns.
categorical = [
    'Education',
    'EnvironmentSatisfaction',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StockOptionLevel',
    'WorkLifeBalance',
]

# Columns handed to the scaler: 11 numeric columns followed by the 9
# `categorical` ones (20 entries in total).
to_continous = [
    'Age',
    'DistanceFromHome',
    'HourlyRate',
    'NumCompaniesWorked',
    'PercentSalaryHike',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
] + categorical

In [28]:
# Show every column name of the freshly loaded frame.
df.columns.values

array(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate',
       'NumCompaniesWorked', 'Over18', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'], dtype=object)

In [30]:
###Get labels
def get_labels(df):
    """Encode the 'Attrition' column as a list of 0/1 integers.

    Args:
        df: DataFrame that contains an 'Attrition' column.

    Returns:
        list[int]: 1 for every value equal to 'Yes' and 0 for anything
        else, in the same order as the column.
    """
    return [int(answer == 'Yes') for answer in df['Attrition']]

# Encode the target while 'Attrition' is still present in df (it is dropped in
# a later cell). This rebinds `labels`, which previously held a list of column names.
labels = get_labels(df)

In [32]:
###Drop labels and others

# Final list of columns removed from the feature frame; replaces the
# shorter `to_drop` list defined in an earlier cell.
to_drop = [
    'DailyRate',
    'MonthlyIncome',
    'MonthlyRate',
    'Attrition',
    'EmployeeCount',
    'Over18',
    'StandardHours',
    'EmployeeNumber',
]

def drop_from_df(df, to_drop):
    """Return a new frame without the columns named in ``to_drop``.

    Args:
        df: Source DataFrame; it is not modified.
        to_drop: Column label or list of column labels to remove.

    Returns:
        The DataFrame produced by ``df.drop`` with those columns removed.

    Raises:
        KeyError: If a requested label is not a column of ``df``
            (default ``DataFrame.drop`` behaviour).
    """
    return df.drop(columns=to_drop)

# Rebinds `df` to the reduced frame. NOTE(review): re-running this cell on the
# already-reduced frame should raise KeyError because the columns are gone.
df = drop_from_df(df, to_drop)

In [33]:
# Columns that remain after the drop.
df.columns.values

array(['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender',
       'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'NumCompaniesWorked',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'], dtype=object)

In [34]:
###Get Dummies and drop dummy features
def get_dummies(df, columns=None):
    """Replace the selected columns with their indicator (dummy) encoding.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Iterable of column names to encode. Defaults to the
            notebook-level ``to_dummy`` list, so existing ``get_dummies(df)``
            calls behave exactly as before.

    Returns:
        A new DataFrame holding the remaining columns of ``df`` followed by
        the indicator columns produced by ``pd.get_dummies``.

    Raises:
        KeyError: If a requested column is missing from ``df``.
    """
    if columns is None:
        # Fall back to the global list only when the caller gives no columns.
        columns = to_dummy
    # A list is required: a tuple would be read by ``df[...]`` as a single key.
    columns = list(columns)

    dummies = pd.get_dummies(df[columns])
    remaining = df.drop(columns, axis=1)

    return pd.concat([remaining, dummies], axis=1)

# Encode the columns listed in `to_dummy`; `df` itself is left as it was.
df_with_dummies = get_dummies(df)

In [35]:
# Column names after dummy encoding. Within the visible notebook this is only
# referenced by a commented-out get_scaled call.
all_columns = df_with_dummies.columns.values

In [36]:
###Scale continuous variables
def get_scaled(df, continous, feature_range=(0.05, 0.95)):
    """Min-max scale the selected columns into ``feature_range``.

    The input frame is no longer modified: scaling is applied to a copy, so
    the caller's unscaled frame stays available and re-running the calling
    cell always starts from the same data.

    Args:
        df: DataFrame containing the columns to scale.
        continous: List of column names to rescale.
        feature_range: ``(low, high)`` interval passed to ``MinMaxScaler``.
            Defaults to ``(0.05, 0.95)``, the range that used to be
            hard-coded.

    Returns:
        A new DataFrame equal to ``df`` except that the columns listed in
        ``continous`` hold the scaled values.
    """
    scaled_df = df.copy()
    scaler = MinMaxScaler(feature_range=feature_range)
    # The scaler is fitted on, and applied to, the same rows.
    scaled_df[continous] = scaler.fit_transform(scaled_df[continous])

    return scaled_df

# Scale only the columns named in `to_continous`; the indicator columns are
# not in that list and keep their values.
dummy_scaled_df = get_scaled(df_with_dummies, to_continous)
# Disabled variant that would scale every column:
#all_scaled = get_scaled(df_with_dummies, all_columns)

In [42]:
# Preview the first two rows of the scaled frame.
dummy_scaled_df.head(2)

Unnamed: 0,Age,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,NumCompaniesWorked,PercentSalaryHike,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,0.542857,0.05,0.275,0.35,0.872857,0.65,0.275,0.95,0.85,0.05,...,0,0,0,1,0,0,0,1,0,1
1,0.714286,0.275,0.05,0.65,0.448571,0.35,0.275,0.35,0.15,0.821429,...,0,0,1,0,0,0,1,0,1,0


In [41]:
# Wrap the labels in a DataFrame so they can be written with to_csv.
# Assumes `labels` is still the 0/1 list returned by get_labels -- TODO confirm
# cell run order (this cell's execution count is lower than the preview cell's).
labels = pd.DataFrame(labels)
# Work on a copy so the exported frame is independent of dummy_scaled_df.
features = dummy_scaled_df.copy()
# Write both tables to the working directory without the index column.
labels.to_csv('hr_labels.csv', index = False)
features.to_csv('hr_features.csv', index = False)