In [35]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


In [2]:
# Load the raw employee dataset; a comma is already pandas' default delimiter.
data = pd.read_csv("../data/general_data.csv")

In [5]:
# Dimensions of `data` as a (rows, columns) tuple.
data.shape

(4410, 29)

In [25]:
# Discard every row that has a missing value in any column, then re-check the size.
data = data.dropna(axis="index", how="any")
data.shape

(4329, 29)

In [28]:
# Attrition is the dependent (target) variable: replace its labels with integer codes.
# The fitted encoder is kept in `label_encoder_y` so the codes can be mapped back
# to the original labels later (inverse_transform).
label_encoder_y = LabelEncoder().fit(data['Attrition'])
data['Attrition'] = label_encoder_y.transform(data['Attrition'])

In [29]:
# Bucket Age into four ordinal groups for easier EDA.
# Intervals are right-closed: (0, 30] -> 0, (30, 40] -> 1, (40, 50] -> 2, (50, 60] -> 3.
# pd.cut yields NaN for any age that falls outside (0, 60].
data['Age_Bin'] = pd.cut(
    data['Age'],
    bins=[0, 30, 40, 50, 60],
    labels=[0, 1, 2, 3],
)

In [44]:
# Drop columns that are not needed downstream
# (presumably identifier / constant-valued fields — confirm against the raw data).
# Rebinding `data` instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after the columns are already gone
# no longer raises a KeyError.
data = data.drop(
    columns=['EmployeeCount', 'EmployeeID', 'StandardHours', 'Over18'],
    errors='ignore',
)

In [45]:
# One-hot encode the categorical features. drop_first=True omits the first level
# of every encoded column, so k categories become k-1 indicator columns.
df = pd.get_dummies(
    data,
    columns=[
        'Gender',
        'JobRole',
        'BusinessTravel',
        'Education',
        'EducationField',
        'JobLevel',
        'MaritalStatus',
        'Department',
        'StockOptionLevel',
    ],
    drop_first=True,
)

In [46]:
# Persist the encoded frame for later use. The row index is written as the
# first, unnamed CSV column (to_csv's default, spelled out here).
df.to_csv(path_or_buf=r'../data/df.csv', index=True)


In [47]:
# Target vector and feature matrix.
y = df['Attrition']
X = df.drop('Attrition', axis=1)

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified on y — consider stratify=y if the
# classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [50]:
# Oversample with SMOTE on the training split only; the test split is not resampled.
# NOTE(review): SMOTE builds synthetic rows by interpolating between neighbours, so the
# dummy-encoded columns may receive non-binary values — SMOTENC may be a better fit; confirm.
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X_train, y_train)

# Compare the shape of X before and after rebalancing: the resampled frame has
# more rows because synthetic cases with attrition = 1 were added.
print(
    f"Shape of X before SMOTE: {X_train.shape}\n"
    f"Shape of X after SMOTE: {X_sm.shape}"
)

# The mean of the 0/1 target is the share of attrition = 1 rows; 0.5 means balanced.
print(f"Balance of y variable: {y_sm.mean()}")

Shape of X before SMOTE: (2900, 47)
Shape of X after SMOTE: (4850, 47)
Balance of y variable: 0.5


In [54]:
# Reassemble target and features into single frames, target column first.
tr_sm = pd.concat([y_sm, X_sm], axis='columns')
test = pd.concat([y_test, X_test], axis='columns')

print(
    f"train shape: {tr_sm.shape}\n"
    f"test shape: {test.shape}"
)

train shape: (4850, 48)
test shape: (1429, 48)


In [56]:
# Write the SMOTE-resampled training frame and the held-out test frame to disk.
# index=True is to_csv's default: the row index becomes the first, unnamed column.
tr_sm.to_csv(path_or_buf=r'../data/train.csv', index=True)
test.to_csv(path_or_buf=r'../data/test.csv', index=True)