In [None]:
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the IBM HR attrition CSV, using the EmployeeNumber column as the row index.
df = pd.read_csv(
    '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv',
    index_col=['EmployeeNumber'],
)
df.head()

In [None]:
# Summary statistics via DataFrame.describe(); with default arguments this
# covers the numeric columns (count, mean, std, min, quartiles, max).
df.describe()

In [None]:
# Count the missing entries in every column (all zeros means nothing to impute).
df.isna().sum()

In [None]:
# Show the dtype of every column; later cells split the columns by dtype
# (int64 vs. object).
df.dtypes

### Create a Visualization of employee attrition rates

In [None]:
# Bar chart of the Attrition classes; each bar is labelled with its share of
# all employees.
ncount = len(df)
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(x="Attrition", data=df, palette="husl", ax=ax)
ax.set_title('% of Employees Who Engage in Attrition')
ax.set_xlabel('Attrition')

for bar in ax.patches:
    box = bar.get_bbox()
    # Horizontal centre and top edge of the bar, in data coordinates.
    x_mid = (box.x0 + box.x1) / 2
    top = box.y1
    ax.annotate('{:.1f}%'.format(100. * top / ncount), (x_mid, top),
                ha='center', va='bottom')  # centre the label just above the bar

### Create a heat map to check if there is any correlation between independent variables

In [None]:
# Keep only the int64 columns, minus EmployeeCount and StandardHours, which are
# constant in this dataset (always 1 and 80) and therefore carry no information.
df_num = (
    df
    .select_dtypes(include=['int64'])
    .drop(columns=['EmployeeCount', 'StandardHours'])
)
df_num

In [None]:
# Pairwise correlation of the numeric columns, drawn as a heat map.
heat_fig, heat_ax = plt.subplots(figsize=(8, 8), dpi=100)
sns.heatmap(df_num.corr(), cmap="twilight_shifted", ax=heat_ax)

### Because `YearsAtCompany`, `YearsInCurrentRole`, `YearsSinceLastPromotion`, and `YearsWithCurrManager` are all heavily correlated with one another, we drop them from the DataFrame to reduce multicollinearity.

In [None]:
# Remove the four mutually correlated tenure columns. drop() returns a new
# frame, so df_num itself is left untouched.
tenure_cols = [
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
df_mc = df_num.drop(columns=tenure_cols)
df_mc

In [None]:
#rejoin two dataframes together
df_obj = df.select_dtypes('object')
df_data = df_obj.join(df_mc, on = 'EmployeeNumber')
df_data

In [None]:
# Encode the target as integers: 'Yes' -> 1 (employee left), 'No' -> 0.
att_score = {'Yes':1, 'No':0}

# Series.map does the dictionary lookup in one vectorised call. A label that is
# not in att_score becomes NaN, so validate before overwriting the column: this
# keeps the failure loud (as the original KeyError was) for unexpected labels
# or when the cell is re-run on already-encoded data.
attrition_encoded = df_data['Attrition'].map(att_score)
assert attrition_encoded.notna().all(), (
    "Attrition contains values other than 'Yes'/'No' "
    "(was this cell already run on encoded data?)"
)
df_data['Attrition'] = attrition_encoded.astype('int64')
df_data.head()

In [None]:
# One-hot encode the remaining object-dtype columns with pandas' get_dummies.
# NOTE(review): drop_first is not set, so every category level gets its own
# column and each group of dummy columns is linearly dependent — confirm this
# is acceptable for whatever model is trained on these features.
df_data = pd.get_dummies(df_data)
df_data.head()

In [None]:
# Separate the label from the predictors, then hold out 40% of the rows for testing.
features = df_data.drop(columns='Attrition')
target = df_data['Attrition']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=10,  # fixed seed so the split is reproducible
)