In [0]:
import itertools
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names, so pick whichever one this
# installation actually provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'

plt.style.use(['ggplot', seaborn_style])


In [0]:
# Location of the raw dataset, relative to the notebook's working directory.
in_dir = 'data'
in_data = os.path.join(in_dir, 'employee-attrition.csv')

# Read the whole CSV into memory as the working DataFrame.
df = pd.read_csv(filepath_or_buffer=in_data)


In [0]:
# Print column names, dtypes and non-null counts to check for missing values.
df.info()


 There aren't any missing values, and our target variable appears to be categorical.

In [0]:
# Summary statistics of the numeric columns, one row per feature.
df.describe().transpose()


In [0]:
# Show ten random rows; the fixed seed keeps the displayed sample identical
# across "Restart & Run All" executions.
df.sample(10, random_state=42)


 There are only two possible values for the target variable, and it is highly imbalanced, so we will need to balance it before training the model. Let us convert it to a numeric encoding.

In [0]:
# Class balance of the target: number of rows per Attrition value.
data = df['Attrition'].value_counts()

# x and y are passed by keyword: recent seaborn releases no longer accept
# them as positional arguments.
_ = sns.barplot(x=data.index, y=data.values, palette='muted')


In [0]:
# Encode the target as integers (Yes -> 1, No -> 0).
# A single mapping followed by an explicit cast gives the column a real
# integer dtype (masked .loc writes leave it as object dtype, which is not
# treated as numeric downstream). The 1/0 keys keep the cell safe to re-run
# after the column has already been encoded; any unexpected label becomes
# NaN and makes the cast fail loudly instead of passing through silently.
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0}).astype('int64')


 Let us check correlation between variables.

In [0]:
# Pairwise correlation between the numeric columns only. Selecting them
# explicitly is required on pandas >= 2.0, where DataFrame.corr() raises on
# string columns instead of silently skipping them.
corr = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(15, 15))

# Assign the return value so the Axes repr is not echoed below the figure.
_ = sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f', linewidths=.5, ax=ax)


In [0]:
# Flag the features whose absolute correlation with the target exceeds 0.5.
corr['Attrition'].abs() > 0.5


 There doesn't seem to be a high correlation between any of the variables and the target, but some features are highly correlated with each other and are worth investigating further to see whether they can be dropped. In particular:
 - JobLevel is almost perfectly correlated with MonthlyIncome
 - EmployeeCount and StandardHours each hold the same value in every row and can probably be dropped from the dataset.
 - Age highly correlates with JobLevel, MonthlyIncome and TotalWorkingYears
 - JobLevel highly correlates with TotalWorkingYears and YearsAtCompany
 - MonthlyIncome highly correlates with TotalWorkingYears and YearsAtCompany
 - PercentSalaryHike highly correlates with PerformanceRating

 Let us check the categorical features.

In [0]:
# Count / unique / top / freq summary of the object-dtype (categorical) columns.
df.describe(include=['object'])


 It appears that Over18 has only one value and can be dropped from the dataset.

In [0]:
# Columns removed before modelling. EmployeeCount, StandardHours and Over18
# were found above to hold a single value; EmployeeNumber is presumably a
# per-row identifier -- TODO confirm.
to_drop = ['EmployeeCount', 'StandardHours', 'Over18', 'EmployeeNumber']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=to_drop, errors='ignore')


In [0]:


def PlotDists(feature, position):
    '''
    Draw a bar chart of the mean of ``Attrition`` per category of ``feature``.

    Parameters
    ----------
    feature : str
        Name of the column of the notebook-level ``df`` shown on the x-axis.
    position : matplotlib.axes.Axes
        Axes the bars are drawn on.

    Notes
    -----
    The bar height is the mean of ``df['Attrition']`` within each category,
    so it reads as an attrition probability only if the target is encoded
    as 0/1.
    '''
    # An axes-level barplot is used because the figure-level factorplot
    # (since removed from seaborn) creates its own figure and cannot draw
    # on the axes supplied by the caller.
    sns.barplot(x=feature, y='Attrition', data=df, palette='muted', ax=position)

    sns.despine(ax=position, left=True)

    position.set_ylabel('Attrition probability')



In [0]:
# Categorical features whose relationship with the target is inspected.
to_plot = ['BusinessTravel', 'Department', 'EducationField', 'Gender',
           'JobRole', 'MaritalStatus', 'OverTime']

fig, ax = plt.subplots(4, 2, figsize=(20, 20), sharex=False, sharey=False)

# Flatten the 4x2 grid so it can be paired one-to-one with the features.
ax = ax.ravel()

for feature, axis in zip(to_plot, ax):
    # Axes-level barplot: the figure-level factorplot (since removed from
    # seaborn) cannot draw into an existing subplot.
    sns.barplot(x=feature, y='Attrition', data=df, palette='muted', ax=axis)

    sns.despine(ax=axis, left=True)

    axis.set_ylabel('Attrition probability')

# The grid has more panels than features; hide the ones left empty.
for axis in ax[len(to_plot):]:
    axis.set_visible(False)

fig.tight_layout()


In [0]:

# List every subplot together with its position in the flattened grid.
[(index, axis) for index, axis in enumerate(ax.ravel())]


In [0]:
# The array of subplots is bound to `ax` in the cells above; `axs` is never
# defined there and would raise a NameError on a fresh kernel.
ax.ravel()

