# EDA

In [38]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
# Make "simple_white" the default Plotly template for figures created after this point
pio.templates.default = "simple_white"

In [113]:
# Load the full training set from a pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle(r"train_enc.pkl")

In [115]:
# Look at the first 10 rows of the data
df.head(10)

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
4408,sales,0,0.853635,4,medium,5,0.259219,0,180.178543,1
5368,retail,0,0.661567,4,medium,6,0.311056,0,183.607849,0
4920,retail,0,0.591863,3,low,7,0.277298,0,185.288177,0
8402,engineering,0,0.65941,3,medium,7,0.698308,0,187.38562,1
5617,retail,0,0.653639,2,medium,6,0.219496,0,183.54158,0
8566,operations,0,0.697675,3,medium,7,0.761181,0,186.953751,1
4573,engineering,0,0.616492,4,medium,7,0.423596,0,186.499908,1
1497,sales,0,0.646672,3,high,6,0.573886,0,183.450073,0
7632,sales,0,0.601003,3,low,8,0.330493,0,189.882477,0
8877,engineering,0,0.579024,3,medium,8,0.541972,1,188.082138,1


In [20]:
# Are there any null values? Reports, per column, whether it contains at least one null
df.isnull().any()

department       False
promoted         False
review           False
projects         False
salary           False
tenure           False
satisfaction     False
bonus            False
avg_hrs_month    False
left             False
dtype: bool

In [8]:
# Sort the column names into three non-overlapping groups:
#   bi_vars  - columns with exactly two distinct values (nunique ignores nulls by default)
#   num_vars - int8 / float32 columns that are not already in bi_vars
#   cat_vars - every remaining column
bi_vars = [name for name in df.columns if df[name].nunique() == 2]
num_vars = [
    name
    for name in df.columns
    if name not in bi_vars and df[name].dtype in ["int8", "float32"]
]
cat_vars = [name for name in df.columns if not (name in num_vars or name in bi_vars)]

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns listed in num_vars
df[num_vars].describe()

Unnamed: 0,review,projects,tenure,satisfaction,avg_hrs_month
count,7632.0,7632.0,7632.0,7632.0,7632.0
mean,0.651796,3.274764,6.549528,0.504544,184.643692
std,0.085569,0.579746,1.418621,0.158432,4.158738
min,0.31,2.0,2.0,0.0,172.281433
25%,0.5927,3.0,5.0,0.386755,181.44352
50%,0.64715,3.0,7.0,0.500999,184.6353
75%,0.708725,4.0,8.0,0.621817,187.720585
max,1.0,5.0,11.0,1.0,198.582245


In [116]:
# Relabel the 0/1 flag columns as "Yes"/"No" so the plots below show readable categories.
# Values that are not keys of the lookup are passed through unchanged. This keeps the cell
# safe to re-run: a plain `.map(dict)` would turn the already-relabelled "Yes"/"No" values
# into NaN on a second execution.
yes_no = {1: "Yes", 0: "No"}
for flag in ("left", "bonus", "promoted"):
    df[flag] = df[flag].map(lambda value: yes_no.get(value, value))

### Target distribution

In [59]:
# Count plot of the target column `left`, one bar per value, with "Yes" ordered before "No"
fig = px.histogram(
    df,
    x="left",
    color="left",
    category_orders={"left": ["Yes", "No"]},
    width=600,
    color_discrete_sequence=["#ff4242", "#adb5bd"],
)
fig.update_layout(title_text="Attrition distribution")
fig.show()

## Univariate distributions

### Categorical distributions

In [158]:
# Count histograms of the categorical columns, one per panel on a 2x2 grid.
# Maps subplot title -> column name; dict order is the panel order
# (left to right, then top to bottom).
categorical_panels = {
    "Bonus": "bonus",
    "Promoted": "promoted",
    "Department": "department",
    "Salary": "salary",
}
n_rows, n_cols = 2, 2

fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=list(categorical_panels))

for position, column in enumerate(categorical_panels.values()):
    # divmod gives the 0-based (row, col) of this panel; Plotly grid cells are 1-based
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(go.Histogram(x=df[column]),
                  row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=500, width=700,
                  title_text="Categorical distributions", showlegend=False)

fig.show()

### Numerical distributions

In [163]:
# Histograms of the numeric columns, one per panel on a 2x3 grid (the sixth cell stays empty).
# Each column name doubles as its subplot title; list order is the panel order
# (left to right, then top to bottom).
numeric_panels = ["tenure", "projects", "review", "satisfaction", "avg_hrs_month"]
n_rows, n_cols = 2, 3

fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=numeric_panels)

for position, column in enumerate(numeric_panels):
    # divmod gives the 0-based (row, col) of this panel; Plotly grid cells are 1-based
    grid_row, grid_col = divmod(position, n_cols)
    fig.add_trace(go.Histogram(x=df[column]),
                  row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=500, width=800,
                  title_text="Numerical distributions", showlegend=False)

fig.show()