# Employee Attrition - Exploratory Data Analysis

This notebook explores the IBM HR Analytics Employee Attrition dataset to understand patterns and factors influencing employee turnover.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from src.ingest import download_dataset

# Load the data through the project's ingest helper (imported from src.ingest).
# NOTE(review): presumably returns a pandas DataFrame -- it is used with
# .shape, .head() and .groupby() below; whether it downloads or reads a local
# cache is not visible from this notebook.
df = download_dataset()
print(f"Dataset shape: {df.shape}")
# Bare last expression: the first rows are rendered as the cell's output.
df.head()

## Dataset Overview

In [None]:
# Structural summary of the frame: number of columns per dtype, total count of
# missing cells, and the class balance of the target column.
print("Data types:")
print(df.dtypes.value_counts())
# isna().sum() counts per column; the second sum() collapses that to one total.
print(f"\nMissing values: {df.isna().sum().sum()}")
# Plain string literal: there is nothing to interpolate on this line.
print("\nAttrition distribution:")
# normalize=True reports proportions instead of raw counts.
print(df['Attrition'].value_counts(normalize=True))
# Bare last expression: summary statistics are rendered as the cell's output.
df.describe()

## Attrition Rate Analysis

In [None]:
# Donut chart (a pie with a 40% hole) of the overall Yes/No attrition split.
fig = px.pie(
    data_frame=df,
    names='Attrition',
    color='Attrition',
    color_discrete_map=dict(Yes='#E74C3C', No='#1E3A5F'),
    hole=0.4,
    title='Overall Attrition Distribution',
)
fig.show()

In [None]:
# Attrition by department
# The mean of a boolean "Attrition == 'Yes'" flag is the attrition rate, so the
# flag is grouped and averaged directly (vectorized) rather than calling a
# Python lambda once per group.
dept = (
    df['Attrition'].eq('Yes')
    .groupby(df['Department'])
    .mean()
    .rename('Attrition Rate')
    .reset_index()
)
fig = px.bar(dept, x='Department', y='Attrition Rate', title='Attrition Rate by Department',
             color='Attrition Rate', color_continuous_scale=['#27AE60', '#F39C12', '#E74C3C'])
# The rate is a fraction in [0, 1]; show the y-axis ticks as whole percentages.
fig.update_layout(yaxis_tickformat='.0%')
fig.show()

## Key Factor Analysis

In [None]:
# Overtime impact
# Attrition rate per OverTime value: average the boolean "Attrition == 'Yes'"
# flag within each group (vectorized; no per-group Python lambda).
ot = (
    df['Attrition'].eq('Yes')
    .groupby(df['OverTime'])
    .mean()
    .rename('Attrition Rate')
    .reset_index()
)
fig = px.bar(ot, x='OverTime', y='Attrition Rate', title='Attrition Rate by Overtime',
             color='OverTime', color_discrete_map={'Yes': '#E74C3C', 'No': '#27AE60'})
# The rate is a fraction in [0, 1]; show the y-axis ticks as whole percentages.
fig.update_layout(yaxis_tickformat='.0%')
fig.show()

In [None]:
# Income distribution by attrition
# One box per Attrition value, comparing the spread of MonthlyIncome.
fig = px.box(
    data_frame=df,
    x='Attrition',
    y='MonthlyIncome',
    color='Attrition',
    color_discrete_map=dict(Yes='#E74C3C', No='#1E3A5F'),
    title='Monthly Income by Attrition',
)
fig.show()

In [None]:
# Age distribution
# Overlaid (not stacked) histograms per Attrition value; partial opacity keeps
# both distributions visible where they overlap.
fig = px.histogram(
    data_frame=df,
    x='Age',
    color='Attrition',
    color_discrete_map=dict(Yes='#E74C3C', No='#1E3A5F'),
    barmode='overlay',
    opacity=0.7,
    title='Age Distribution by Attrition',
)
fig.show()

In [None]:
# Satisfaction factors
# One bar chart per rating column showing the attrition rate at each rating
# level. NOTE(review): the columns are presumably ordinal survey scores --
# confirm against the dataset's data dictionary.
satisfaction_cols = ['JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance']

# The "Attrition == 'Yes'" flag does not depend on the loop variable, so it is
# computed once and reused for every column.
left_flag = df['Attrition'].eq('Yes')

for col in satisfaction_cols:
    # Mean of the boolean flag within each rating level = attrition rate.
    sat = left_flag.groupby(df[col]).mean().rename('Attrition Rate').reset_index()
    fig = px.bar(sat, x=col, y='Attrition Rate', title=f'Attrition Rate by {col}',
                 color='Attrition Rate', color_continuous_scale=['#27AE60', '#F39C12', '#E74C3C'])
    # The rate is a fraction in [0, 1]; show the y-axis ticks as whole percentages.
    fig.update_layout(yaxis_tickformat='.0%')
    fig.show()

## Correlation Analysis

In [None]:
# Numeric correlation with attrition
# Encode the target as 0/1 in a new frame; `assign` returns a new object, so
# `df` itself is left untouched.
df_numeric = df.assign(Attrition_Binary=df['Attrition'].eq('Yes').astype(int))

# Pearson correlation of every numeric column with the binary target.
# Zero-variance (constant) columns have an undefined correlation (NaN); they
# are dropped so the chart does not contain blank, unsortable bars.
correlations = (
    df_numeric.select_dtypes(include='number')
    .corr()['Attrition_Binary']
    .drop('Attrition_Binary')  # a column's correlation with itself is trivially 1
    .dropna()
    .sort_values()
)

# Horizontal bars sorted by correlation; the diverging colour scale is centred
# on zero so negative and positive associations are visually distinct.
fig = px.bar(x=correlations.values, y=correlations.index, orientation='h',
             title='Feature Correlation with Attrition',
             labels={'x': 'Correlation', 'y': 'Feature'},
             color=correlations.values,
             color_continuous_scale=['#1E3A5F', '#F0F2F6', '#E74C3C'],
             color_continuous_midpoint=0)
fig.update_layout(height=600)
fig.show()