In [None]:
# --- 0) Imports & setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show up to 100 columns when a wide frame is displayed.
pd.set_option("display.max_columns", 100)
# Apply seaborn's default theme. `sns.set()` is only an alias of
# `set_theme()`, which seaborn documents as the preferred interface.
sns.set_theme()

# --- 1) Load data
# The CSV is read from the relative path below; change CSV_PATH if the
# notebook runs from a different working directory.
CSV_PATH = "../data/employee_attrition.csv"
df = pd.read_csv(CSV_PATH)

# Quick look: overall size, first rows, and the inferred column types.
print("Shape:", df.shape)
display(df.head())
display(df.dtypes)

# --- 2) Basic data health checks
# Fraction of missing cells per column, ordered from most to least missing.
missing_pct = df.isna().mean()
missing_pct = missing_pct.sort_values(ascending=False)
print("Missing % (top 10):")
# Take the ten worst columns, then express them as percentages with 2 decimals.
display(missing_pct.head(10).mul(100).round(2))

# --- 3) Make sure we have an Attrition column and normalize its values
# Candidate column names, in priority order (earlier entries win).
possible_cols = ["Attrition", "attrition", "Left", "left", "Attrition_Flag", "attrition_flag", "Exited", "exited"]

# Pass 1: exact match against the candidate names.
attr_col = next((c for c in possible_cols if c in df.columns), None)

if attr_col is None:
    # Pass 2: ignore case and surrounding whitespace so that headers such as
    # "ATTRITION" or " Left " are still recognised. Only reached when the
    # exact pass found nothing, so earlier behaviour is unchanged.
    by_folded = {}
    for col in df.columns:
        # Keep the first column for each folded name; labels may be non-strings.
        by_folded.setdefault(str(col).strip().casefold(), col)
    attr_col = next(
        (by_folded[c.casefold()] for c in possible_cols if c.casefold() in by_folded),
        None,
    )

if attr_col is None:
    raise ValueError(f"Could not find an attrition/exit column. Columns: {list(df.columns)[:20]} ...")

# Normalize to Yes/No strings for readability
def to_yes_no(x):
    """Map a raw attrition flag to the label "Yes" or "No".

    Matching ignores case and surrounding whitespace. Whole-number floats
    are read as integers first, so ``1.0`` / ``0.0`` are recognised as
    flags (pandas stores a 0/1 column as floats once it contains missing
    entries).

    Args:
        x: Raw cell value (string, number, bool, ...).

    Returns:
        "Yes" for yes/y/true/1, "No" for no/n/false/0; any other value is
        returned as ``str(x)`` unchanged (so a missing float becomes "nan").
    """
    key = x
    # float.is_integer() is False for NaN and infinities, so those values
    # skip the conversion and fall through to the plain str(x) below.
    if isinstance(key, float) and key.is_integer():
        key = int(key)

    s = str(key).strip().lower()
    if s in {"yes", "y", "true", "1"}:
        return "Yes"
    if s in {"no", "n", "false", "0"}:
        return "No"
    # Unrecognised labels keep their original text (based on x, not key).
    return str(x)

# Store the normalized labels next to the raw column.
df["Attrition_norm"] = df[attr_col].apply(to_yes_no)

# --- 4) Class balance
# Absolute frequency of each label, and the same split as percentages.
label_col = df["Attrition_norm"]
counts = label_col.value_counts(dropna=False)
rates = (label_col.value_counts(normalize=True, dropna=False) * 100).round(2)

print("\nAttrition counts:")
display(counts)
print("\nAttrition rates (%):")
display(rates)

# Bar plot of class balance
fig, ax = plt.subplots(figsize=(5, 4))
sns.barplot(x=counts.index, y=counts.values, ax=ax)
ax.set_title("Attrition Class Balance")
ax.set_xlabel("Attrition")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# --- 5) A couple of quick EDA plots (optional)
# Numeric distribution example: plot the first candidate column that exists.
numeric_candidates = ["Age", "age", "MonthlyIncome", "monthly_income", "Tenure", "tenure"]
num_col = next(filter(lambda name: name in df.columns, numeric_candidates), None)
if num_col is not None:
    fig, ax = plt.subplots(figsize=(6, 4))
    # Missing values are dropped before binning.
    sns.histplot(df[num_col].dropna(), bins=30, ax=ax)
    ax.set_title(f"Distribution of {num_col}")
    fig.tight_layout()
    plt.show()

# Attrition rate by a common categorical column (first candidate that exists)
category_candidates = ["Department", "department", "JobRole", "job_role"]
cat_col = next(filter(lambda name: name in df.columns, category_candidates), None)
if cat_col is not None:
    # Share of "Yes" labels within each category.
    labels_by_category = df.groupby(cat_col)["Attrition_norm"]
    yes_share = labels_by_category.apply(lambda labels: labels.eq("Yes").mean())
    tmp = yes_share.reset_index(name="attrition_rate")

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(data=tmp, x=cat_col, y="attrition_rate", ax=ax)
    ax.set_title(f"Attrition Rate by {cat_col}")
    ax.set_ylabel("Rate")
    # Tilt the category labels so long names do not overlap.
    plt.xticks(rotation=30, ha="right")
    fig.tight_layout()
    plt.show()
