In [None]:
# Import library utama
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Optional: cleaner default styling for all seaborn/matplotlib figures
sns.set(style="whitegrid")

# Load the raw dataset; read_csv treats the first row of the file as the
# column names by default, so no explicit header handling is needed.
data = pd.read_csv("../Data/raw/WA_Fn-UseC_-HR-Employee-Attrition.csv")

# Work on a copy so the frame loaded from disk stays untouched
df = data.copy()

# Preview the first 5 rows. This must be the final expression of the cell:
# Jupyter only renders the value of a cell's last expression, so a head()
# call placed before another statement is evaluated and silently discarded.
data.head()

In [None]:
# Dataset dimensions as a (rows, columns) tuple
df.shape

In [None]:
# Per-column overview: dtype and non-null count
df.info()



In [None]:
# Summary statistics; called without arguments, describe() reports on the
# numeric columns of a mixed-dtype frame
df.describe()

In [None]:
# Summary statistics (count, unique, top, freq) for the object-dtype columns
df.describe(include='object')

In [None]:
# Split the column names by dtype: first the numeric columns, then the
# object-dtype (categorical) columns.
num_cols, cat_cols = (
    list(df.select_dtypes(include=dtype_group).columns)
    for dtype_group in ('number', 'object')
)

# Display both lists together
(num_cols, cat_cols)

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Missing-value counts as a two-column table: column name and number of nulls
missing_summary = (
    df.isna()
    .sum()
    .rename_axis('Column')
    .reset_index(name='MissingCount')
)
missing_summary

In [None]:
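# Checklist of what the overview cells above should establish:
# - number of rows and columns
# - data type of each column
# - distribution of values (numeric and categorical)
# - which columns contain missing values
# - the numeric and categorical column lists used in the analysis that follows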

In [None]:
# Distribution of every numeric column: histogram with a KDE overlay.
# The column list is rebuilt here so the cell does not rely on an earlier one.
num_cols = list(df.select_dtypes(include='number').columns)

for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x=col, kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()

In [None]:
# Frequency of each category for every object-dtype column.
# The column list is rebuilt here so the cell does not rely on an earlier one.
cat_cols = list(df.select_dtypes(include='object').columns)

for col in cat_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_title(f'Count of {col}')
    # Tilt the category labels so long names do not overlap
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap
corr = df[num_cols].corr()

# A zero-variance column has an undefined (NaN) correlation with every column,
# itself included. Drop such all-NaN rows/columns so they do not show up as
# blank bands in the plot.
corr = corr.dropna(axis=0, how='all').dropna(axis=1, how='all')

# The matrix is square, so draw it on a square canvas that grows with the
# number of columns; a fixed, very wide and short figure stretches the cells
# and leaves no vertical room for the annotations.
side = max(8, 0.6 * len(corr.columns))
fig, ax = plt.subplots(figsize=(side, side))

sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',        # uniform two-decimal annotations
    cmap='coolwarm',
    vmin=-1,          # pin the diverging palette to the full correlation
    vmax=1,           # range so that the neutral colour always means 0
    center=0,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Attrition counts split by each categorical feature; the target column is
# filtered out so it is never plotted against itself.
for col in (name for name in cat_cols if name != 'Attrition'):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x='Attrition', hue=col, ax=ax)
    ax.set_title(f'Attrition by {col}')
    plt.show()

# Box plot of each numeric feature, grouped by attrition status
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x='Attrition', y=col, ax=ax)
    ax.set_title(f'{col} vs Attrition')
    plt.show()