<a href="https://colab.research.google.com/github/firmanmaulana123/eda-boston-housing/blob/main/AI_DESSECIONTREE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# Mount Google Drive under /content/drive so files stored there can be read
# by path later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

# Install the libraries used below. numpy, scipy, pandas, xgboost,
# scikit-learn and imbalanced-learn are pinned to exact versions; seaborn and
# matplotlib are left unpinned. The leading "!" is an IPython shell escape, so
# these lines only work when run inside a notebook.
# NOTE(review): changing the numpy/pandas version in a live session may need a
# runtime restart before the imports that follow see the pinned builds — confirm.
!pip install --upgrade numpy==1.23.5 scipy==1.9.3 pandas==1.5.3
# --no-cache-dir makes pip skip its local cache for this install.
!pip install xgboost==1.7.3 --no-cache-dir
!pip install scikit-learn==1.2.2 imbalanced-learn==0.10.1 seaborn matplotlib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

# Replace this with the path to your own copy of the CSV on Google Drive.
path_csv = '/content/drive/MyDrive/Datasets/WA_Fn-UseC_-HR-Employee-Attrition.csv'

# Read the dataset into a DataFrame and report its dimensions.
df = pd.read_csv(path_csv)
n_rows, n_cols = df.shape
print(f"Data loaded: {n_rows} rows, {n_cols} columns")

# First look at how the raw target column is distributed.
sns.countplot(data=df, x='Attrition')
plt.title('Distribusi Karyawan berdasarkan Status Attrition')
plt.show()

# Initial preprocessing: recode the target from 'Yes'/'No' to 1/0 and remove
# three columns so they are not carried into the rest of the analysis.
attrition_codes = {'Yes': 1, 'No': 0}
df['Attrition'] = df['Attrition'].map(attrition_codes)
df = df.drop(columns=['EmployeeNumber', 'Over18', 'StandardHours'])

# Preview the first rows (a notebook renders this only when it is the last
# expression of its cell).
df.head()

# Two count plots drawn with the same recipe:
#   1. overall attrition distribution
#   2. attrition split by department
count_plot_specs = (
    ((6, 4), {'x': 'Attrition'}, "Attrition Distribution (Imbalanced Dataset)"),
    ((10, 5), {'x': 'Department', 'hue': 'Attrition'}, "Attrition by Department"),
)
for fig_size, plot_kwargs, plot_title in count_plot_specs:
    plt.figure(figsize=fig_size)
    sns.countplot(data=df, **plot_kwargs)
    plt.title(plot_title)
    plt.show()

# Correlation of every numeric column with the target, using pandas only.
# The result is a one-column frame ordered from the largest coefficient down.
attrition_corr = df.corr(numeric_only=True)[['Attrition']]
corr_matrix = attrition_corr.sort_values(by='Attrition', ascending=False)
# Colour the cells with a diverging palette (rendered by the notebook when this
# is the last expression of a cell).
corr_matrix.style.background_gradient(cmap='coolwarm')

# Label encode categoricals.
# Each object-dtype column gets its own LabelEncoder, and the fitted encoders
# are kept in `label_encoders` (column name -> encoder). Refitting one shared
# encoder would discard every mapping except the last, leaving no way to turn
# the integer codes back into the original category labels.
cat_cols = df.select_dtypes(include='object').columns
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # Values are cast to str before fitting so the encoder sees a uniform type.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Create meaningful numerical features.
# Years spent at the company up to the most recent promotion. This is stored
# under its own name instead of being written back into
# 'YearsSinceLastPromotion', so the source column keeps its meaning and
# re-running this cell yields the same values every time.
df['YearsBeforeLastPromotion'] = df['YearsAtCompany'] - df['YearsSinceLastPromotion']
# The +1 keeps the denominator non-zero even for an Age of 0.
df['IncomeToAgeRatio'] = df['MonthlyIncome'] / (df['Age'] + 1)
df['TenureToAgeRatio'] = df['TotalWorkingYears'] / (df['Age'] + 1)

# Safe binning with NaN handling.
# Ages are cut into four integer-coded groups with edges 18/30/40/50/60.
# include_lowest=True closes the first interval on the left, so Age == 18 is
# assigned to group 0; without it that age falls outside every bin and would
# be coded -1. Values that still fall outside the edges (or are missing)
# become -1.
df['AgeGroup'] = (
    pd.cut(df['Age'], bins=[18, 30, 40, 50, 60], labels=False, include_lowest=True)
    .fillna(-1)
    .astype(int)
)
# Monthly income is split into quantile-based groups (up to four; duplicate
# bin edges are dropped rather than raising), with the same -1 fallback.
df['IncomeGroup'] = (
    pd.qcut(df['MonthlyIncome'], q=4, labels=False, duplicates='drop')
    .fillna(-1)
    .astype(int)
)

# Sanity check before modelling: show the per-column count of missing values
# and the dtype of every column.
for heading, report in (
    ("Missing values per column:", df.isna().sum()),
    ("\nData types after processing:", df.dtypes),
):
    print(heading)
    print(report)

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Separate the target from the feature columns, then hold out 20% of the rows
# as a test set. The split is seeded and stratified on the target.
target_col = 'Attrition'
y = df[target_col]
X = df.drop(columns=[target_col])
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Class weighting for XGBoost: ratio of negative (0) to positive (1) labels in
# the training split (the original author notes this comes out at ~5.2).
n_negative = len(y_train[y_train == 0])
n_positive = len(y_train[y_train == 1])

# Model settings (labelled "CPU parameters" by the original author), with the
# class weight included alongside the fixed hyper-parameters.
params = dict(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    tree_method='hist',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    scale_pos_weight=n_negative / n_positive,
)

# Build the classifier from the settings above and fit it on the training split.
model = XGBClassifier(**params)
model.fit(X_train, y_train)
print("Model trained successfully")

from sklearn.metrics import classification_report, roc_auc_score

# Predicted class labels for the test split, plus the predicted probabilities
# taken from column index 1 of predict_proba.
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

# Per-class precision/recall/F1 summary.
report_text = classification_report(y_test, y_pred)
print("Classification Report:")
print(report_text)

# Area under the ROC curve, computed from the probabilities, shown to 2 decimals.
auc_score = roc_auc_score(y_test, y_proba)
print(f"\n AUC-ROC Score: {auc_score:.2f}")

# Feature importance: pair each feature name with the fitted model's
# importance score, then order the table from most to least important.
importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': model.feature_importances_}
)
importance = importance.sort_values(by='Importance', ascending=False)
print("\n Top 5 Features:")
print(importance.head(5))

# Basic confusion matrix: actual labels as rows, predicted labels as columns
# (rendered by the notebook when this is the last expression of a cell).
pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)

import plotly.express as px

# Simple interactive visualization: monthly income against job satisfaction,
# coloured by attrition status.
# 'Attrition' holds 0/1 integers by this point, and Plotly Express colours a
# numeric column with a continuous colour bar. Mapping the codes to 'No'/'Yes'
# on a plotting copy gives two discrete legend entries instead; `df` itself is
# left untouched.
plot_df = df.assign(Attrition=df['Attrition'].map({0: 'No', 1: 'Yes'}))
fig = px.scatter(
    plot_df,
    x='MonthlyIncome',
    y='JobSatisfaction',
    color='Attrition',
    title='Income vs Satisfaction by Attrition Status'
)
fig.show()

# Feature importance plot: rebuild the ranking table from the fitted model
# (feature name + importance score, highest score first) and chart it.
importance = pd.DataFrame(
    dict(Feature=X.columns, Importance=model.feature_importances_)
)
importance = importance.sort_values(by='Importance', ascending=False)

# Left as a bare expression so the notebook renders the returned figure.
px.bar(
    data_frame=importance,
    x='Importance',
    y='Feature',
    title='Feature Importance',
)



MessageError: Error: credential propagation was unsuccessful