# Preprocessing Kredit Nasabah
Notebook reproducing the preprocessing pipeline.

## 1. Load data
Load the dataset from the provided Excel file.

In [None]:
import pandas as pd

# Read the customer-credit workbook into a DataFrame; `io` is the name of
# read_excel's first parameter, spelled out here for readability.
df = pd.read_excel(io=r'/mnt/data/DATASET KREDIT NASABAH.xlsx')
# Preview the first rows as the cell output.
df.head()

## 2. Inspect data
Basic info and missing values.

In [None]:
# Print the column dtypes and non-null counts.
df.info()

# Per-column count of missing values (shown as the cell output).
# `isna` is the canonical name of the `isnull` alias.
df.isna().sum()

## 3. Imputation (median for numeric, mode for categorical)

In [None]:
from sklearn.impute import SimpleImputer

# Split the columns by dtype: numeric columns are filled with the median,
# object (text) columns with their most frequent value.
num_cols = df.select_dtypes(include=['number']).columns
cat_cols = df.select_dtypes(include=['object']).columns
num_imp = SimpleImputer(strategy='median')
cat_imp = SimpleImputer(strategy='most_frequent')

# NOTE(review): both imputers are fitted on the full frame, i.e. before the
# train/test split performed later in this notebook - confirm this is intended.

# SimpleImputer refuses input with zero columns, so only impute the dtype
# groups that are actually present in the dataset.
if not num_cols.empty:
    df[num_cols] = num_imp.fit_transform(df[num_cols])
if not cat_cols.empty:
    df[cat_cols] = cat_imp.fit_transform(df[cat_cols])
df.head()

## 4. Handling rare categories
Combine categories whose relative frequency is below 1% into a single 'Lain-Lain' (others) category.

In [None]:
# Minimum share of rows a category needs in order to keep its own label.
threshold = 0.01
for col in cat_cols:
    # Relative frequency of every category in this column.
    freqs = df[col].value_counts(normalize=True)
    rare = freqs.index[freqs < threshold].tolist()
    if rare:
        # Vectorized replacement: keep values that are not rare, relabel the
        # rest. Avoids a Python-level list scan for every row.
        df[col] = df[col].where(~df[col].isin(rare), 'Lain-Lain')
df.head()

## 5. One-Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so prefer the new name and fall back only on older installs.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

# Dense indicator matrix; the first level of each column is dropped.
enc = encoder.fit_transform(df[cat_cols])
enc_cols = encoder.get_feature_names_out(cat_cols)

# Replace the categorical columns by their indicators. The index is reset so
# both parts share the same 0..n-1 index and line up row by row.
df_enc = pd.concat(
    [
        df.drop(columns=cat_cols).reset_index(drop=True),
        pd.DataFrame(enc, columns=enc_cols),
    ],
    axis=1,
)
df_enc.head()

## 6. Outlier capping using IQR with multiplier 3 (Winsorization)

In [None]:
# Cap every continuous numeric column at [Q1 - 3*IQR, Q3 + 3*IQR].
for col in df_enc.select_dtypes(include=['number']).columns:
    # Two-valued columns (e.g. the one-hot indicators) are flags, not
    # measurements: capping an imbalanced one would overwrite its minority
    # value and destroy the feature.
    if df_enc[col].nunique() <= 2:
        continue
    Q1 = df_enc[col].quantile(0.25)
    Q3 = df_enc[col].quantile(0.75)
    IQR = Q3 - Q1
    # With a zero IQR both bounds coincide and clipping would flatten the
    # whole column to a single constant, so leave such columns untouched.
    if IQR == 0:
        continue
    lower = Q1 - 3*IQR
    upper = Q3 + 3*IQR
    df_enc[col] = df_enc[col].clip(lower=lower, upper=upper)
df_enc.describe()

## 7. Scaling (StandardScaler) - fit on train only

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df_enc.copy()
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

# The split returns row selections of X; take explicit copies so the column
# assignments below write to independent frames and do not raise pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
num_cols = X_train.select_dtypes(include=['number']).columns
# Learn mean and standard deviation from the training rows only, then apply
# the same transformation to the test rows.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
X_train.head()

## 8. Save processed data and artifacts

In [None]:
# Write both scaled splits to CSV in the working directory, leaving out the
# index column.
for frame, path in ((X_train, 'X_train_scaled.csv'), (X_test, 'X_test_scaled.csv')):
    frame.to_csv(path, index=False)