# Eksperimen MSML - EkoAndriPrasetyo

Dataset raw: `creditscoring_raw/creditscoring_raw.csv`

Output preprocessing: `preprocessing/creditscoring_preprocessing/creditscoring_preprocessed.csv`

Notebook ini berisi:
1. Data Loading
2. EDA
3. Preprocessing (imputasi dan casting tipe data)
4. Save dataset preprocessed


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Input (raw CSV) and output (preprocessed CSV) locations; both are relative
# paths, so they resolve against the current working directory.
RAW_PATH = Path('../creditscoring_raw/creditscoring_raw.csv')
OUT_PATH = Path('./creditscoring_preprocessing/creditscoring_preprocessed.csv')

# Show the raw path and whether it exists before any cell tries to read it.
print('RAW_PATH:', RAW_PATH)
raw_file_found = RAW_PATH.exists()
print('Exists:', raw_file_found)


## 1) Data Loading


In [None]:
# Read the raw CSV into the working DataFrame used by every later cell.
df = pd.read_csv(RAW_PATH)

# Preview the first five rows (the default for head()).
df.head(n=5)


## 2) EDA


In [None]:
# Print column dtypes and non-null counts.
df.info()

# Summary statistics for all columns (numeric and non-numeric), transposed so
# each original column becomes a row; show at most the first 30 of them.
summary = df.describe(include='all').transpose()
summary.head(30)


In [None]:
# Count missing values per column, largest count first.
null_counts = df.isna().sum()
missing = null_counts.sort_values(ascending=False)

# Display only the columns that actually contain missing values.
missing.loc[missing > 0]


In [None]:
# Flag rows that repeat an earlier row exactly, then count them.
duplicate_mask = df.duplicated()
duplicate_mask.sum()


In [None]:
# Class distribution of the label column; NaN is counted as its own bucket
# so missing labels are visible.
target_column = df['target']
target_column.value_counts(dropna=False)


In [None]:
# Column groups: the numeric group is plotted here; both names stay
# module-level so other cells can reference them.
num_cols = [
    'age',
    'monthly_income',
    'loan_amount',
    'tenure_months',
    'num_credit_lines',
    'has_previous_default',
]
cat_cols = [
    'job_type',
    'education_level',
    'city',
    'marital_status',
]

# Draw one histogram per column in the numeric group.
numeric_view = df[num_cols]
numeric_view.hist(figsize=(10, 6))
plt.tight_layout()
plt.show()


## 3) Preprocessing

Imputasi:
- Numerik: median
- Kategorikal: most_frequent

Setelah imputasi, kolom `target` di-cast ke integer dan kolom kategorikal di-cast ke string.

One-hot encoding dilakukan di modelling (Pipeline), agar model bisa menerima input raw saat serving.


In [None]:
from sklearn.impute import SimpleImputer

# Work on a copy so the raw DataFrame stays untouched.
out = df.copy()

# Fill missing values group by group: median for the numeric columns, most
# frequent value for the categorical ones. Each imputer is fitted on the full
# frame (no train/test split happens in this notebook).
for columns, strategy in ((num_cols, 'median'), (cat_cols, 'most_frequent')):
    imputer = SimpleImputer(strategy=strategy)
    out[columns] = imputer.fit_transform(out[columns])

# Cast the label to int and every categorical column to str in one pass.
# NOTE(review): the int cast fails if `target` contains NaN - confirm that the
# label column is complete.
dtype_map = {name: str for name in cat_cols}
dtype_map['target'] = int
out = out.astype(dtype_map)

out.head()


## 4) Save Dataset Preprocessed


In [None]:
# Create the destination folder (and any missing parents); do nothing if it
# already exists.
output_dir = OUT_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)

# Write the preprocessed frame without the index column, then confirm the
# file is on disk.
out.to_csv(OUT_PATH, index=False)
print('Saved:', OUT_PATH, 'exists:', OUT_PATH.exists())
