In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings

# Suppress all Python warnings for the remainder of the notebook session.
# NOTE(review): with no category/module filter this also hides deprecation and
# convergence warnings from pandas / scikit-learn — consider narrowing the
# filter to the specific warning that motivated it.
warnings.filterwarnings('ignore')

In [2]:
print("1. MEMUAT DATA DAN INSPEKSI AWAL...")

# Read the semicolon-delimited CSV and discard the exported index column
# ('Unnamed: 0') in a single chained expression instead of mutating in place.
df = (
    pd.read_csv('german_credit_data1.csv', delimiter=';')
    .drop(columns=['Unnamed: 0'])
)
print(f"Dimensi data awal setelah drop indeks: {df.shape}")

# Column groups consumed by the preprocessing pipelines.
NUMERICAL_COLS = ['Age', 'Credit amount', 'Duration']
CATEGORICAL_COLS_NOMINAL = ['Sex', 'Housing', 'Purpose']
CATEGORICAL_COLS_MISSING = ['Saving accounts', 'Checking account']
# Assumption made by this notebook: 'Job' is used as an example
# classification target.
TARGET_COL = 'Job'

# Target vector first, then the feature matrix (every column except the target).
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions of the target the same in both splits, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")

1. MEMUAT DATA DAN INSPEKSI AWAL...
Dimensi data awal setelah drop indeks: (1000, 9)
X_train shape: (800, 8), X_test shape: (200, 8)


In [3]:
print("\n2. MEMBANGUN PIPELINE TRANSFORMASI...")

# Numeric columns: standardisation only. There is no imputation step here,
# so this pipeline assumes those columns contain no missing values.
numerical_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns listed in CATEGORICAL_COLS_MISSING: fill gaps with the
# most frequent category, then one-hot encode. Categories not seen during fit
# are ignored at transform time (encoded as all zeros) and the encoder
# returns a dense array.
categorical_missing_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Nominal categorical columns: one-hot encoded directly, with no imputation.
categorical_nominal_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group to its pipeline. Columns that are not named in any
# group are passed through unchanged rather than dropped.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('num', numerical_pipeline, NUMERICAL_COLS),
        ('cat_missing', categorical_missing_pipeline, CATEGORICAL_COLS_MISSING),
        ('cat_nominal', categorical_nominal_pipeline, CATEGORICAL_COLS_NOMINAL),
    ],
)


2. MEMBANGUN PIPELINE TRANSFORMASI...


In [4]:
print("\n3. FIT DAN TRANSFORM DATA...")

# Fit the preprocessor on the training split only, then transform it.
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the test split with the statistics learned from the training
# split (never re-fit on test data).
X_test_processed = preprocessor.transform(X_test)

# Take the output column names from the fitted ColumnTransformer instead of
# assembling them by hand: this always follows the transformer order and also
# covers any columns forwarded by remainder='passthrough'. The transformer
# emits names as '<transformer>__<feature>'; the prefix is stripped so the
# resulting columns keep their plain names (e.g. 'Age',
# 'Saving accounts_little').
feature_names = [
    name.split('__', 1)[-1]
    for name in preprocessor.get_feature_names_out()
]

# Guard against a silent mismatch between the names and the transformed matrix.
assert len(feature_names) == X_train_processed.shape[1], (
    f"Expected {X_train_processed.shape[1]} feature names, got {len(feature_names)}"
)

# Convert back to DataFrames for easier inspection and for the next
# preprocessing stage. The new frames get a fresh 0..n-1 index.
X_train_processed_df = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=feature_names)

# Reset the target index as well so it lines up row-for-row with the
# re-indexed feature frames above.
y_train_df = y_train.reset_index(drop=True)
y_test_df = y_test.reset_index(drop=True)

print(f"Dimensi X_train setelah preprocessing: {X_train_processed_df.shape}")
print(f"Fitur pertama setelah transformasi: {list(X_train_processed_df.columns[:5])}...")


3. FIT DAN TRANSFORM DATA...
Dimensi X_train setelah preprocessing: (800, 23)
Fitur pertama setelah transformasi: ['Age', 'Credit amount', 'Duration', 'Saving accounts_little', 'Saving accounts_moderate']...


In [None]:
print("\n4. PENYIMPANAN DATA...")

# Name each target series after the target column so it becomes a proper
# labelled column once placed next to the features.
train_target = y_train_df.rename(TARGET_COL)
test_target = y_test_df.rename(TARGET_COL)

# Append the target as the last column of each processed feature frame.
train_final = pd.concat([X_train_processed_df, train_target], axis='columns')
test_final = pd.concat([X_test_processed_df, test_target], axis='columns')

# Write both splits to CSV without the row index.
train_final.to_csv('data_train_preprocessed.csv', index=False)
test_final.to_csv('data_test_preprocessed.csv', index=False)

print(
    "--- PRA-PEMROSESAN I SELESAI ---",
    "Data siap untuk Pra-pemrosesan II (Feature Optimization):",
    "=> data_train_preprocessed.csv",
    "=> data_test_preprocessed.csv",
    sep="\n",
)


4. PENYIMPANAN DATA...
--- PRA-PEMROSESAN I SELESAI ---
Data siap untuk Pra-pemrosesan II (Feature Optimization):
=> data_train_preprocessed.csv
=> data_test_preprocessed.csv
