In [1]:
# data_processing.py
"""
Script untuk preprocessing data churn prediction
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib

def load_data(
    url="https://raw.githubusercontent.com/IBM/telco-customer-churn-on-icp4d/master/data/Telco-Customer-Churn.csv",
):
    """Load the churn dataset into a DataFrame.

    Args:
        url: Location of the CSV file. It is handed straight to
            ``pandas.read_csv``, so a URL, a local path or a file-like
            object all work. Defaults to the IBM Telco customer-churn CSV
            hosted on GitHub.

    Returns:
        The parsed CSV as a ``pandas.DataFrame``.
    """
    print("📥 Memuat dataset...")
    df = pd.read_csv(url)
    n_rows, n_cols = df.shape
    print(f"✅ Dataset dimuat: {n_rows} baris, {n_cols} kolom")
    return df

def clean_data(df):
    """Return a cleaned copy of the raw churn DataFrame.

    The input frame is left untouched; all changes are made on a copy:

    * ``TotalCharges`` is converted to a numeric dtype; entries that cannot
      be parsed become NaN.
    * Missing ``TotalCharges`` values are replaced by the column median.
    * ``Churn`` is mapped from ``'No'``/``'Yes'`` to ``0``/``1``; any other
      value becomes NaN.

    Args:
        df: DataFrame with at least ``TotalCharges`` and ``Churn`` columns.

    Returns:
        A new DataFrame with the cleaning steps applied.
    """
    cleaned = df.copy()

    # Unparseable entries (e.g. blank strings) are coerced to NaN so they can
    # be imputed below instead of raising.
    total_charges = pd.to_numeric(cleaned['TotalCharges'], errors='coerce')

    if total_charges.isna().any():
        total_charges = total_charges.fillna(total_charges.median())
        print("🔄 Imputed missing values in TotalCharges")
    cleaned['TotalCharges'] = total_charges

    # Encode the target as 0/1.
    cleaned['Churn'] = cleaned['Churn'].map({'No': 0, 'Yes': 1})

    return cleaned

def create_preprocessor(X):
    """Build an unfitted ColumnTransformer for the feature frame ``X``.

    Columns are assigned to a transformer by dtype: numeric columns are
    standardised with ``StandardScaler``; text/categorical columns are
    one-hot encoded with the first level of each feature dropped and a dense
    output array.

    Args:
        X: Feature DataFrame whose column dtypes decide the routing.

    Returns:
        The unfitted ``ColumnTransformer``.
    """
    # Select by dtype family rather than by exact 64-bit dtypes: a column that
    # matches neither list is not passed to any transformer, so an int32,
    # float32, 'category' or string-dtype column would otherwise be left out.
    numeric_cols = X.select_dtypes(include='number').columns.tolist()
    categorical_cols = X.select_dtypes(
        include=['object', 'category', 'string']
    ).columns.tolist()

    print("🔧 Membuat preprocessor:")
    print(f"   - Kolom numerik: {numeric_cols}")
    print(f"   - Kolom kategorikal: {categorical_cols}")

    numeric_step = ('num', StandardScaler(), numeric_cols)
    categorical_step = (
        'cat',
        OneHotEncoder(drop='first', sparse_output=False),
        categorical_cols,
    )
    preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

    return preprocessor

def prepare_training_data(df):
    """Split the cleaned data and preprocess it without test-set leakage.

    The rows are split 80/20 (stratified on ``Churn``, ``random_state=42``)
    before any fitting happens. The preprocessor is then fitted on the
    training rows only and merely applied to the test rows, so scaling
    statistics and category levels are learned from training data alone.

    Args:
        df: Cleaned DataFrame containing ``customerID``, ``Churn`` and the
            feature columns.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, preprocessor)`` where the
        ``X_*`` values are the transformed feature matrices and
        ``preprocessor`` is the fitted transformer.
    """
    from sklearn.model_selection import train_test_split

    # Separate features and target; customerID is an identifier, not a feature.
    X = df.drop(['customerID', 'Churn'], axis=1)
    y = df['Churn']

    # Split the raw rows first so nothing about the test rows can influence
    # the fitted preprocessor.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    preprocessor = create_preprocessor(X_train_raw)

    # Fit on the training rows only; the test rows are transformed with the
    # parameters learned from training.
    # NOTE(review): how a category that appears only in the test rows is
    # handled depends on the encoder configured in create_preprocessor.
    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)

    print("📊 Data split:")
    print(f"   X_train: {X_train.shape}")
    print(f"   X_test: {X_test.shape}")
    print(f"   y_train: {y_train.shape}")
    print(f"   y_test: {y_test.shape}")

    return X_train, X_test, y_train, y_test, preprocessor

def save_preprocessor(preprocessor, filename='preprocessor.pkl'):
    """Write a preprocessor to disk with joblib and report the file name.

    Args:
        preprocessor: Object to persist (typically the fitted transformer).
        filename: Destination path; defaults to ``preprocessor.pkl``.
    """
    joblib.dump(preprocessor, filename)
    print(f"💾 Preprocessor disimpan sebagai: {filename}")

def load_preprocessor(filename='preprocessor.pkl'):
    """Read a preprocessor back from a joblib file.

    Args:
        filename: Path of the file to read; defaults to ``preprocessor.pkl``.

    Returns:
        The object stored in the file.
    """
    # NOTE: joblib files are pickle-based, so only load files from a source
    # you trust.
    preprocessor = joblib.load(filename)
    return preprocessor

if __name__ == "__main__":
    # Example usage: run the whole pipeline once (load, clean, split and
    # preprocess), then persist the preprocessor under its default file name.
    print("Testing data_processing.py...")
    df = load_data()
    df_clean = clean_data(df)
    X_train, X_test, y_train, y_test, preprocessor = prepare_training_data(df_clean)
    save_preprocessor(preprocessor)

Testing data_processing.py...
📥 Memuat dataset...
✅ Dataset dimuat: 7043 baris, 21 kolom
🔄 Imputed missing values in TotalCharges
🔧 Membuat preprocessor:
   - Kolom numerik: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
   - Kolom kategorikal: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
📊 Data split:
   X_train: (5634, 30)
   X_test: (1409, 30)
   y_train: (5634,)
   y_test: (1409,)
💾 Preprocessor disimpan sebagai: preprocessor.pkl
