In [1]:
# model_training.py
"""
Script untuk training model churn prediction
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

def _build_candidate_models():
    """
    Build a fresh set of unfitted candidate estimators.

    Returns:
        dict: display name -> estimator, in the order Logistic Regression,
        Random Forest, Voting Classifier. The voting classifier uses soft
        voting over its own logistic-regression, random-forest and SVC
        members (SVC is created with ``probability=True``).
    """
    return {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
        'Voting Classifier': VotingClassifier(
            estimators=[
                ('lr', LogisticRegression(random_state=42, max_iter=1000)),
                ('rf', RandomForestClassifier(random_state=42, n_estimators=100)),
                ('svc', SVC(probability=True, random_state=42))
            ],
            voting='soft'
        )
    }


def _fit_and_score_models(X_train, X_test, y_train, y_test):
    """
    Fit every candidate model on the training split and score it on the test split.

    Args:
        X_train: training features, passed unchanged to ``model.fit``.
        X_test: test features, passed unchanged to ``model.predict``.
        y_train: training labels.
        y_test: test labels the predictions are compared against.

    Returns:
        dict: model display name -> ``{'accuracy', 'precision', 'recall', 'f1'}``.

    Note:
        The metric functions are called with their default arguments, so the
        averaging mode is whatever scikit-learn applies by default.
    """
    results = {}

    for name, model in _build_candidate_models().items():
        print(f"\nðŸ“Š Training {name}...")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred)
        }
        results[name] = metrics

        # Only accuracy and F1 are echoed; the full metric set is in the return value.
        print(f"   âœ… Accuracy: {metrics['accuracy']:.4f}")
        print(f"   âœ… F1-Score: {metrics['f1']:.4f}")

    return results


def train_direct_model(X_train, X_test, y_train, y_test):
    """
    Train and score the candidate models on data that was not preprocessed.

    The function itself applies no transformation; it only announces the
    "direct" run and delegates to the shared fit-and-score routine.

    Args:
        X_train: training features.
        X_test: test features.
        y_train: training labels.
        y_test: test labels.

    Returns:
        dict: model display name -> dict with 'accuracy', 'precision',
        'recall' and 'f1'.
    """
    print("ðŸ¤– Direct Modeling (tanpa preprocessing)...")
    return _fit_and_score_models(X_train, X_test, y_train, y_test)


def train_preprocessed_model(X_train, X_test, y_train, y_test):
    """
    Train and score the candidate models on already-preprocessed data.

    Preprocessing is expected to have happened before the call; this function
    runs exactly the same models and metrics as ``train_direct_model`` so the
    two result dicts are directly comparable.

    Args:
        X_train: preprocessed training features.
        X_test: preprocessed test features.
        y_train: training labels.
        y_test: test labels.

    Returns:
        dict: model display name -> dict with 'accuracy', 'precision',
        'recall' and 'f1'.
    """
    print("ðŸ¤– Modeling dengan preprocessing...")
    return _fit_and_score_models(X_train, X_test, y_train, y_test)

def tune_random_forest(X_train, y_train, param_grid=None, cv=5, scoring='f1', n_jobs=-1):
    """
    Tune a Random Forest classifier with an exhaustive grid search.

    Args:
        X_train: training features handed to ``GridSearchCV.fit``.
        y_train: training labels.
        param_grid: mapping of hyperparameter name -> list of candidate
            values. When ``None`` the built-in grid is used
            (3 x 4 x 3 x 3 = 108 combinations).
        cv: cross-validation setting forwarded to ``GridSearchCV`` (default 5).
        scoring: metric used to rank candidates (default ``'f1'``).
        n_jobs: parallelism forwarded to ``GridSearchCV``; the default of
            ``-1`` is passed through unchanged.

    Returns:
        The ``best_estimator_`` attribute of the fitted grid search.

    Note:
        With the default grid and ``cv=5`` the search evaluates 540
        candidate/fold combinations, so pass a smaller ``param_grid`` or
        ``cv`` for quick experiments.
    """
    print("ðŸ”§ Hyperparameter Tuning untuk Random Forest...")

    # Build the default grid per call so callers can never mutate a shared default.
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, 30, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }

    # Fixed seed keeps the forest itself reproducible across runs.
    rf = RandomForestClassifier(random_state=42)

    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=cv,
        scoring=scoring,
        n_jobs=n_jobs,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    print(f"âœ… Best parameters: {grid_search.best_params_}")

    return grid_search.best_estimator_

def evaluate_model(model, X_test, y_test, model_name="Model", output_dir="images"):
    """
    Evaluate a fitted model on the test split and save its confusion matrix.

    Prints accuracy, precision, recall and F1, draws the confusion matrix as
    a heatmap, writes it to
    ``<output_dir>/confusion_matrix_<model_name with spaces as underscores>.png``
    and shows the figure.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: test features.
        y_test: true test labels.
        model_name: label used in the printed report, the plot title and the
            image file name.
        output_dir: directory the PNG is written to. It is created if it
            does not exist; an empty string means the current directory.

    Returns:
        dict: ``{'accuracy', 'precision', 'recall', 'f1'}``.
    """
    import os  # local import: keeps this function self-contained

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    cm = confusion_matrix(y_test, y_pred)

    print(f"\nðŸ“Š Evaluasi {model_name}:")
    print(f"   âœ… Accuracy:  {accuracy:.4f}")
    print(f"   âœ… Precision: {precision:.4f}")
    print(f"   âœ… Recall:    {recall:.4f}")
    print(f"   âœ… F1-Score:  {f1:.4f}")

    # savefig does not create missing directories, so make sure the target exists.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    safe_name = model_name.replace(" ", "_")
    figure_path = os.path.join(output_dir, f'confusion_matrix_{safe_name}.png')

    # Explicit figure/axes so nothing leaks into (or from) pyplot's global state.
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.tight_layout()
    fig.savefig(figure_path, dpi=300)
    plt.show()
    # Release the figure; repeated evaluations would otherwise accumulate open figures.
    plt.close(fig)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def save_model(model, filename='model.pkl'):
    """
    Persist a model to disk with joblib and print a confirmation line.

    Args:
        model: object to serialize.
        filename: destination path (default ``'model.pkl'``).
    """
    target = filename
    joblib.dump(model, target)

    confirmation = f"ðŸ’¾ Model disimpan sebagai: {target}"
    print(confirmation)

def load_model(filename='model.pkl'):
    """
    Read a previously saved model back from disk with joblib.

    Args:
        filename: path of the file to read (default ``'model.pkl'``).

    Returns:
        Whatever object ``joblib.load`` reconstructs from the file.

    Note:
        NOTE(review): joblib files are pickle-based, so only load files from
        a trusted source - unpickling can execute arbitrary code.
    """
    restored = joblib.load(filename)
    return restored

if __name__ == "__main__":
    # Smoke test: exercise the training routine end to end on synthetic data.
    print("Testing model_training.py...")
    
    # Dummy data: 1000 samples with 20 features, generated with a fixed seed.
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
    # Hold out 20% of the samples as the test split (seeded for repeatability).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Run the function under test and dump the full metric dict.
    results = train_direct_model(X_train, X_test, y_train, y_test)
    print("\nResults:", results)

Testing model_training.py...
ðŸ¤– Direct Modeling (tanpa preprocessing)...

ðŸ“Š Training Logistic Regression...
   âœ… Accuracy: 0.8550
   âœ… F1-Score: 0.8557

ðŸ“Š Training Random Forest...
   âœ… Accuracy: 0.9000
   âœ… F1-Score: 0.9020

ðŸ“Š Training Voting Classifier...
   âœ… Accuracy: 0.8800
   âœ… F1-Score: 0.8824

Results: {'Logistic Regression': {'accuracy': 0.855, 'precision': 0.9148936170212766, 'recall': 0.8037383177570093, 'f1': 0.8557213930348259}, 'Random Forest': {'accuracy': 0.9, 'precision': 0.9484536082474226, 'recall': 0.8598130841121495, 'f1': 0.9019607843137255}, 'Voting Classifier': {'accuracy': 0.88, 'precision': 0.9278350515463918, 'recall': 0.8411214953271028, 'f1': 0.8823529411764706}}
