In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
from sklearn.preprocessing import OneHotEncoder

# Read the pre-processed RFMS table (path is relative to the notebook's working directory)
data = pd.read_csv(filepath_or_buffer="processed_rfms_data.csv")




In [2]:
# Handle missing values: numeric columns get their column median.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
data = data.fillna(data.median(numeric_only=True))

# Encode categorical variables.
# The target column 'DefaultLabel' is handled separately further down (it is
# selected by name and mapped to 0/1), so it must NOT be one-hot encoded here;
# otherwise it would be replaced by a dummy column and later lookups by name
# would fail. errors='ignore' keeps this a no-op when the target is not of
# object dtype.
categorical_cols = (
    data.select_dtypes(include=['object'])
    .columns
    .drop('DefaultLabel', errors='ignore')
)
encoder = OneHotEncoder(drop='first', sparse_output=False)

if len(categorical_cols) > 0:
    # Keep the original index so the encoded frame aligns row-for-row with `data`.
    encoded_data = pd.DataFrame(encoder.fit_transform(data[categorical_cols]),
                                columns=encoder.get_feature_names_out(categorical_cols),
                                index=data.index)
else:
    # Nothing to encode: an empty, index-aligned frame lets the downstream
    # concat run unchanged instead of failing inside the encoder.
    encoded_data = pd.DataFrame(index=data.index)

In [None]:
# Remove the raw categorical columns, then renumber rows 0..n-1
data_dropped = data.drop(columns=categorical_cols)
data_dropped = data_dropped.reset_index(drop=True)

# Renumber the one-hot frame the same way so both halves share one index
encoded_data = encoded_data.reset_index(drop=True)

# Guard against a stale `encoded_data` built from a different version of `data`
if len(data_dropped) != len(encoded_data):
    raise ValueError("Mismatch in row count between data and encoded_data")

# Place the two halves side by side
data = pd.concat([data_dropped, encoded_data], axis="columns")

# Report the resulting (rows, columns)
print(f"Final dataset shape: {data.shape}")


In [None]:
# Merge the encoded categorical data back into the dataset.
# This step is written to be idempotent: when the raw categorical columns have
# already been dropped and the encoded columns are already present in `data`,
# nothing happens. A plain drop would raise KeyError on the missing columns,
# and a plain concat would append every encoded column a second time.
pending_categoricals = [col for col in categorical_cols if col in data.columns]
missing_encoded = [col for col in encoded_data.columns if col not in data.columns]

if pending_categoricals or missing_encoded:
    data = pd.concat(
        [data.drop(columns=pending_categoricals), encoded_data[missing_encoded]],
        axis=1,
    )

In [4]:
# Define the target and features
X = data.drop(columns=['DefaultLabel'])  # Features
y = data['DefaultLabel']  # Target (Good/Bad classification)

# Encode target labels only when they are not already numerical.
# Series.map turns every value missing from the mapping into NaN, so mapping an
# already-numeric target (or one with unexpected labels) would silently wipe it
# out and only fail much later inside model fitting.
if not pd.api.types.is_numeric_dtype(y):
    y_encoded = y.map({'Good': 1, 'Bad': 0})  # Convert to binary format
    unmapped = y_encoded.isna()
    if unmapped.any():
        raise ValueError(
            "DefaultLabel contains values other than 'Good'/'Bad': "
            f"{y[unmapped].unique().tolist()}"
        )
    y = y_encoded.astype(int)

# Split the data into training and testing sets (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize models (max_iter raised for Logistic Regression to give the solver room to converge)
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

In [None]:
# 70/30 hold-out split; the fixed seed makes the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Candidate classifiers, keyed by display name (insertion order is preserved)
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
])


In [None]:
# Random Forest search space: 3 * 4 * 3 * 3 = 108 parameter combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

print("\nHyperparameter tuning for Random Forest...")

# Exhaustive search scored by ROC-AUC with 3-fold CV, using all available cores
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

In [None]:
# Best parameters and performance
best_rf = grid_search.best_estimator_
best_params = grid_search.best_params_
best_roc_auc = grid_search.best_score_  # mean cross-validated ROC-AUC, not a hold-out score

print(f"Best Parameters: {best_params}")
print(f"Best ROC-AUC Score: {best_roc_auc}")


def evaluate_on_test(model):
    """Score an already-fitted classifier on the hold-out set.

    Returns a dict mapping metric name to value. Assumes a binary target
    encoded as 0/1, so column 1 of ``predict_proba`` is the positive class.
    """
    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)[:, 1]
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        # zero_division=0 avoids warnings when a model predicts a single class
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1": f1_score(y_test, y_pred, zero_division=0),
        "ROC-AUC": roc_auc_score(y_test, y_score),
    }


# Build the metrics table here so this cell does not rely on a `results`
# variable left over from an earlier kernel session.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    results[model_name] = evaluate_on_test(model)

# The tuned forest was already refit on the full training set by GridSearchCV.
results["Random Forest (Tuned)"] = evaluate_on_test(best_rf)

# Save results to CSV (one row per model, one column per metric)
results_df = pd.DataFrame(results).T
results_df.to_csv("model_performance_metrics.csv", index=True)

print("\nModel performance metrics saved to 'model_performance_metrics.csv'")