In [5]:
from notebooks.ext_imports import *

from rs_data.database.rs_processing import Leaderboards

activities = Leaderboards.get_skill_names(keep_overall=True)

activity = 'Firemaking'
user_limit = 2500
skill_type = SkillType.EXPERIENCE

mlflow.set_experiment(f"{activity} {SkillType.EXPERIENCE.description} Model comparison for {user_limit} users")  # Set your experiment name

df, formatter = get_dataframe(activity, limit=user_limit, aggregate=True, skill_type=skill_type)


# Creating the ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
        ('std', StandardScaler(), formatter.agg_skills + formatter.agg_minigames),
    ('robust', RobustScaler(), formatter.extra_features),
    ('minmax', MinMaxScaler(), formatter.live_skills),
    ('minmax_2', MinMaxScaler(), formatter.live_minigames),
    ])

X = df.drop(columns=['Banned', 'pid'])
y = df['Banned']

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

N_JOBS = 16
# Classifier list
classifiers = [
    ("RandomForest", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=N_JOBS)),
    ("ExtraTrees", ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=N_JOBS)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=42)),  # Does not support n_jobs
    ("SVM", SVC(probability=True, random_state=42)),  # Does not support n_jobs
    ("LogisticRegression", LogisticRegression(random_state=42, n_jobs=N_JOBS)),
    ("LGBMClassifier", LGBMClassifier(random_state=42, n_jobs=16)),
    ("XGBClassifier", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=N_JOBS))
]

# Results DataFrame

rows = []

PCA_COMPONENTS = [
    6, 10, 20, 30, 40, 50
]

for pca_n_components in PCA_COMPONENTS:
    for name, classifier in classifiers:
        with mlflow.start_run():
        
            # Create the pipeline
            pipeline = ImblearnPipeline([
                ('preprocessor', preprocessor),
                ('smote', SMOTE(random_state=42)),
                ('PCA', PCA(n_components=pca_n_components)),
                ('classifier', classifier)
            ])
            
            # Log pipeline components and PCA components
            mlflow.log_param("PCA_n_components", pca_n_components)
            mlflow.log_param("Classifier", name)
            mlflow.log_param("Sampling", "SMOTE")
            
            
        
            # Calculate scores
            accuracy_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
            y_pred_proba  = cross_val_predict(pipeline, X, y, cv=cv)
            
            y_pred = (y_pred_proba >= 0.5).astype(int)
            accuracy_per_class = [
                accuracy_score(y == k, y_pred == k) for k in [0, 1]
            ]
            roc_auc = roc_auc_score(y, y_pred)
            
            
            recall_per_class = recall_score(y, y_pred, average=None)
        
            # Confusion matrix
            conf_matrix = confusion_matrix(y, y_pred)
            
            data = {
                'Classifier': [name],
                'PCA_N_COMPONENTS': pca_n_components,
                'Accuracy': [np.mean(accuracy_scores)],
                'Recall_0': [recall_per_class[0]],
                'Recall_1': [recall_per_class[1]],
                'Accuracy_0': [accuracy_per_class[0]],
                'Accuracy_1': [accuracy_per_class[1]],
                'ROC-AUC': [roc_auc],
                'Matrix': [conf_matrix.tolist()]
            }
            rows.append(
                (name,pca_n_components, np.mean(accuracy_scores), 
                 recall_per_class[0], recall_per_class[1], 
                 accuracy_per_class[0], accuracy_per_class[1], 
                 roc_auc, conf_matrix.tolist() )
            )
            
            
            
            mlflow.log_metric("Mean Accuracy", np.mean(accuracy_scores))
            mlflow.log_metric("ROC-AUC", roc_auc)
            mlflow.log_metric("Recall Class 0", recall_per_class[0])
            mlflow.log_metric("Recall Class 1", recall_per_class[1])
            
            mlflow.log_metric("Accuracy Class 0", accuracy_per_class[0])
            mlflow.log_metric("Accuracy Class 1", accuracy_per_class[1])
            
            
            
            
            df_conf_matrix = pd.DataFrame(conf_matrix, index=["True Neg", "True Pos"], columns=["Pred Neg", "Pred Pos"])
            conf_matrix_file_path = f"confusion_matrix_{name}_pca{pca_n_components}.csv"
            df_conf_matrix.to_csv(conf_matrix_file_path)
            mlflow.log_artifact(conf_matrix_file_path)
            os.remove(conf_matrix_file_path)

            
            
            mlflow.sklearn.log_model(pipeline, "model")

            mlflow.end_run()
    
results = pd.DataFrame(rows, columns=['Classifier', 'PCA_N_COMPONENTS', 'Accuracy', 'Recall_0', 'Recall_1', 'Accuracy_0', 'Accuracy_1', 'ROC-AUC', 'Matrix' ])
# Print the results sorted by 'Accuracy' and 'Recall'
results.sort_values(by=['Accuracy', 'Recall_1'], ascending=False, inplace=True)
results

[LightGBM] [Info] Number of positive: 1745, number of negative: 1745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 3490, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1745, number of negative: 1745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 3490, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1746, number of negative: 1746
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the



[LightGBM] [Info] Number of positive: 1745, number of negative: 1745
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 3490, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1745, number of negative: 1745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.080717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 3490, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1746, number of negative