In [1]:
from notebooks.ext_imports import *


activity = 'Firemaking'
user_limit = 2500
skill_type = SkillType.EXPERIENCE

mlflow.set_experiment(f"{activity} {SkillType.EXPERIENCE.description} Model comparison for {user_limit} users with specific preprocessor")  # Set your experiment name

df, formatter = get_dataframe(activity, limit=user_limit, aggregate=True, skill_type=skill_type)

X = df.drop(columns=['Banned', 'pid'])
y = df['Banned']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
standard_features = formatter.agg_skills + formatter.agg_minigames
robust_features = formatter.extra_features
minmax_features = formatter.live_skills
minmax_features_2 = formatter.live_minigames

# Creating the ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('std', StandardScaler(), standard_features),
    ('robust', RobustScaler(), robust_features),
    ('minmax', MinMaxScaler(), minmax_features),
    ('minmax_2', MinMaxScaler(), minmax_features_2),
    ])



In [3]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


N_JOBS = 8
# Classifier list
classifiers = [
    ("RandomForest", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=N_JOBS)),
    ("ExtraTrees", ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=N_JOBS)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=42)),  # Does not support n_jobs
    ("SVM", SVC(probability=True, random_state=42)),  # Does not support n_jobs
    ("LogisticRegression", LogisticRegression(random_state=42, n_jobs=N_JOBS)),
    ("LGBMClassifier", LGBMClassifier(random_state=42, n_jobs=16)),
    ("XGBClassifier", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=N_JOBS))
]


# Results DataFrame

rows = []

PCA_COMPONENTS = [
    15,20,30,40
]

for pca_n_components in PCA_COMPONENTS:
    for name, classifier in classifiers:
        with mlflow.start_run():
        
            # Create the pipeline
            pipeline = ImblearnPipeline([
                ('preprocessor', preprocessor),
                ('smote', SVMSMOTE(random_state=42)),
                ('PCA', PCA(n_components=pca_n_components)),
                ('classifier', classifier)
            ])
            
            # Log pipeline components and PCA components
            mlflow.log_param("PCA_n_components", pca_n_components)
            mlflow.log_param("Classifier", name)
            mlflow.log_param("Sampling", "SVMSMOTE")
            
            
        
            # Calculate scores
            accuracy_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
            y_pred_proba  = cross_val_predict(pipeline, X, y, cv=cv)
            
            y_pred = (y_pred_proba >= 0.5).astype(int)
            accuracy_per_class = [
                accuracy_score(y == k, y_pred == k) for k in [0, 1]
            ]
            roc_auc = roc_auc_score(y, y_pred)
            
            
            recall_per_class = recall_score(y, y_pred, average=None)
        
            # Confusion matrix
            conf_matrix = confusion_matrix(y, y_pred)
            
            data = {
                'Classifier': [name],
                'PCA_N_COMPONENTS': pca_n_components,
                'Accuracy': [np.mean(accuracy_scores)],
                'Recall_0': [recall_per_class[0]],
                'Recall_1': [recall_per_class[1]],
                'Accuracy_0': [accuracy_per_class[0]],
                'Accuracy_1': [accuracy_per_class[1]],
                'ROC-AUC': [roc_auc],
                'Matrix': [conf_matrix.tolist()]
            }
            rows.append(
                (name,pca_n_components, np.mean(accuracy_scores), 
                 recall_per_class[0], recall_per_class[1], 
                 accuracy_per_class[0], accuracy_per_class[1], 
                 roc_auc, conf_matrix.tolist() )
            )
            
            
            
            mlflow.log_metric("Mean Accuracy", np.mean(accuracy_scores))
            mlflow.log_metric("ROC-AUC", roc_auc)
            mlflow.log_metric("Recall Class 0", recall_per_class[0])
            mlflow.log_metric("Recall Class 1", recall_per_class[1])
            
            mlflow.log_metric("Accuracy Class 0", accuracy_per_class[0])
            mlflow.log_metric("Accuracy Class 1", accuracy_per_class[1])
            
            
            
            
            df_conf_matrix = pd.DataFrame(conf_matrix, index=["True Neg", "True Pos"], columns=["Pred Neg", "Pred Pos"])
            conf_matrix_file_path = f"confusion_matrix_{name}_pca{pca_n_components}.csv"
            df_conf_matrix.to_csv(conf_matrix_file_path)
            mlflow.log_artifact(conf_matrix_file_path)
            os.remove(conf_matrix_file_path)

            
            
            mlflow.sklearn.log_model(pipeline, "model")

            mlflow.end_run()
    
results = pd.DataFrame(rows, columns=['Classifier', 'PCA_N_COMPONENTS', 'Accuracy', 'Recall_0', 'Recall_1', 'Accuracy_0', 'Accuracy_1', 'ROC-AUC', 'Matrix' ])
# Print the results sorted by 'Accuracy' and 'Recall'
results.sort_values(by=['Accuracy', 'Recall_1'], ascending=False, inplace=True)
results

[LightGBM] [Info] Number of positive: 1745, number of negative: 1745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 3490, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1745, number of negative: 1745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000416 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 3490, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1746, number of negative: 1746
[LightGBM] [Info] Auto-choosing col-wise multi-threading, t

Unnamed: 0,Classifier,PCA_N_COMPONENTS,Accuracy,Recall_0,Recall_1,Accuracy_0,Accuracy_1,ROC-AUC,Matrix
15,ExtraTrees,30,0.884,0.948213,0.487421,0.8896,0.8896,0.717817,"[[2069, 113], [163, 155]]"
22,ExtraTrees,40,0.8832,0.948213,0.474843,0.888,0.888,0.711528,"[[2069, 113], [167, 151]]"
19,LGBMClassifier,30,0.8816,0.929423,0.562893,0.8828,0.8828,0.746158,"[[2028, 154], [139, 179]]"
8,ExtraTrees,20,0.88,0.941338,0.443396,0.878,0.878,0.692367,"[[2054, 128], [177, 141]]"
27,XGBClassifier,40,0.88,0.931714,0.534591,0.8812,0.8812,0.733153,"[[2033, 149], [148, 170]]"
21,RandomForest,40,0.8796,0.931714,0.512579,0.8784,0.8784,0.722146,"[[2033, 149], [155, 163]]"
20,XGBClassifier,30,0.8792,0.927131,0.544025,0.8784,0.8784,0.735578,"[[2023, 159], [145, 173]]"
26,LGBMClassifier,40,0.8788,0.930339,0.544025,0.8812,0.8812,0.737182,"[[2030, 152], [145, 173]]"
1,ExtraTrees,15,0.876,0.935839,0.449686,0.874,0.874,0.692762,"[[2042, 140], [175, 143]]"
14,RandomForest,30,0.874,0.931714,0.522013,0.8796,0.8796,0.726863,"[[2033, 149], [152, 166]]"
