In [1]:
from .ext_imports import *
activity = 'Firemaking'
df, formatter = get_dataframe(activity, limit=2500, aggregate=True, skill_type=SkillType.LEVELS)

X = df.drop(columns=['Banned', 'pid'])
y = df['Banned']

In [4]:
from sklearn.compose import ColumnTransformer

standard_features = formatter.agg_skills + formatter.agg_minigames
robust_features = formatter.extra_features #[]    # Assume these have outliers
minmax_features = formatter.live_skills  #[]  # Assume these need scaling between 0 and 1
minmax_features_2 = formatter.live_minigames #[] 

# Creating the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('std', StandardScaler(), standard_features),
        ('robust', RobustScaler(), robust_features),
        ('minmax', MinMaxScaler(), minmax_features),
        ('minmax_2', MinMaxScaler(), minmax_features_2),
        
    ])

In [5]:
from imblearn.pipeline import Pipeline as ImblearnPipeline
from rs_data import (PCA, TSNE, UMAP)

# Create an imblearn pipeline with SMOTE
pipeline = ImblearnPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # SMOTE applied only during training
    ('PCA', TSNE(n_components=3)),
    ('classifier', ExtraTreesClassifier())
])



In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Classifier list
classifiers = [
    ("RandomForest", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
    ("ExtraTrees", ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=42)),  # Does not support n_jobs
    ("SVM", SVC(probability=True, random_state=42)),  # Does not support n_jobs
    ("LogisticRegression", LogisticRegression(random_state=42, n_jobs=-1)),
    ("LGBMClassifier", LGBMClassifier(random_state=42, n_jobs=-1)),
    ("XGBClassifier", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1))
]

# Creating the preprocessing pipeline
preprocessor = StandardScaler()

# Results DataFrame

rows = []

pca_components = [
    2, 10, 30, 50
]

tsne_perplexities = [
    30
]

for tsne_perplexity in tsne_perplexities:
    for pca_n_components in pca_components:
        for name, classifier in classifiers:
            # Create the pipeline
            pipeline = ImblearnPipeline([
                ('preprocessor', preprocessor),
                ('smote', SMOTE(random_state=42)),
                ('PCA', PCA(n_components=pca_n_components)),
                ('TSNE', TSNE(n_components=2,perplexity=tsne_perplexity)),
                ('classifier', classifier)
            ])
        
            # Calculate scores
            accuracy_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
            y_pred_proba  = cross_val_predict(pipeline, X, y, cv=cv)
            
            y_pred = (y_pred_proba >= 0.5).astype(int)
            accuracy_per_class = [
                accuracy_score(y == k, y_pred == k) for k in [0, 1]
            ]
            roc_auc = roc_auc_score(y, y_pred)
            
            
            recall_per_class = recall_score(y, y_pred, average=None)
        
            # Confusion matrix
            conf_matrix = confusion_matrix(y, y_pred)
            
            data = {
                'Classifier': [name],
                'PCA_N_COMPONENTS': pca_n_components,
                'TSNE_Perplexity': tsne_perplexity,
                'Accuracy': [np.mean(accuracy_scores)],
                'Recall_0': [recall_per_class[0]],
                'Recall_1': [recall_per_class[1]],
                'Accuracy_0': [accuracy_per_class[0]],
                'Accuracy_1': [accuracy_per_class[1]],
                'ROC-AUC': [roc_auc],
                'Matrix': [conf_matrix.tolist()]
            }
            rows.append(
                (name,pca_n_components,tsne_perplexities, np.mean(accuracy_scores), 
                 recall_per_class[0], recall_per_class[1], 
                 accuracy_per_class[0], accuracy_per_class[1], 
                 roc_auc, conf_matrix.tolist() )
            )
    
results = pd.DataFrame(rows, columns=['Classifier', 'PCA_N_COMPONENTS', 'TSNE_Perplexity', 'Accuracy', 'Recall_0', 'Recall_1', 'Accuracy_0', 'Accuracy_1', 'ROC-AUC', 'Matrix' ])
# Print the results sorted by 'Accuracy' and 'Recall'
results.sort_values(by=['Accuracy', 'Recall_1'], ascending=False, inplace=True)
results

[LightGBM] [Info] Number of positive: 1745, number of negative: 1745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.087353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 3490, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 1745, number of negative: 1745
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098336 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 3490, number of used features: 2
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


# Stacked models


- High performance for recall and accuracy on class 1 (but performs well on both)
    - SVM,50
    - LogisticRegression,30

- High performance for recall and accuracy on class 0
    - ExtraTrees,50
    - LGBMClassifier,30

---

# Level 0 Stacked Models

| Classifier         | PCA_N_COMPONENTS | Accuracy | Recall_0 | Recall_1 | Accuracy_0 | Accuracy_1 | ROC-AUC  | Matrix                         |
|--------------------|------------------|----------|----------|----------|------------|------------|----------|--------------------------------|
| SVM                | 50               | 0.8048   | 0.815765 | 0.764151 | 0.8092     | 0.8092     | 0.789958 | [[1780, 402], [75, 243]]       |
| LogisticRegression | 30               | 0.7876   | 0.805225 | 0.710692 | 0.7932     | 0.7932     | 0.757958 | [[1757, 425], [92, 226]]       |
| ExtraTrees         | 50               | 0.8732   | 0.942713 | 0.396226 | 0.8732     | 0.8732     | 0.669470 | [[2057, 125], [192, 126]]      |
| LGBMClassifier     | 30               | 0.8548   | 0.890926 | 0.559748 | 0.8488     | 0.8488     | 0.725337 | [[1944, 238], [140, 178]]      |

