In [29]:
from rs_data import get_dataframe, SkillType, evaluate_model

activity = 'Firemaking'
df, formatter = get_dataframe(activity, limit=2500, aggregate=True, skill_type=SkillType.EXPERIENCE)

X = df.drop(columns=['Banned', 'pid'])
y = df['Banned']
df

Unnamed: 0,pid,Banned,updates,activescrapes,inactivescrapes,shortestinactivity,shortestactivity,longestinactivity,longestactivity,Overall_live,...,Vardorvis_aggregate,Venenatis_aggregate,Vet'ion_aggregate,Vorkath_aggregate,Wintertodt_aggregate,Zalcano_aggregate,Zulrah_aggregate,Colosseum Glory_aggregate,Deadman Points_aggregate,League Points_aggregate
0,170698,False,37,28,9,1,1,6,9,2236452139,...,0.0,8.0,0.0,4115.0,6342.0,251.0,3117.0,0,0,0
1,80833,False,71,67,4,1,3,2,53,601621625,...,0.0,0.0,0.0,0.0,5752.0,0.0,0.0,0,0,0
2,3804178,True,19,4,15,7,1,8,1,171624284,...,0.0,0.0,0.0,0.0,577.0,0.0,0.0,0,0,0
3,682311,True,45,40,5,5,0,5,0,252826786,...,0.0,0.0,0.0,0.0,6254.0,0.0,0.0,0,0,0
4,1096481,False,71,70,1,1,0,1,0,687382853,...,0.0,0.0,0.0,0.0,3588.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,339693,False,57,21,36,1,1,11,11,177086291,...,0.0,0.0,0.0,0.0,83.0,0.0,0.0,0,0,0
2496,105584,False,72,67,5,2,1,3,1,319339186,...,0.0,2.0,0.0,0.0,101.0,0.0,0.0,0,0,0
2497,182610,False,72,65,7,1,1,1,23,216491210,...,0.0,0.0,0.0,84.0,287.0,50.0,0.0,0,0,0
2498,75002,False,72,71,1,1,0,1,0,427903760,...,0.0,0.0,0.0,0.0,53.0,0.0,0.0,0,0,0


In [30]:
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Sampling 
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, KMeansSMOTE
# Models


from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [31]:
from sklearn.compose import ColumnTransformer

standard_features = formatter.agg_skills + formatter.agg_minigames
robust_features = formatter.extra_features #[]    # Assume these have outliers
minmax_features = formatter.live_skills  #[]  # Assume these need scaling between 0 and 1
minmax_features_2 = formatter.live_minigames #[] 

# Creating the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('std', StandardScaler(), standard_features),
        ('robust', RobustScaler(), robust_features),
        ('minmax', MinMaxScaler(), minmax_features),
        ('minmax_2', MinMaxScaler(), minmax_features_2),
        
    ])

In [32]:
from imblearn.pipeline import Pipeline as ImblearnPipeline
from rs_data import (PCA, TSNE, UMAP)

# Create an imblearn pipeline with SMOTE
pipeline = ImblearnPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # SMOTE applied only during training
    ('PCA', TSNE(n_components=3)),
    ('classifier', ExtraTreesClassifier())
])



In [ ]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Classifier list
classifiers = [
    ("RandomForest", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
    ("ExtraTrees", ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=42)),  # Does not support n_jobs
    ("SVM", SVC(probability=True, random_state=42)),  # Does not support n_jobs
    ("LogisticRegression", LogisticRegression(random_state=42, n_jobs=-1)),
    ("LGBMClassifier", LGBMClassifier(random_state=42, n_jobs=-1)),
    ("XGBClassifier", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1))
]

# Creating the preprocessing pipeline
preprocessor = StandardScaler()

# Results DataFrame

rows = []

n_component_list = [6, 15, 35, 50]

for n_c in n_component_list:
    for name, classifier in classifiers:
        # Create the pipeline
        pipeline = ImblearnPipeline([
            ('preprocessor', preprocessor),
            ('smote', SMOTE(random_state=42)),
            ('pca', PCA(n_components=6)),
            ('classifier', classifier)
        ])
    
        # Calculate scores
        accuracy_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')
        y_pred_proba  = cross_val_predict(pipeline, X, y, cv=cv)
        
        y_pred = (y_pred_proba >= 0.5).astype(int)
        accuracy_per_class = [
            accuracy_score(y == k, y_pred == k) for k in [0, 1]
        ]
        roc_auc = roc_auc_score(y, y_pred)
        
        
        recall_per_class = recall_score(y, y_pred, average=None)
    
        # Confusion matrix
        conf_matrix = confusion_matrix(y, y_pred)
        
        data = {
            'Classifier': [name],
            'Accuracy': [np.mean(accuracy_scores)],
            'Recall_0': [recall_per_class[0]],
            'Recall_1': [recall_per_class[1]],
            'Accuracy_0': [accuracy_per_class[0]],
            'Accuracy_1': [accuracy_per_class[1]],
            'ROC-AUC': [roc_auc],
            'Matrix': [conf_matrix.tolist()]
        }
        print(data)
        rows.append(
            (name, np.mean(accuracy_scores), 
             recall_per_class[0], recall_per_class[1], 
             accuracy_per_class[0], accuracy_per_class[1], 
             roc_auc, conf_matrix.tolist() )
        )
    

results = pd.DataFrame(rows, columns=['Classifier', 'Accuracy', 'Recall_0', 'Recall_1', 'Accuracy_0', 'Accuracy_1', 'ROC-AUC', 'Matrix' ])
# Print the results sorted by 'Accuracy' and 'Recall'
results.sort_values(by=['Accuracy', 'Recall_1'], ascending=False, inplace=True)
results

{'Classifier': ['RandomForest'], 'Accuracy': [0.8160000000000001], 'Recall_0': [0.8570119156736938], 'Recall_1': [0.5440251572327044], 'Accuracy_0': [0.8172], 'Accuracy_1': [0.8172], 'ROC-AUC': [0.7005185364531991], 'Matrix': [[[1870, 312], [145, 173]]]}
{'Classifier': ['ExtraTrees'], 'Accuracy': [0.828], 'Recall_0': [0.8661778185151238], 'Recall_1': [0.5440251572327044], 'Accuracy_0': [0.8252], 'Accuracy_1': [0.8252], 'ROC-AUC': [0.7051014878739141], 'Matrix': [[[1890, 292], [145, 173]]]}
{'Classifier': ['GradientBoosting'], 'Accuracy': [0.76], 'Recall_0': [0.773602199816682], 'Recall_1': [0.6477987421383647], 'Accuracy_0': [0.7576], 'Accuracy_1': [0.7576], 'ROC-AUC': [0.7107004709775234], 'Matrix': [[[1688, 494], [112, 206]]]}
{'Classifier': ['SVM'], 'Accuracy': [0.732], 'Recall_0': [0.7373968835930339], 'Recall_1': [0.7012578616352201], 'Accuracy_0': [0.7328], 'Accuracy_1': [0.7328], 'ROC-AUC': [0.719327372614127], 'Matrix': [[[1609, 573], [95, 223]]]}
{'Classifier': ['LogisticRegre

from sklearn.model_selection import GridSearchCV, StratifiedKFold

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15]
}

# Setup cross-validation scheme
cv = StratifiedKFold(n_splits=5)

# Setup the GridSearchCV
grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=cv)

# Fit GridSearchCV
grid_search.fit(X, y)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier
from sklearn.cluster import KMeans

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


banned_class_models = [
    ('svm_pca', Pipeline([
        ('pca', PCA(n_components=50)),  # Include PCA as the first step
        ('svm', SVC(probability=True, random_state=42))
    ])),
    ('log_regression_pca', Pipeline([
        ('pca', PCA(n_components=30)),  # Include PCA as the first step
        ('log_reg_algo', LogisticRegression(random_state=42, n_jobs=-1))
    ])),
    ('log_regression_pca', Pipeline([
        ('pca', PCA(n_components=6)),  # Include PCA as the first step
        ('log_reg_algo', LogisticRegression(random_state=42, n_jobs=-1))
    ])),
    
]



unbanned_class_models = [
    ('ExtraTrees_pca', Pipeline([
        ('pca', PCA(n_components=50)),  # Include PCA as the first step
        ('ExtraTrees', ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1))
    ])),
    ('LGBM_pca', Pipeline([
        ('pca', PCA(n_components=30)),  # Include PCA as the first step
        ('LGBM', LGBMClassifier(random_state=42, n_jobs=-1))
    ])),
]

"""
    ('k-means', Pipeline([
        ('preprocessor', preprocessor),
        
        ('PCA', PCA(n_components=50)),
        ('TSNE', TSNE(n_components=2)),
        ('KMeans', KMeans(n_clusters=2))
    ])),
    """

# Define the base models for level 0
level0 = [
    #('gb', GradientBoostingClassifier(random_state=42))
] + banned_class_models # +  unbanned_class_models

# Define the meta model for level 1
level1 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)





# Define the stacking ensemble
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5, stack_method='predict_proba')

# Create a pipeline with preprocessing and the stacking model
pipeline = ImblearnPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # SMOTE applied only during training
    ('stacking', model)
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)
# Predictions and evaluation
y_pred = pipeline.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:

# Cross-validation scores
cv_scores = cross_val_score(pipeline, X, y, cv=5)
print("Cross-validated Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))


# SVMSMOTE

Classification Report:
               precision    recall  f1-score   support

       False       0.94      0.98      0.96       443
        True       0.73      0.53      0.61        57

    accuracy                           0.92       500
   macro avg       0.84      0.75      0.79       500
weighted avg       0.92      0.92      0.92       500

Confusion Matrix:
 [[432  11]
 [ 27  30]]

# BorderlineSMOTE

## Random Forest Level 1, 100 trees

Classification Report:
               precision    recall  f1-score   support

       False       0.93      0.97      0.95       443
        True       0.63      0.42      0.51        57

    accuracy                           0.91       500
   macro avg       0.78      0.69      0.73       500
weighted avg       0.89      0.91      0.90       500

Confusion Matrix:
 [[429  14]
 [ 33  24]]


# Default SMOTE

## Random forest Level 1 , 100 trees


Classification Report:
               precision    recall  f1-score   support

       False       0.93      0.97      0.95       443
        True       0.67      0.46      0.54        57

    accuracy                           0.91       500
   macro avg       0.80      0.71      0.75       500
weighted avg       0.90      0.91      0.90       500

Cross-validated Accuracy Scores: [0.8   0.89  0.894 0.894 0.904]
Mean CV Accuracy: 0.8764000000000001

## Logistic_regression

- Worse score, lower recall and accuracy.


## SVC

Classification Report:
               precision    recall  f1-score   support

       False       0.94      0.93      0.93       443
        True       0.48      0.51      0.50        57

    accuracy                           0.88       500
   macro avg       0.71      0.72      0.71       500
weighted avg       0.88      0.88      0.88       500

Confusion Matrix:
 [[412  31]
 [ 28  29]]
 

## ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1)

Classification Report:
               precision    recall  f1-score   support

       False       0.93      0.95      0.94       443
        True       0.53      0.44      0.48        57

    accuracy                           0.89       500
   macro avg       0.73      0.69      0.71       500
weighted avg       0.88      0.89      0.89       500

Confusion Matrix:
 [[421  22]
 [ 32  25]]
