In [6]:
from rs_data import get_dataframe, SkillType, evaluate_model

# Activity name handed to get_dataframe for this experiment.
activity = 'Firemaking'

# get_dataframe returns the data frame together with a formatter object;
# the formatter's column-group attributes are read when building the
# preprocessing step.
df, formatter = get_dataframe(
    activity,
    limit=2500,
    aggregate=True,
    skill_type=SkillType.EXPERIENCE,
)

# Label vector, then the feature matrix: the label itself and the 'pid'
# column are excluded from the features.
y = df['Banned']
X = df.drop(columns=['Banned', 'pid'])
df

Unnamed: 0,pid,Banned,updates,activescrapes,inactivescrapes,shortestinactivity,shortestactivity,longestinactivity,longestactivity,Overall_live,...,Vardorvis_aggregate,Venenatis_aggregate,Vet'ion_aggregate,Vorkath_aggregate,Wintertodt_aggregate,Zalcano_aggregate,Zulrah_aggregate,Colosseum Glory_aggregate,Deadman Points_aggregate,League Points_aggregate
0,170698,False,37,28,9,1,1,6,9,2236452139,...,0.0,8.0,0.0,4115.0,6342.0,251.0,3117.0,0,0,0
1,80833,False,71,67,4,1,3,2,53,601621625,...,0.0,0.0,0.0,0.0,5752.0,0.0,0.0,0,0,0
2,3804178,True,19,4,15,7,1,8,1,171624284,...,0.0,0.0,0.0,0.0,577.0,0.0,0.0,0,0,0
3,682311,True,45,40,5,5,0,5,0,252826786,...,0.0,0.0,0.0,0.0,6254.0,0.0,0.0,0,0,0
4,1096481,False,71,70,1,1,0,1,0,687382853,...,0.0,0.0,0.0,0.0,3588.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,339693,False,57,21,36,1,1,11,11,177086291,...,0.0,0.0,0.0,0.0,83.0,0.0,0.0,0,0,0
2496,105584,False,72,67,5,2,1,3,1,319339186,...,0.0,2.0,0.0,0.0,101.0,0.0,0.0,0,0,0
2497,182610,False,72,65,7,1,1,1,23,216491210,...,0.0,0.0,0.0,84.0,287.0,50.0,0.0,0,0,0
2498,75002,False,72,71,1,1,0,1,0,427903760,...,0.0,0.0,0.0,0.0,53.0,0.0,0.0,0,0,0


In [7]:
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Sampling 
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE, KMeansSMOTE
#from rs_data import SMOTE
# Models


from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import recall_score, accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [8]:
from sklearn.compose import ColumnTransformer

# Column groups taken from the formatter, one group per scaling strategy.
# Aggregate skill and minigame columns are standardised together.
standard_features = formatter.agg_skills + formatter.agg_minigames
# Assumption: the extra features contain outliers, hence the robust scaler.
robust_features = formatter.extra_features
# Assumption: the live skill / minigame columns should be squeezed into [0, 1].
minmax_features = formatter.live_skills
minmax_features_2 = formatter.live_minigames

# One (name, scaler, columns) entry per group. `remainder` is left at its
# default, so only the columns listed in these groups are passed through
# a scaler here.
preprocessor = ColumnTransformer([
    ('std', StandardScaler(), standard_features),
    ('robust', RobustScaler(), robust_features),
    ('minmax', MinMaxScaler(), minmax_features),
    ('minmax_2', MinMaxScaler(), minmax_features_2),
])

In [9]:
from imblearn.pipeline import Pipeline as ImblearnPipeline
from rs_data import (PCA, TSNE, UMAP)

# Create an imblearn pipeline with SMOTE.
# Steps run in order: scale columns -> oversample the minority class ->
# reduce dimensionality -> classify.
pipeline = ImblearnPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),  # SMOTE applied only during training
    # The step was previously labelled 'PCA' although it holds TSNE.
    # NOTE(review): TSNE here is the rs_data wrapper, not sklearn's; as an
    # intermediate pipeline step it must expose transform() for unseen
    # data -- confirm the wrapper does.
    ('tsne', TSNE(n_components=3)),
    # Seeded so repeated searches rank the candidates identically.
    ('classifier', ExtraTreesClassifier(random_state=42)),
])



from sklearn.model_selection import GridSearchCV, StratifiedKFold

# The search runs over the whole pipeline, so each hyper-parameter is
# addressed as '<step name>__<parameter>'. Bare names such as
# 'n_estimators' are not parameters of the pipeline and would be rejected.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
}

# Setup cross-validation scheme: stratified folds, shuffled with a fixed
# seed so the split is reproducible and independent of row order.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Setup the GridSearchCV.
# The estimator is the full pipeline (the original referenced an undefined
# name `classifier`), so scaling, SMOTE and the reducer are re-fitted inside
# every training fold and never see the held-out fold.
# No `scoring` is given, so candidates are ranked by the estimator's own
# score method; with the class imbalance in `y`, an explicit metric may be
# preferable.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=cv)

# Fit GridSearchCV
grid_search.fit(X, y)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validated score:", grid_search.best_score_)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier
from sklearn.cluster import KMeans

# Define the base models for level 0.
# Each base learner is wrapped in its own single-step Pipeline; the inner
# step name now matches the estimator it holds (the LogisticRegression step
# was previously mislabelled 'ExtraTrees').
level0 = [
    ('ExtraTreesClassifier', Pipeline([
        ('ExtraTrees', ExtraTreesClassifier(n_estimators=100, random_state=42, n_jobs=-1))
    ])),
    ('LogisticRegression', Pipeline([
        ('LogisticRegression', LogisticRegression(random_state=42, n_jobs=-1))
    ])),
]

# Define the meta model for level 1
# Alternative meta model kept for reference:
#level1 = RandomForestClassifier(n_estimators=25, random_state=42, n_jobs=-1)
level1 = LogisticRegression(random_state=42, n_jobs=-1)

# Define the stacking ensemble; cv=5 is the internal cross-validation the
# stacker uses to produce the base-model predictions fed to the meta model.
model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

# Create a pipeline with preprocessing and the stacking model:
# scale columns -> SMOTE oversampling -> PCA to 30 components -> stacking.
# NOTE(review): PCA here is the one imported from rs_data, not sklearn's.
pipeline = ImblearnPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('pca', PCA(n_components=30)),
    ('stacking', model)
])

# Out-of-fold predictions: every row is predicted by a pipeline fitted on
# the other folds, so the report below reflects held-out performance.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred = cross_val_predict(pipeline, X, y, cv=cv)
print("Classification Report:\n", classification_report(y, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred))

Classification Report:
               precision    recall  f1-score   support

       False       0.92      0.96      0.94      2182
        True       0.59      0.42      0.49       318

    accuracy                           0.89      2500
   macro avg       0.76      0.69      0.72      2500
weighted avg       0.88      0.89      0.88      2500

Confusion Matrix:
 [[2090   92]
 [ 184  134]]
