# MCLabs Churn Analyzer - Model Creation

This Jupyter Notebook will create a ML model, train it on our training data, then offer a simple test analysis using test data.

In [42]:
'''
MODULE/PACKAGE IMPORTS
'''

# System
import os
import re
from glob import glob
from dotenv import load_dotenv
from datetime import datetime, timedelta

# Data
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Pipelining
import joblib

# Output/Display
from tqdm import tqdm

In [None]:
'''
PIPELINE CREATION

This section will create a pipeline for loading the data, splitting the data, scaling the data, and training the model.
'''

# Load the data
modelInputDataframe = pd.read_csv("../data/targetted/public/1757051828.555/targetted.csv")

# Separate features from target
MCA_Features = modelInputDataframe.drop(columns=["churn"])
MCA_Target = modelInputDataframe["churn"]

# Split the data
MCA_Features_Train, MCA_Features_Test, MCA_Target_Train, MCA_Target_Test = train_test_split(MCA_Features, MCA_Target, test_size=0.2)

# Identify which features are categorical
categoricalFeatures = ["plan_player_favorite_server"]

# Identify which features are numerical (note we do not include the last seen time here)
numericalFeatures = ["mcmmo_power_level", "mcmmo_skill_ACROBATICS", "mcmmo_skill_ALCHEMY", "mcmmo_skill_ARCHERY", "mcmmo_skill_AXES", "mcmmo_skill_CROSSBOWS", "mcmmo_skill_EXCAVATION", "mcmmo_skill_FISHING", "mcmmo_skill_HERBALISM", "mcmmo_skill_MACES", "mcmmo_skill_MINING", "mcmmo_skill_REPAIR", "mcmmo_skill_SALVAGE", "mcmmo_skill_SMELTING", "mcmmo_skill_SWORDS", "mcmmo_skill_TAMING", "mcmmo_skill_TRIDENTS", "mcmmo_skill_UNARMED", "mcmmo_skill_WOODCUTTING", "lw_rev_total", "lw_rev_phase", "chemrank", "policerank", "donorrank", "goldrank", "current_month_votes", "plan_player_time_total_raw", "plan_player_time_month_raw", "plan_player_time_week_raw", "plan_player_time_day_raw", "plan_player_time_afk_raw", "plan_player_latest_session_length_raw", "plan_player_sessions_count", "leaderboard_position_chems_all", "leaderboard_position_chems_week", "leaderboard_position_police_all", "leaderboard_position_police_week", "balance", "plan_player_relativePlaytime_totalmonth", "plan_player_relativePlaytime_weekmonth", "plan_player_relativePlaytime_dayweek"]

# Create preprocessing transformers for encoding and scaling features
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categoricalFeatures),
        ("num", StandardScaler(), numericalFeatures)
    ]
)

# Define LogReg pipeline
MCA_Pipeline_LogReg = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

# Define XGBoost pipeline
MCA_Pipeline_XGB = Pipeline([
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(use_label_encoder=False, eval_metric="logloss", verbosity=0))
])


In [45]:
'''
LOGREG MODEL TRAINING AND TESTING
'''
# Train and test LogReg pipeline
MCA_Pipeline_LogReg.fit(MCA_Features_Train, MCA_Target_Train)
MCA_Target_Pred = MCA_Pipeline_LogReg.predict(MCA_Features_Test)
print(f"Accuracy: {accuracy_score(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Classification Report:\n{classification_report(MCA_Target_Test, MCA_Target_Pred)}")

ValueError: A given column is not a column of the dataframe

In [39]:
'''
XGBOOST MODEL TRAINING AND TESTING

This cell will use a XGBoost pipeline and implement auto hyperparameter tuning to optimize the model's performance.
'''

# Map of hyperparameters and possible values to try tuning XGBoost with
hyperParameterMap = {
    "model__n_estimators": [100, 200, 400],      # boosting rounds
    "model__max_depth": [3, 5, 7],               # tree depth
    "model__learning_rate": [0.01, 0.1, 0.3],    # step size shrinkage
    "model__subsample": [0.8, 1.0],              # row sampling
    "model__colsample_bytree": [0.8, 1.0],       # feature sampling
    "model__scale_pos_weight": [1, 2, 5]         # helps with class imbalance
}

# Grid search for best hyperparameters (5-fold CV)
MCA_Pipeline_GridSearch_XGB = GridSearchCV(
    estimator=MCA_Pipeline_XGB,
    param_grid=hyperParameterMap,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# Train and test XGBoost pipeline
MCA_Pipeline_GridSearch_XGB.fit(MCA_Features_Train, MCA_Target_Train)
MCA_Target_Pred = MCA_Pipeline_GridSearch_XGB.predict(MCA_Features_Test)
print(f"Best Parameters: {MCA_Pipeline_GridSearch_XGB.best_params_}")
print(f"Best Cross-Validation Score: {MCA_Pipeline_GridSearch_XGB.best_score_}")
print(f"Accuracy: {accuracy_score(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Classification Report:\n{classification_report(MCA_Target_Test, MCA_Target_Pred)}")

Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 400, 'model__scale_pos_weight': 1, 'model__subsample': 1.0}
Best Cross-Validation Score: 0.9375776149233843
Accuracy: 0.9494097807757167
Confusion Matrix:
[[ 62  29]
 [  1 501]]
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.68      0.81        91
           1       0.95      1.00      0.97       502

    accuracy                           0.95       593
   macro avg       0.96      0.84      0.89       593
weighted avg       0.95      0.95      0.95       593



In [41]:
'''
PIPELINE SAVING

This section saves the entire machine learning pipeline to a file for future use.
'''

# Save the entire LogReg pipeline
joblib.dump(MCA_Pipeline_LogReg, "../model-internals/MCA_Pipeline_LogReg.pkl")

# Save the entire XGBoost pipeline
joblib.dump(MCA_Pipeline_GridSearch_XGB, "../model-internals/MCA_Pipeline_GridSearch_XGB.pkl")

['../model-internals/MCA_Pipeline_GridSearch_XGB.pkl']