# MCLabs Churn Analyzer - Model Creation

This Jupyter Notebook will create a ML model, train it on our training data, then offer a simple test analysis using test data.

Note that this notebook converts the previous target encoding to a new encoding:
- Not Active (Previously 0) -> Dropped
- Recovered (Previously 1) -> 0
- Churned (Previously 2) -> 1
- Active (Previously 3) -> 2

In [1]:
'''
MODULE/PACKAGE IMPORTS
'''

# System
import os
import re
from glob import glob
from dotenv import load_dotenv
from datetime import datetime, timedelta

# Data
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Custom Modules
from mcalib import McaDataUtils, McaDataPrepare, McaFeaturePipeline, McaTargetPipeline

# Pipelining
import joblib

# Output/Display
from tqdm import tqdm

In [2]:
'''
PRE-MODEL DATA PIPELINE
'''

# Load three data files for model training
df_t1 = McaDataUtils.getDfForTimestamp(timestamp="1758949983.598")
df_t2 = McaDataUtils.getDfForTimestamp(timestamp="1759376248.846")
df_t3 = McaDataUtils.getDfForTimestamp(timestamp="1759635486.595")

# Prepare all datasets
df_t1 = McaDataPrepare.prepareData(df=df_t1, dfTimestamp=1758949983.598)
df_t2 = McaDataPrepare.prepareData(df=df_t2, dfTimestamp=1759376248.846)
df_t3 = McaDataPrepare.prepareData(df=df_t3, dfTimestamp=1759635486.595)

# Perform feature engineering between the first two timestamps
df = McaFeaturePipeline.combineData(currentDf=df_t2, previousDf=df_t1)

# Perform target engineering between the last two timestamps
df = McaTargetPipeline.buildTarget(currentDf=df, futureDf=df_t3, onlyReturnTarget=False)

# Drop UUID's before model
df = McaDataUtils.clearUUIDs(df=df)

# For now, drop rows where target is 0 (completely inactive)
df = df[df["churn"] != 0].reset_index(drop=True)


In [3]:
'''
PIPELINE CREATION

This section will create a pipeline for loading the data, splitting the data, scaling the data, and training the model.
'''

# Separate features from target
MCA_Features = df.drop(columns=["churn"])
MCA_Target = df["churn"]

# Transform target to be 0, 1, 2 instead of 1, 2, 3
MCA_LabelEncoder = LabelEncoder()
MCA_Target =  MCA_LabelEncoder.fit_transform(MCA_Target)

# Split the data
MCA_Features_Train, MCA_Features_Test, MCA_Target_Train, MCA_Target_Test = train_test_split(MCA_Features, MCA_Target, test_size=0.2)

# Identify which features are categorical
categoricalFeatures = ["plan_player_favorite_server"]

# Identify which features are numerical (note we do not include the last seen time here)
numericalFeatures = ["balance","lw_rev_total","lw_rev_phase","leaderboard_position_chems_all","leaderboard_position_chems_week","leaderboard_position_police_all","leaderboard_position_police_week","mcmmo_power_level","mcmmo_skill_ACROBATICS","mcmmo_skill_ALCHEMY","mcmmo_skill_ARCHERY","mcmmo_skill_AXES","mcmmo_skill_CROSSBOWS","mcmmo_skill_EXCAVATION","mcmmo_skill_FISHING","mcmmo_skill_HERBALISM","mcmmo_skill_MACES","mcmmo_skill_MINING","mcmmo_skill_REPAIR","mcmmo_skill_SALVAGE","mcmmo_skill_SMELTING","mcmmo_skill_SWORDS","mcmmo_skill_TAMING","mcmmo_skill_TRIDENTS","mcmmo_skill_UNARMED","mcmmo_skill_WOODCUTTING","chemrank","policerank","donorrank","goldrank","current_month_votes","plan_player_time_total_raw","plan_player_time_month_raw","plan_player_time_week_raw","plan_player_time_day_raw","plan_player_time_afk_raw","plan_player_latest_session_length_raw","plan_player_sessions_count","plan_player_relativePlaytime_totalmonth","plan_player_relativePlaytime_weekmonth","plan_player_relativePlaytime_dayweek","balance_change","lw_rev_total_change","lw_rev_phase_change","leaderboard_position_chems_all_change","leaderboard_position_chems_week_change","leaderboard_position_police_all_change","leaderboard_position_police_week_change","chemrank_change","policerank_change","donorrank_change","goldrank_change"]

# Create preprocessing transformers for encoding and scaling features
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categoricalFeatures),
        ("num", StandardScaler(), numericalFeatures)
    ]
)

# Define LogReg pipeline
MCA_Pipeline_LogReg = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000, multi_class="multinomial", solver="lbfgs"))
])

# Define XGBoost pipeline
MCA_Pipeline_XGB = Pipeline([
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(use_label_encoder=False, eval_metric="logloss", num_class=3, verbosity=0))
])


In [4]:
'''
LOGREG MODEL TRAINING AND TESTING
'''
# Train and test LogReg pipeline
MCA_Pipeline_LogReg.fit(MCA_Features_Train, MCA_Target_Train)
MCA_Target_Pred = MCA_Pipeline_LogReg.predict(MCA_Features_Test)
print(f"Accuracy: {accuracy_score(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Classification Report:\n{classification_report(MCA_Target_Test, MCA_Target_Pred)}")

Accuracy: 0.8313253012048193
Confusion Matrix:
[[ 0 13]
 [ 1 69]]
Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00        13
           2       0.84      0.99      0.91        70

    accuracy                           0.83        83
   macro avg       0.42      0.49      0.45        83
weighted avg       0.71      0.83      0.77        83





In [5]:
'''
XGBOOST MODEL TRAINING AND TESTING

This cell will use a XGBoost pipeline and implement auto hyperparameter tuning to optimize the model's performance.
'''

# Map of hyperparameters and possible values to try tuning XGBoost with
hyperParameterMap = {
    "model__n_estimators": [100, 200, 400],      # boosting rounds
    "model__max_depth": [3, 5, 7],               # tree depth
    "model__learning_rate": [0.01, 0.1, 0.3],    # step size shrinkage
    "model__subsample": [0.8, 1.0],              # row sampling
    "model__colsample_bytree": [0.8, 1.0],       # feature sampling
    "model__scale_pos_weight": [1, 2, 5]         # helps with class imbalance
}

# Grid search for best hyperparameters (5-fold CV)
MCA_Pipeline_GridSearch_XGB = GridSearchCV(
    estimator=MCA_Pipeline_XGB,
    param_grid=hyperParameterMap,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# Train and test XGBoost pipeline
MCA_Pipeline_GridSearch_XGB.fit(MCA_Features_Train, MCA_Target_Train)
MCA_Target_Pred = MCA_Pipeline_GridSearch_XGB.predict(MCA_Features_Test)
print(f"Best Parameters: {MCA_Pipeline_GridSearch_XGB.best_params_}")
print(f"Best Cross-Validation Score: {MCA_Pipeline_GridSearch_XGB.best_score_}")
print(f"Accuracy: {accuracy_score(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Classification Report:\n{classification_report(MCA_Target_Test, MCA_Target_Pred)}")

Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 100, 'model__scale_pos_weight': 1, 'model__subsample': 0.8}
Best Cross-Validation Score: 0.8598135198135198
Accuracy: 0.8313253012048193
Confusion Matrix:
[[ 2 11]
 [ 3 67]]
Classification Report:
              precision    recall  f1-score   support

           1       0.40      0.15      0.22        13
           2       0.86      0.96      0.91        70

    accuracy                           0.83        83
   macro avg       0.63      0.56      0.56        83
weighted avg       0.79      0.83      0.80        83



In [6]:
'''
PIPELINE SAVING

This section saves the entire machine learning pipeline to a file for future use.
'''

# Save the label encoder
joblib.dump(MCA_LabelEncoder, "../model-internals/MCA_LabelEncoder.pkl")

# Save the entire LogReg pipeline
joblib.dump(MCA_Pipeline_LogReg, "../model-internals/MCA_Pipeline_LogReg.pkl")

# Save the entire XGBoost pipeline
joblib.dump(MCA_Pipeline_GridSearch_XGB, "../model-internals/MCA_Pipeline_GridSearch_XGB.pkl")

['../model-internals/MCA_Pipeline_GridSearch_XGB.pkl']