# MCLabs Churn Analyzer - Model Creation

This Jupyter Notebook will create a ML model, train it on our training data, then offer a simple test analysis using test data.

In [1]:
'''
MODULE/PACKAGE IMPORTS
'''

# System
import os
import re
from glob import glob
from dotenv import load_dotenv
from datetime import datetime, timedelta

# Data
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Pipelining
import joblib

# Output/Display
from tqdm import tqdm

In [4]:
'''
PIPELINE CREATION

This section will create a pipeline for loading the data, splitting the data, scaling the data, and training the model.
'''

# Load the data
modelInputDataframe = pd.read_csv("../data/combined/public/PlayerData_SeptemberTraining.csv")

# Separate features from target
MCA_Features = modelInputDataframe.drop(columns=["churn"])
MCA_Target = modelInputDataframe["churn"]

# Split the data
MCA_Features_Train, MCA_Features_Test, MCA_Target_Train, MCA_Target_Test = train_test_split(MCA_Features, MCA_Target, test_size=0.2)

# Identify which features are categorical
categoricalFeatures = ["plan_player_favorite_server_t1", "plan_player_favorite_server_t2"]

# Identify which features are numerical (note we do not include the last seen time here)
numericalFeatures = ["balance_t1","lw_rev_total_t1","lw_rev_phase_t1","leaderboard_position_chems_all_t1","leaderboard_position_chems_week_t1","leaderboard_position_police_all_t1","leaderboard_position_police_week_t1","mcmmo_power_level_t1","mcmmo_skill_ACROBATICS_t1","mcmmo_skill_ALCHEMY_t1","mcmmo_skill_ARCHERY_t1","mcmmo_skill_AXES_t1","mcmmo_skill_CROSSBOWS_t1","mcmmo_skill_EXCAVATION_t1","mcmmo_skill_FISHING_t1","mcmmo_skill_HERBALISM_t1","mcmmo_skill_MACES_t1","mcmmo_skill_MINING_t1","mcmmo_skill_REPAIR_t1","mcmmo_skill_SALVAGE_t1","mcmmo_skill_SMELTING_t1","mcmmo_skill_SWORDS_t1","mcmmo_skill_TAMING_t1","mcmmo_skill_TRIDENTS_t1","mcmmo_skill_UNARMED_t1","mcmmo_skill_WOODCUTTING_t1","chemrank_t1","policerank_t1","donorrank_t1","goldrank_t1","current_month_votes_t1","plan_player_time_total_raw_t1","plan_player_time_month_raw_t1","plan_player_time_week_raw_t1","plan_player_time_day_raw_t1","plan_player_time_afk_raw_t1","plan_player_latest_session_length_raw_t1","plan_player_sessions_count_t1","plan_player_relativePlaytime_totalmonth_t1","plan_player_relativePlaytime_weekmonth_t1","plan_player_relativePlaytime_dayweek_t1","balance_t2","lw_rev_total_t2","lw_rev_phase_t2","leaderboard_position_chems_all_t2","leaderboard_position_chems_week_t2","leaderboard_position_police_all_t2","leaderboard_position_police_week_t2","mcmmo_power_level_t2","mcmmo_skill_ACROBATICS_t2","mcmmo_skill_ALCHEMY_t2","mcmmo_skill_ARCHERY_t2","mcmmo_skill_AXES_t2","mcmmo_skill_CROSSBOWS_t2","mcmmo_skill_EXCAVATION_t2","mcmmo_skill_FISHING_t2","mcmmo_skill_HERBALISM_t2","mcmmo_skill_MACES_t2","mcmmo_skill_MINING_t2","mcmmo_skill_REPAIR_t2","mcmmo_skill_SALVAGE_t2","mcmmo_skill_SMELTING_t2","mcmmo_skill_SWORDS_t2","mcmmo_skill_TAMING_t2","mcmmo_skill_TRIDENTS_t2","mcmmo_skill_UNARMED_t2","mcmmo_skill_WOODCUTTING_t2","chemrank_t2","policerank_t2","donorrank_t2","goldrank_t2","current_month_votes_t2","plan_player_time_total_raw_t2","plan_player_time_month_raw_t2","plan_player_time_week_raw_t2","plan_player_time_day_raw_t2","plan_player_time_afk_raw_t2","plan_player_latest_session_length_raw_t2","plan_player_sessions_count_t2","plan_player_relativePlaytime_totalmonth_t2","plan_player_relativePlaytime_weekmonth_t2","plan_player_relativePlaytime_dayweek_t2","balance_change","lw_rev_total_change","lw_rev_phase_change","leaderboard_position_chems_all_change","leaderboard_position_chems_week_change","leaderboard_position_police_all_change","leaderboard_position_police_week_change","chemrank_change","policerank_change","donorrank_change","goldrank_change"]

# Create preprocessing transformers for encoding and scaling features
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categoricalFeatures),
        ("num", StandardScaler(), numericalFeatures)
    ]
)

# Define LogReg pipeline
MCA_Pipeline_LogReg = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000, multi_class="multinomial", solver="lbfgs"))
])

# Define XGBoost pipeline
MCA_Pipeline_XGB = Pipeline([
    ("preprocessor", preprocessor),
    ("model", XGBClassifier(use_label_encoder=False, eval_metric="logloss", num_class=4, verbosity=0))
])


In [5]:
'''
LOGREG MODEL TRAINING AND TESTING
'''
# Train and test LogReg pipeline
MCA_Pipeline_LogReg.fit(MCA_Features_Train, MCA_Target_Train)
MCA_Target_Pred = MCA_Pipeline_LogReg.predict(MCA_Features_Test)
print(f"Accuracy: {accuracy_score(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Classification Report:\n{classification_report(MCA_Target_Test, MCA_Target_Pred)}")

Accuracy: 0.901743264659271
Confusion Matrix:
[[520   1   6   0]
 [  4   3   0   0]
 [ 43   0  31   0]
 [  1   4   3  15]]
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95       527
           1       0.38      0.43      0.40         7
           2       0.78      0.42      0.54        74
           3       1.00      0.65      0.79        23

    accuracy                           0.90       631
   macro avg       0.77      0.62      0.67       631
weighted avg       0.90      0.90      0.89       631





In [6]:
'''
XGBOOST MODEL TRAINING AND TESTING

This cell will use a XGBoost pipeline and implement auto hyperparameter tuning to optimize the model's performance.
'''

# Map of hyperparameters and possible values to try tuning XGBoost with
hyperParameterMap = {
    "model__n_estimators": [100, 200, 400],      # boosting rounds
    "model__max_depth": [3, 5, 7],               # tree depth
    "model__learning_rate": [0.01, 0.1, 0.3],    # step size shrinkage
    "model__subsample": [0.8, 1.0],              # row sampling
    "model__colsample_bytree": [0.8, 1.0],       # feature sampling
    "model__scale_pos_weight": [1, 2, 5]         # helps with class imbalance
}

# Grid search for best hyperparameters (5-fold CV)
MCA_Pipeline_GridSearch_XGB = GridSearchCV(
    estimator=MCA_Pipeline_XGB,
    param_grid=hyperParameterMap,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

# Train and test XGBoost pipeline
MCA_Pipeline_GridSearch_XGB.fit(MCA_Features_Train, MCA_Target_Train)
MCA_Target_Pred = MCA_Pipeline_GridSearch_XGB.predict(MCA_Features_Test)
print(f"Best Parameters: {MCA_Pipeline_GridSearch_XGB.best_params_}")
print(f"Best Cross-Validation Score: {MCA_Pipeline_GridSearch_XGB.best_score_}")
print(f"Accuracy: {accuracy_score(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(MCA_Target_Test, MCA_Target_Pred)}")
print(f"Classification Report:\n{classification_report(MCA_Target_Test, MCA_Target_Pred)}")

Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__n_estimators': 200, 'model__scale_pos_weight': 1, 'model__subsample': 0.8}
Best Cross-Validation Score: 0.9334417727487034
Accuracy: 0.9175911251980983
Confusion Matrix:
[[518   9   0   0]
 [  0   5   1   1]
 [ 32   0  39   3]
 [  0   3   3  17]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       527
           1       0.29      0.71      0.42         7
           2       0.91      0.53      0.67        74
           3       0.81      0.74      0.77        23

    accuracy                           0.92       631
   macro avg       0.74      0.74      0.70       631
weighted avg       0.93      0.92      0.91       631



In [7]:
'''
PIPELINE SAVING

This section saves the entire machine learning pipeline to a file for future use.
'''

# Save the entire LogReg pipeline
joblib.dump(MCA_Pipeline_LogReg, "../model-internals/MCA_Pipeline_LogReg.pkl")

# Save the entire XGBoost pipeline
joblib.dump(MCA_Pipeline_GridSearch_XGB, "../model-internals/MCA_Pipeline_GridSearch_XGB.pkl")

['../model-internals/MCA_Pipeline_GridSearch_XGB.pkl']