## Setup

In [32]:
### SETUP ###
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import random
from plyer import notification
import matplotlib.pyplot as plt

pd.set_option('future.no_silent_downcasting', True)

data = joblib.load("../data/pkls/rgl_df_dict.pkl")

# Load necessary data

players = data['players']
teams = data['teams']

players['suicide_rate'] = players['suicides'].div(players['deaths']).astype(float).round(4)

team_medic_stats = data['team_medic_stats']
info = data['info']

## Data Manips
- Detect if the teams follow the medic,2 scout, 2 soldier, 1 demo

In [33]:
team_comp = players.groupby(['id', 'team'])['primary_class'].agg(lambda x: ".".join(x)).reset_index(name='class_concat')

team_comps = (team_comp['class_concat'].str.split("."))

# Test if team_comp is correct
correct = []
for team in team_comps:
    if len(team) != 6:
        correct.append(0)
        continue
    demoman = 0
    soldier = 0 
    scout = 0
    medic = 0
    for class_name in team:
        if class_name == 'demoman':
            demoman += 1
        if class_name == 'soldier':
            soldier += 1
        if class_name == 'scout':
            scout += 1
        if class_name == 'medic':
            medic += 1
    if demoman == 1 and soldier == 2 and scout == 2 and medic == 1:
        correct.append(1)
    else:
        correct.append(0)

team_comp['correct'] = correct

team_comp = team_comp.groupby('id').agg(correct_team_comp = ('correct','sum'))

team_comp = team_comp[team_comp['correct_team_comp'] == 2]

players = players[players['id'].isin(team_comp.reset_index()['id'])]

- Remove short matches

In [34]:
short_matches = info[info['length'] < 450]

#players = players[~players['id'].isin(short_matches['id'])]

- Remove non-valid maps

In [35]:
## Check if map is valid
# If the map doesnt have at least 50 plays, not valid.
# Sometimes people just upload as "sunshine" need to ensure there are only single maps

# Find valid map names
play_count = 50
info_correct = info[info['id'].isin(players['id'])].copy()
maps = info_correct['map'].str.lower().str.split("_")
map_counts = pd.Series(maps.str[1].value_counts())

valid_maps = map_counts[map_counts > play_count]
valid_map_names = valid_maps.index

# Grab the first and second word of the mapname
first_map_word_length = maps.str[0].str.split(" ").apply(len)
first_map_word = maps.str[0].str.split(" ").str[0]
second_map_word = maps.str[1]

first_map_length_check = first_map_word_length == 1
first_map_check = first_map_word.isin(valid_map_names)
second_map_check = second_map_word.isin(valid_map_names)

info_correct['map_check'] = (first_map_length_check & first_map_check) | (second_map_check)

correct_map = info_correct[info_correct['map_check'] == True].copy()

#players = players[players['id'].isin(correct_map['id'])]

- Map name

In [36]:
map_list = []
for map in correct_map['map'].str.lower().values:
    for map_name in valid_map_names:
        if map_name in map:
            map_list.append(map_name)

correct_map['map_name'] = map_list

In [37]:
map_list = []
for map in correct_map['map'].str.lower().values:
    for map_name in valid_map_names:
        if map_name in map:
            map_list.append(map_name)

correct_map['map_name'] = map_list

- Fix class_names to have 1's and 2's

In [38]:
import random
# Copy the DataFrame so we don't overwrite the original
players_fixed = players.copy()

# Group by match id and team
grouped = players_fixed.groupby(['id', 'team'])

# Function to randomly rename duplicate classes within each group
def rename_classes_randomly(df):
    random.seed(123)
    np.random.seed(123)
    df = df.copy()  # avoid SettingWithCopyWarning
    for cls in ['scout', 'soldier']:
        indices = df.index[df['primary_class'] == cls].tolist()
        if len(indices) == 2:
            # Randomly shuffle the suffixes
            suffixes = [f"{cls}_1", f"{cls}_2"]
            random.shuffle(suffixes)
            for i, idx in enumerate(indices):
                df.at[idx, 'primary_class'] = suffixes[i]
    return df

# Apply function to each group
players_fixed = grouped.apply(rename_classes_randomly, include_groups=False).reset_index()

# Drop the redundant index column
players_fixed.drop('level_2', axis=1, inplace=True)


- Drop Appropriate Columns

In [39]:
drop_cols = [ 'primary_class_time', 'name',
       'assists', 'cpc','heal', 'hr','deaths', 'dmg', 'dmg_real', 'drops',
       'dt', 'dt_real','kills','medkits','medkits_hp','sentries', 
       'suicides','ka','offclass_time','total_time','kapd','ka','ka_pct','hroi',"dmg_real_pct",
       "dmg_pct",'suicide_rate']

players_fixed.drop(drop_cols,axis =1,inplace = True)

- Make the combat_players and medic_players datasets

In [40]:
combat_classes = ['scout', 'soldier', 'demoman']
pattern = '|'.join(combat_classes)  # Creates 'scout|soldier|demoman'
combat_players = players_fixed[players_fixed['primary_class'].str.contains(pattern, case=False, na=False)].copy()

# Make medic stats
medic_players = players_fixed[players_fixed['primary_class'] == 'medic'].copy()

# Bind in team_medic
medic_players = medic_players.merge(team_medic_stats,on= ['id','team'])


# Drop Bad columns
drop_medic = ['offclass_pct','hroi_real','hr_pct',
              'medicstats.advantages_lost','medicstats.deaths_with_95_99_uber',
       'medicstats.deaths_within_20s_after_uber', 'ubers', 'drops',
       'medic_deaths', 'exchanges_initiated', 'drops_forced',
       'successful_ubers', 'medic_deaths_forced', 'exchanges_not_initiated',
       'successful_uber_rate', 'forced_medic_death_rate', 'forced_drop_rate',
       'medic_deaths_capitalized', 'round_losing_medic_deaths',
       'round_losing_medic_death_rate', 'medic_death_capitalization_rate',
       'advantages_lost_per_round']

drop_combat = ['healps']

combat_players.drop(drop_combat,axis = 1,inplace = True)

medic_players.drop(drop_medic,axis = 1,inplace = True)

# Make columns numeric and fillna with 0
non_numeric_columns = ['id', 'team', 'primary_class','steamid']
for df in [medic_players,combat_players]:
       for col in df.columns:
              if col in non_numeric_columns:
                     continue
              df[col] = pd.to_numeric(df[col])
       df.fillna(0,inplace=True)

# Remove medicstats. from colnames

medic_players.columns = [col.replace("medicstats.","") for col in medic_players.columns]
       

- Widen the datasets

In [41]:
index_columns = ['id', 'team', 'primary_class']
combat_wide = (
    combat_players
    .set_index(index_columns)  # MultiIndex
    .unstack('primary_class')                    # Pivot on class
)

medic_wide = (
    medic_players
    .set_index(index_columns)  # MultiIndex
    .unstack('primary_class')                        # Pivot on class
)

# Step 3: Flatten the MultiIndex column names
combat_wide.columns = [f"{cls}_{stat}" for stat, cls in combat_wide.columns]

# Step 4: Reset index
combat_wide = combat_wide.reset_index()

# Drop non-scout offclass 
cols = [col for col in combat_wide.columns if 'offclass' in col and 'scout' not in col]
combat_wide = combat_wide.drop(cols,axis = 1)

# Step 3: Flatten the MultiIndex column names
medic_wide.columns = [f"{cls}_{stat}" for stat, cls in medic_wide.columns]

# Step 4: Reset index
medic_wide = medic_wide.reset_index()



- Merge Combat, Medic, Mapname and Winner

In [42]:
# Remove duplicate columns
medic_merger = medic_wide.drop(['id','team'],axis =1)

# Merge medic and combat
players_wide = pd.concat([combat_wide,medic_merger],axis = 1)

players_wide = players_wide.merge(teams[['id','team','winner']],on =['id','team'])

- Make Model Datasets

In [43]:
drop_cols = ['id','team','winner'] + [col for col in players_wide if 'steamid' in col]

X = players_wide.drop(drop_cols,axis = 1).copy()

y = players_wide['winner']

- Rank Normalize Data

In [44]:
# Rank the scout and soldier data based on the entire dataset, not the pivot version
scout_soldier = X[[col for col in X.columns if 'scout' in col or 'soldier' in col]].copy()

# Turn the data into long
scout_soldier_long = pd.DataFrame()
for index in ['1','2']:
    df = scout_soldier[[col for col in scout_soldier.columns if index in col]].copy()
    df.columns = [col.replace("_"+index,"") for col in df.columns]
    df['num']= index
    scout_soldier_long = pd.concat([scout_soldier_long,df])

# Drop index and rank
num = scout_soldier_long['num']
scout_soldier_long.drop("num",axis =1,inplace = True)
ranked_scout_soldier = scout_soldier_long.rank(pct=True)

# Re attach index
ranked_scout_soldier['num'] = num

# Widen the datset again
scout_soldier = pd.DataFrame()
for index in ['1','2']:
    df = ranked_scout_soldier[ranked_scout_soldier['num'] == index].copy()
    df.drop('num',axis = 1,inplace = True)
    df.columns = [col + "_" + index for col in df.columns]
    scout_soldier = pd.concat([scout_soldier,df],axis = 1)


# Rank the medic and demo stats
medic_demo = X[[col for col in X.columns if 'scout' not in col and 'soldier' not in col]].copy()

medic_demo = medic_demo.rank(pct=True)

# Merge data back together

X = pd.concat([scout_soldier,medic_demo],axis = 1)

In [45]:
# Map name
team_maps = players_wide.merge(correct_map[['id','map_name']],on = 'id')['map_name']
map_dummies = pd.get_dummies(team_maps)
X = pd.concat([X,map_dummies],axis =1 )
X = X.astype({col: bool for col in X.select_dtypes(include='object').columns})


In [46]:
corr_mat = X.corr().abs()

for i in corr_mat.index:
    for j in corr_mat.columns:
        if i == j:
            continue
        corr = corr_mat.loc[i,j]
        if corr > .75:
            s = f'{i} x {j}'
            print(s)

scout_dapd_1 x scout_kpd_1
soldier_dapd_1 x soldier_kpd_1
scout_kpd_1 x scout_dapd_1
soldier_kpd_1 x soldier_dapd_1
scout_dapd_2 x scout_kpd_2
soldier_dapd_2 x soldier_kpd_2
scout_kpd_2 x scout_dapd_2
soldier_kpd_2 x soldier_dapd_2
demoman_dapd x demoman_kpd
demoman_kpd x demoman_dapd
medic_dapd x medic_dapm
medic_dapm x medic_dapd
medic_kpd x medic_kill_pct
medic_kill_pct x medic_kpd


In [125]:
X

Unnamed: 0,scout_dapd_1,soldier_dapd_1,scout_dapm_1,soldier_dapm_1,scout_kpd_1,soldier_kpd_1,scout_offclass_pct_1,scout_kill_pct_1,soldier_kill_pct_1,scout_deaths_pct_1,...,bagel,clearcut,gullywash,metalworks,process,product,snakewater,sunshine,villa,id
0,0.967798,0.966551,0.968767,0.729501,0.985180,0.971053,0.182548,0.935180,0.267659,0.263296,...,False,False,True,False,False,False,False,False,False,2101642
1,0.010942,0.106717,0.039197,0.392244,0.030886,0.082687,0.555540,0.213435,0.450900,0.700623,...,False,False,True,False,False,False,False,False,False,2101642
2,0.343075,0.409418,0.199307,0.098753,0.270152,0.365789,0.583102,0.136565,0.136981,0.723269,...,False,False,False,False,True,False,False,False,False,2122702
3,0.076316,0.939197,0.093213,0.935042,0.212881,0.859003,0.901108,0.525139,0.948753,0.897853,...,False,False,False,False,True,False,False,False,False,2122702
4,0.701316,0.006163,0.828947,0.013227,0.395983,0.082687,0.513643,0.208449,0.101039,0.363435,...,False,False,False,False,False,False,True,False,False,2122722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3605,0.567936,0.500554,0.664335,0.802493,0.395983,0.365789,0.182548,0.339681,0.510457,0.314197,...,True,True,True,True,True,True,True,True,True,3868229
3606,0.922438,0.904155,0.685042,0.517313,0.578393,0.035180,0.182548,0.288089,0.012119,0.047507,...,True,True,True,True,True,True,True,True,True,3868230
3607,0.842798,0.975346,0.878324,0.935042,0.667313,0.947438,0.182548,0.670706,0.895360,0.797576,...,True,True,True,True,True,True,True,True,True,3868230
3608,0.777008,0.834349,0.649377,0.846191,0.625831,0.603601,0.182548,0.606302,0.644737,0.504224,...,True,True,True,True,True,True,True,True,True,3868241


## Make a Da Model

In [47]:
seed = 123

In [None]:
# Set seeds
random.seed(seed)
np.random.seed(seed)

# Split into test and eval
X['id'] = players_wide['id'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_test, X_eval, y_test, y_eval = train_test_split(X_test, y_test, test_size=0.3)



# Define the base model
model = XGBClassifier(eval_metric='logloss', random_state=seed)

# Define parameter grid to search over
param_grid = {
    'max_depth': np.arange(3, 6, 1),
    'learning_rate': [.15],
    'n_estimators': np.arange(100, 250, 10),
    'subsample': [.8],
    'colsample_bytree': [.75]
}

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=seed)

# Set up GridSearchCV   
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',   
    cv=cv,
    verbose=0,
    n_jobs=22
)

#grid_search.fit(X_train, y_train)

#best_model = grid_search.best_estimator_

#grid_search.best_params_



In [49]:
import joblib
model = joblib.load("../data/pkls/xgb.pkl")

importance = pd.Series(model.feature_importances_,name = "importance")
feature_names = pd.Series(model.feature_names_in_,name = "name")
summary = pd.concat([feature_names,importance],axis = 1).sort_values(by = 'importance',ascending=False)

# Grab necessary vars
score = model.score(X_test,y_test)
probs = model.predict_proba(X_test)
probs = (probs[:,1])

# Assign values to dict 
summary['score'] = score

summary

Unnamed: 0,name,importance,score
52,demoman_kpd,0.072401,0.916887
29,scout_kpd_2,0.054439,0.916887
4,scout_kpd_1,0.053795,0.916887
5,soldier_kpd_1,0.034054,0.916887
30,soldier_kpd_2,0.033322,0.916887
...,...,...,...
84,gullywash,0.000000,0.916887
86,process,0.000000,0.916887
87,product,0.000000,0.916887
88,snakewater,0.000000,0.916887


## Make Per


### Bad Old Method

- Define columns to be set to 0 or 1

In [75]:
cols_to_zero = ['dapd', 'dapm', 'kpd',
       'offclass_pct', 'kill_pct', 'cpc_pct', 'assists_pct',
       'hr_pct', 'hroi_real', 'medkits_hpps','avg_uber_length','uberspm','healps']

cols_to_one = ['dt_pct','dt_real_pct','deaths_pct',
               'avg_time_before_healing','avg_time_before_using',
                 'avg_time_to_build','biggest_advantage_lost',
                 'deaths_with_95_99_uber_rate',
                 'deaths_within_20s_after_uber_rate', 'drops_rate']


def remove_player(df,num,class_name):
    df = df.copy()
    for col in df.columns:
        if class_name not in col or num not in col:
            continue

        if any(name in col for name in cols_to_zero):
            df[col] = .02
        elif any(name in col for name in cols_to_one):
            df[col] = .98
    return df

remove_player_probs = pd.DataFrame()

probs = model.predict_proba(X_test)
probs = probs[:,1]
probs = pd.Series(probs,name = "prediction")

remove_player_probs = pd.concat([remove_player_probs,probs],axis = 1)


for class_name,num in zip(class_name_list,num_list):

    name = "prediction_no_" + class_name + "_" + num
    name2 = "pct_no_" + class_name + "_" + num
    X_test_player_remove = remove_player(df = X_test,class_name = class_name,num = num)

    player_removed_probs = model.predict_proba(X_test_player_remove)

    player_removed_probs = player_removed_probs[:,1]

    player_removed_probs = pd.Series(player_removed_probs,name = name)

    player_removed_probs_pct = pd.Series(player_removed_probs / remove_player_probs['prediction'],name = name2)

    remove_player_probs = pd.concat([remove_player_probs,
                                     player_removed_probs,
                                     player_removed_probs_pct],
                                     axis = 1)



In [None]:
import shap

explainer = shap.TreeExplainer(model)

In [None]:
class_name_list = ['scout','scout','soldier','soldier','medic','demoman']
num_list = ['1','2','1','2','','']


sum_by_class_list = []

for df in [X_test,X_eval]:
    shap_values = explainer.shap_values(df)

    shap_values = pd.DataFrame(shap_values)
    shap_values.columns = df.columns

    shap_values.drop([col for col in shap_values.columns if col in valid_map_names],axis = 1,inplace = True)

    class_names = []

    shap_values_flip = shap_values.T.copy()

    for index in shap_values_flip.index:
        for class_name,num in zip(class_name_list,num_list):
            if class_name in index and num in index:
                class_names.append(class_name + num)
    shap_values.drop([col for col in shap_values.columns if col in valid_map_names],axis = 1,inplace = True)

    shap_values_flip['test'] = class_names

    sum_by_class = shap_values_flip.groupby('test').sum().T
    sum_by_class_list.append(sum_by_class)

In [97]:
import pandas as pd
import numpy as np

from scipy.stats import percentileofscore

def get_quantile_series(reference_series, target_series):
    return target_series.apply(lambda x: percentileofscore(reference_series, x, kind='mean') / 100)


In [99]:
quantiled_df = pd.DataFrame({
    col: get_quantile_series(sum_by_class_list[0][col], sum_by_class_list[1][col])
    for col in sum_by_class_list[1].columns
})


In [120]:
per_test = sum_by_class_list[0].rank(pct = True)
per_test['winner'] = y_test.values

win_per = per_test[per_test['winner']==1]

In [122]:
per_df = (quantiled_df * 10).round(2)
per_df['winner'] = y_eval.values
per_df.groupby("winner").mean()

Unnamed: 0_level_0,demoman,medic,scout1,scout2,soldier1,soldier2
winner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.700608,3.998243,3.270811,3.572635,3.233581,3.562162
1,6.384407,6.289548,6.414633,6.773672,6.392655,6.120621


In [116]:
b = per_df[per_df['winner'] == 1]
b

Unnamed: 0,demoman,medic,scout1,scout2,soldier1,soldier2,winner
1,8.96,2.98,5.67,9.17,8.19,4.67,1
3,6.17,4.53,8.59,1.15,6.31,3.44,1
4,9.30,7.01,1.50,4.97,7.81,2.84,1
9,8.31,0.71,4.91,7.70,8.55,4.83,1
10,5.11,9.12,5.67,7.81,9.63,7.52,1
...,...,...,...,...,...,...,...
318,8.52,1.99,2.37,6.36,0.66,1.72,1
319,4.18,4.37,7.45,7.18,7.35,8.98,1
321,3.61,7.90,8.60,2.08,8.96,3.65,1
322,7.06,5.90,8.11,2.41,5.16,5.74,1


In [123]:
per_df

Unnamed: 0,demoman,medic,scout1,scout2,soldier1,soldier2,winner
0,3.21,6.98,4.43,4.59,7.02,2.37,0
1,8.96,2.98,5.67,9.17,8.19,4.67,1
2,1.99,2.97,3.03,0.74,0.94,0.46,0
3,6.17,4.53,8.59,1.15,6.31,3.44,1
4,9.30,7.01,1.50,4.97,7.81,2.84,1
...,...,...,...,...,...,...,...
320,1.48,9.39,3.89,6.58,1.49,7.85,0
321,3.61,7.90,8.60,2.08,8.96,3.65,1
322,7.06,5.90,8.11,2.41,5.16,5.74,1
323,0.29,1.45,7.85,7.22,7.16,4.75,0
