## Setup

In [1]:
### SETUP ###
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import random
from plyer import notification
from scipy.stats import percentileofscore
import matplotlib.pyplot as plt

pd.set_option('future.no_silent_downcasting', True)

data = joblib.load("../data/pkls/rgl_df_dict.pkl")

# Load necessary data

players = data['players']
teams = data['teams']

players['suicide_rate'] = players['suicides'].div(players['deaths']).astype(float).round(4)

team_medic_stats = data['team_medic_stats']
info = data['info']

## Data Manips
- Detect if the teams follow the medic,2 scout, 2 soldier, 1 demo

In [142]:
team_comp = players.groupby(['id', 'team'])['primary_class'].agg(lambda x: ".".join(x)).reset_index(name='class_concat')

team_comps = (team_comp['class_concat'].str.split("."))

# Test if team_comp is correct
correct = []
for team in team_comps:
    if len(team) != 6:
        correct.append(0)
        continue
    demoman = 0
    soldier = 0 
    scout = 0
    medic = 0
    for class_name in team:
        if class_name == 'demoman':
            demoman += 1
        if class_name == 'soldier':
            soldier += 1
        if class_name == 'scout':
            scout += 1
        if class_name == 'medic':
            medic += 1
    if demoman == 1 and soldier == 2 and scout == 2 and medic == 1:
        correct.append(1)
    else:
        correct.append(0)

team_comp['correct'] = correct

team_comp = team_comp.groupby('id').agg(correct_team_comp = ('correct','sum'))

team_comp = team_comp[team_comp['correct_team_comp'] == 2]

players = players[players['id'].isin(team_comp.reset_index()['id'])]

- Remove short matches

In [143]:
short_matches = info[info['length'] < 450]

players = players[~players['id'].isin(short_matches['id'])]

- Remove non-valid maps

In [144]:
## Check if map is valid
# If the map doesnt have at least 50 plays, not valid.
# Sometimes people just upload as "sunshine" need to ensure there are only single maps

# Find valid map names
play_count = 50
info_correct = info[info['id'].isin(players['id'])].copy()
maps = info_correct['map'].str.lower().str.split("_")
map_counts = pd.Series(maps.str[1].value_counts())

valid_maps = map_counts[map_counts > play_count]
valid_map_names = valid_maps.index

# Grab the first and second word of the mapname
first_map_word_length = maps.str[0].str.split(" ").apply(len)
first_map_word = maps.str[0].str.split(" ").str[0]
second_map_word = maps.str[1]

first_map_length_check = first_map_word_length == 1
first_map_check = first_map_word.isin(valid_map_names)
second_map_check = second_map_word.isin(valid_map_names)

info_correct['map_check'] = (first_map_length_check & first_map_check) | (second_map_check)

correct_map = info_correct[info_correct['map_check'] == True].copy()

players = players[players['id'].isin(correct_map['id'])]

- Map name

In [145]:
map_list = []
for map in correct_map['map'].str.lower().values:
    for map_name in valid_map_names:
        if map_name in map:
            map_list.append(map_name)

correct_map['map_name'] = map_list

In [146]:
map_list = []
for map in correct_map['map'].str.lower().values:
    for map_name in valid_map_names:
        if map_name in map:
            map_list.append(map_name)

correct_map['map_name'] = map_list

- Fix class_names to have 1's and 2's

In [147]:
import random
# Copy the DataFrame so we don't overwrite the original
players_fixed = players.copy()

# Group by match id and team
grouped = players_fixed.groupby(['id', 'team'])

# Function to randomly rename duplicate classes within each group
def rename_classes_randomly(df):
    random.seed(123)
    np.random.seed(123)
    df = df.copy()  # avoid SettingWithCopyWarning
    for cls in ['scout', 'soldier']:
        indices = df.index[df['primary_class'] == cls].tolist()
        if len(indices) == 2:
            # Randomly shuffle the suffixes
            suffixes = [f"{cls}_1", f"{cls}_2"]
            random.shuffle(suffixes)
            for i, idx in enumerate(indices):
                df.at[idx, 'primary_class'] = suffixes[i]
    return df

# Apply function to each group
players_fixed = grouped.apply(rename_classes_randomly, include_groups=False).reset_index()

# Drop the redundant index column
players_fixed.drop('level_2', axis=1, inplace=True)


- Drop Appropriate Columns

In [148]:
drop_cols = [ 'primary_class_time', 'name',
       'assists', 'cpc','heal', 'hr','deaths', 'dmg', 'dmg_real', 'drops',
       'dt', 'dt_real','kills','medkits','medkits_hp','sentries', 
       'suicides','ka','offclass_time','total_time','kapd','ka','ka_pct','hroi',"dmg_real_pct",
       "dmg_pct",'suicide_rate']

players_fixed.drop(drop_cols,axis =1,inplace = True)

- Make the combat_players and medic_players datasets

In [149]:
combat_classes = ['scout', 'soldier', 'demoman']
pattern = '|'.join(combat_classes)  # Creates 'scout|soldier|demoman'
combat_players = players_fixed[players_fixed['primary_class'].str.contains(pattern, case=False, na=False)].copy()

# Make medic stats
medic_players = players_fixed[players_fixed['primary_class'] == 'medic'].copy()

# Bind in team_medic
medic_players = medic_players.merge(team_medic_stats,on= ['id','team'])


# Drop Bad columns
drop_medic = ['offclass_pct','hroi_real','hr_pct',
              'medicstats.advantages_lost','medicstats.deaths_with_95_99_uber',
       'medicstats.deaths_within_20s_after_uber', 'ubers', 'drops',
       'medic_deaths', 'exchanges_initiated', 'drops_forced',
       'successful_ubers', 'medic_deaths_forced', 'exchanges_not_initiated',
       'successful_uber_rate', 'forced_medic_death_rate', 'forced_drop_rate',
       'medic_deaths_capitalized', 'round_losing_medic_deaths',
       'round_losing_medic_death_rate', 'medic_death_capitalization_rate',
       'advantages_lost_per_round']

drop_combat = ['healps']

combat_players.drop(drop_combat,axis = 1,inplace = True)

medic_players.drop(drop_medic,axis = 1,inplace = True)

# Make columns numeric and fillna with 0
non_numeric_columns = ['id', 'team', 'primary_class','steamid']
for df in [medic_players,combat_players]:
       for col in df.columns:
              if col in non_numeric_columns:
                     continue
              df[col] = pd.to_numeric(df[col])
       df.fillna(0,inplace=True)

# Remove medicstats. from colnames

medic_players.columns = [col.replace("medicstats.","") for col in medic_players.columns]
       

- Widen the datasets

In [150]:
index_columns = ['id', 'team', 'primary_class']
combat_wide = (
    combat_players
    .set_index(index_columns)  # MultiIndex
    .unstack('primary_class')                    # Pivot on class
)

medic_wide = (
    medic_players
    .set_index(index_columns)  # MultiIndex
    .unstack('primary_class')                        # Pivot on class
)

# Step 3: Flatten the MultiIndex column names
combat_wide.columns = [f"{cls}_{stat}" for stat, cls in combat_wide.columns]

# Step 4: Reset index
combat_wide = combat_wide.reset_index()

# Drop non-scout offclass 
cols = [col for col in combat_wide.columns if 'offclass' in col and 'scout' not in col]
combat_wide = combat_wide.drop(cols,axis = 1)

# Step 3: Flatten the MultiIndex column names
medic_wide.columns = [f"{cls}_{stat}" for stat, cls in medic_wide.columns]

# Step 4: Reset index
medic_wide = medic_wide.reset_index()



- Merge Combat, Medic, Mapname and Winner

In [151]:
# Remove duplicate columns
medic_merger = medic_wide.drop(['id','team'],axis =1)

# Merge medic and combat
players_wide = pd.concat([combat_wide,medic_merger],axis = 1)

players_wide = players_wide.merge(teams[['id','team','winner']],on =['id','team'])

- Make Model Datasets

In [152]:
drop_cols = ['id','team','winner'] + [col for col in players_wide if 'steamid' in col]

X = players_wide.drop(drop_cols,axis = 1).copy()

y = players_wide['winner']

- Rank Normalize Data

In [153]:
# Rank the scout and soldier data based on the entire dataset, not the pivot version
scout_soldier = X[[col for col in X.columns if 'scout' in col or 'soldier' in col]].copy()

# Turn the data into long
scout_soldier_long = pd.DataFrame()
for index in ['1','2']:
    df = scout_soldier[[col for col in scout_soldier.columns if index in col]].copy()
    df.columns = [col.replace("_"+index,"") for col in df.columns]
    df['num']= index
    scout_soldier_long = pd.concat([scout_soldier_long,df])

# Drop index and rank
num = scout_soldier_long['num']
scout_soldier_long.drop("num",axis =1,inplace = True)
ranked_scout_soldier = scout_soldier_long.rank(pct=True)

# Re attach index
ranked_scout_soldier['num'] = num

# Widen the datset again
scout_soldier = pd.DataFrame()
for index in ['1','2']:
    df = ranked_scout_soldier[ranked_scout_soldier['num'] == index].copy()
    df.drop('num',axis = 1,inplace = True)
    df.columns = [col + "_" + index for col in df.columns]
    scout_soldier = pd.concat([scout_soldier,df],axis = 1)


# Rank the medic and demo stats
medic_demo = X[[col for col in X.columns if 'scout' not in col and 'soldier' not in col]].copy()

medic_demo = medic_demo.rank(pct=True)

# Merge data back together

X = pd.concat([scout_soldier,medic_demo],axis = 1)

In [154]:
# # Map name
# team_maps = players_wide.merge(correct_map[['id','map_name']],on = 'id')['map_name']
# map_dummies = pd.get_dummies(team_maps)
# X = pd.concat([X,map_dummies],axis =1 )
# X = X.astype({col: bool for col in X.select_dtypes(include='object').columns})


In [155]:
corr_mat = X.corr().abs()

for i in corr_mat.index:
    for j in corr_mat.columns:
        if i == j:
            continue
        corr = corr_mat.loc[i,j]
        if corr > .75:
            s = f'{i} x {j}'
            print(s)

scout_dapd_1 x scout_kpd_1
soldier_dapd_1 x soldier_kpd_1
scout_kpd_1 x scout_dapd_1
soldier_kpd_1 x soldier_dapd_1
scout_dapd_2 x scout_kpd_2
soldier_dapd_2 x soldier_kpd_2
scout_kpd_2 x scout_dapd_2
soldier_kpd_2 x soldier_dapd_2
demoman_dapd x demoman_kpd
demoman_kpd x demoman_dapd
medic_dapd x medic_dapm
medic_dapm x medic_dapd
medic_kpd x medic_kill_pct
medic_kill_pct x medic_kpd


## Make a Da Model

In [156]:
seed = 123

In [157]:
# Set seeds
random.seed(seed)
np.random.seed(seed)

# Split into test and eval
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X_test, X_eval, y_test, y_eval = train_test_split(X_test, y_test, test_size=0.3)



# Define the base model
model = XGBClassifier(eval_metric='logloss', random_state=seed)

# Define parameter grid to search over
param_grid = {
    'max_depth': np.arange(3, 6, 1),
    'learning_rate': [.15],
    'n_estimators': np.arange(100, 250, 10),
    'subsample': [.8],
    'colsample_bytree': [.75]
}

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=seed)

# Set up GridSearchCV   
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',   
    cv=cv,
    verbose=0,
    n_jobs=22
)

#grid_search.fit(X_train, y_train)

#best_model = grid_search.best_estimator_

#grid_search.best_params_



In [158]:
data_list = joblib.load('../data/pkls/model_ready_data_list.pkl')
data_list[0]

Unnamed: 0,scout_dapd_1,soldier_dapd_1,scout_dapm_1,soldier_dapm_1,scout_kpd_1,soldier_kpd_1,scout_offclass_pct_1,scout_kill_pct_1,soldier_kill_pct_1,scout_deaths_pct_1,...,medic_medkits_hpps,medic_avg_time_before_healing,medic_avg_time_before_using,medic_avg_time_to_build,medic_avg_uber_length,medic_biggest_advantage_lost,medic_deaths_with_95_99_uber_rate,medic_deaths_within_20s_after_uber_rate,medic_drops_rate,medic_uberspm
0,0.969421,0.969114,0.970034,0.727698,0.987508,0.973942,0.180717,0.939224,0.266248,0.262032,...,0.946045,0.987738,0.081392,0.362661,0.433323,0.166922,0.390558,0.405426,0.819742,0.911097
1,0.006591,0.104537,0.035331,0.388412,0.024142,0.078173,0.556101,0.208768,0.452177,0.703709,...,0.359289,0.839975,0.027897,0.282189,0.976395,0.166922,0.390558,0.066983,0.226548,0.023605
2,0.336527,0.411251,0.192673,0.096260,0.262876,0.366493,0.583231,0.130595,0.134197,0.725782,...,0.607296,0.223483,0.396229,0.544758,0.878909,0.166922,0.390558,0.668455,0.226548,0.878142
3,0.070202,0.942980,0.086987,0.936159,0.205166,0.860515,0.903280,0.525061,0.951870,0.899525,...,0.321122,0.238351,0.317903,0.300429,0.791845,0.166922,0.844727,0.703709,0.226548,0.790466
4,0.699877,0.004675,0.826793,0.011343,0.390635,0.078173,0.514792,0.203326,0.097103,0.362431,...,0.945586,0.293991,0.669068,0.169988,0.476242,0.382281,0.390558,0.983292,0.226548,0.308400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257,0.551579,0.772302,0.902131,0.961833,0.776671,0.819742,0.180717,0.904736,0.743102,0.637799,...,0.500920,0.559319,0.234979,0.703096,0.262416,0.166922,0.833998,0.501380,0.226548,0.825874
3258,0.826410,0.736281,0.849785,0.920601,0.803801,0.696582,0.180717,0.663243,0.684166,0.250383,...,0.959227,0.318516,0.148375,0.378602,0.562998,0.625996,0.390558,0.145616,0.507664,0.853771
3259,0.563841,0.503985,0.660025,0.802269,0.390635,0.366493,0.180717,0.335684,0.512492,0.312922,...,0.623084,0.183017,0.100399,0.438535,0.567290,0.444206,0.390558,0.464899,0.226548,0.853771
3260,0.777131,0.839056,0.644620,0.846567,0.623774,0.606453,0.180717,0.608752,0.646689,0.505212,...,0.937155,0.204169,0.224402,0.067443,0.683476,0.879062,0.390558,0.251686,0.543532,0.968118


In [159]:
import joblib
model = joblib.load("../data/pkls/xgb.pkl")

importance = pd.Series(model.feature_importances_,name = "importance")
feature_names = pd.Series(model.feature_names_in_,name = "name")
summary = pd.concat([feature_names,importance],axis = 1).sort_values(by = 'importance',ascending=False)

# Grab necessary vars
score = model.score(X_test,y_test)
probs = model.predict_proba(X_test)
probs = (probs[:,1])

# Assign values to dict 
summary['score'] = score

summary

Unnamed: 0,name,importance,score
52,demoman_kpd,0.077209,0.886214
4,scout_kpd_1,0.045013,0.886214
29,scout_kpd_2,0.044176,0.886214
5,soldier_kpd_1,0.033010,0.886214
0,scout_dapd_1,0.032977,0.886214
...,...,...,...
54,demoman_deaths_pct,0.004168,0.886214
63,medic_dapm,0.003041,0.886214
7,scout_kill_pct_1,0.002560,0.886214
45,soldier_hr_pct_2,0.002435,0.886214


## Make Per


### Bad Old Method

- Define columns to be set to 0 or 1

In [67]:
class_name_list = ['scout','scout','soldier','soldier','medic','demoman']
num_list = ['1','2','1','2','','']
cols_to_zero = ['dapd', 'dapm', 'kpd',
       'offclass_pct', 'kill_pct', 'cpc_pct', 'assists_pct',
       'hr_pct', 'hroi_real', 'medkits_hpps','avg_uber_length','uberspm','healps']

cols_to_one = ['dt_pct','dt_real_pct','deaths_pct',
               'avg_time_before_healing','avg_time_before_using',
                 'avg_time_to_build','biggest_advantage_lost',
                 'deaths_with_95_99_uber_rate',
                 'deaths_within_20s_after_uber_rate', 'drops_rate']


def remove_player(df,num,class_name):
    df = df.copy()
    for col in df.columns:
        if class_name not in col or num not in col:
            continue

        if any(name in col for name in cols_to_zero):
            df[col] = .02
        elif any(name in col for name in cols_to_one):
            df[col] = .98
    return df

remove_player_probs = pd.DataFrame()

probs = model.predict_proba(X_test)
probs = probs[:,1]
probs = pd.Series(probs,name = "prediction")

remove_player_probs = pd.concat([remove_player_probs,probs],axis = 1)


for class_name,num in zip(class_name_list,num_list):

    name = "prediction_no_" + class_name + "_" + num
    name2 = "pct_no_" + class_name + "_" + num
    X_test_player_remove = remove_player(df = X_test,class_name = class_name,num = num)

    player_removed_probs = model.predict_proba(X_test_player_remove)

    player_removed_probs = player_removed_probs[:,1]

    player_removed_probs = pd.Series(player_removed_probs,name = name)

    player_removed_probs_pct = pd.Series(player_removed_probs / remove_player_probs['prediction'],name = name2)

    remove_player_probs = pd.concat([remove_player_probs,
                                     player_removed_probs,
                                     player_removed_probs_pct],
                                     axis = 1)



### New Per Method

In [160]:
import shap

explainer = shap.TreeExplainer(model)

In [161]:
class_name_list = ['scout','scout','soldier','soldier','medic','demoman']
num_list = ['1','2','1','2','','']


sum_by_class_list = []

for df in [X_test,X_eval]:
    shap_values = explainer.shap_values(df)

    shap_values = pd.DataFrame(shap_values)
    shap_values.columns = df.columns

    shap_values.drop([col for col in shap_values.columns if col in valid_map_names],axis = 1,inplace = True)

    class_names = []

    shap_values_flip = shap_values.T.copy()

    for index in shap_values_flip.index:
        for class_name,num in zip(class_name_list,num_list):
            if class_name in index and num in index:
                class_names.append(class_name + num)
    shap_values.drop([col for col in shap_values.columns if col in valid_map_names],axis = 1,inplace = True)

    shap_values_flip['test'] = class_names

    sum_by_class = shap_values_flip.groupby('test').sum().T
    sum_by_class_list.append(sum_by_class)

In [162]:

def get_quantile_series(reference_series, target_series):
    return target_series.apply(lambda x: percentileofscore(reference_series, x, kind='mean') / 100)


In [163]:
quantiled_df = pd.DataFrame({
    col: get_quantile_series(sum_by_class_list[0][col], sum_by_class_list[1][col])
    for col in sum_by_class_list[1].columns
})


In [164]:
per_test = sum_by_class_list[0].rank(pct = True)
per_test['winner'] = y_test.values

win_per = per_test[per_test['winner']==1]

In [165]:
per_df = (quantiled_df * 10).round(2)
per_df['winner'] = y_eval.values
per_df.groupby("winner").mean()

Unnamed: 0_level_0,demoman,medic,scout1,scout2,soldier1,soldier2
winner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.691845,3.880194,3.406117,3.09068,3.809515,3.713398
1,6.36043,5.830968,6.086882,6.033763,6.444624,6.716989


In [None]:
s = "qseqSA"
s.lower()

'qseqsa'

: 

In [171]:
per_test

test,demoman,medic,scout1,scout2,soldier1,soldier2,winner
0,0.287129,0.005941,0.106931,0.128713,0.132673,0.524752,0
1,0.415842,0.112871,0.061386,0.312871,0.013861,0.849505,0
2,0.689109,0.782178,0.964356,0.330693,0.772277,0.687129,1
3,0.833663,0.798020,0.043564,0.506931,0.819802,0.495050,1
4,0.928713,0.906931,0.542574,0.061386,0.974257,0.801980,1
...,...,...,...,...,...,...,...
500,0.542574,0.623762,0.445545,0.877228,0.916832,0.897030,1
501,0.936634,0.340594,0.021782,0.530693,0.566337,0.500990,0
502,0.356436,0.392079,0.699010,0.473267,0.568317,0.758416,0
503,0.134653,0.255446,0.247525,0.190099,0.483168,0.269307,0
