In [None]:
import numpy as np
import pandas as pd
import pickle

from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform, randint, loguniform
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRFClassifier

import warnings
warnings.filterwarnings('ignore')

data = pd.read_csv('training_targets.csv')

data['radiant_win'].value_counts()

In [None]:
# Encode the response variable
label_encoder = LabelEncoder()
data['radiant_win'] = label_encoder.fit_transform(data['radiant_win'])

In [None]:
def feature_selector(data):

    for team in ['r', 'd']:
        for i in range(1, 6):
            health = f'{team}{i}_health'
            max_health = f'{team}{i}_max_health'
            pct_health = f'{team}{i}_pct_end_health'
            data[pct_health] = (data[health] / data[max_health]) * 100

    # Calculate K/D ratio for each player
    for team in ['r', 'd']:
        for i in range(1, 6):
            kills = f'{team}{i}_kills'
            deaths = f'{team}{i}_deaths'
            assists = f'{team}{i}_assists'
            kd_ratio = f'{team}{i}_kd_ratio'
            last_hits = f'{team}{i}_lh'
            data[kd_ratio] = (data[kills] + data[assists] + data[last_hits]) / (data[deaths] + 1e-4)  # Prevent division by zero

    # Create new features
    team_stats = [
        'kills', 'deaths', 'level', 'assists', 'denies', 'gold', 'xp', 'lh',
        'health', 'max_health', 'stuns', 'creeps_stacked', 'camps_stacked',
        'rune_pickups', 'towers_killed', 'roshans_killed', 'obs_placed', 'sen_placed'
    ]

    for team in ['r', 'd']:
        for stat in team_stats:
            cols = [f'{team}{i}_{stat}' for i in range(1, 6)]
            new_col = f'{team}_total_{stat}'
            data[new_col] = data[cols].sum(axis=1)

            # Drop the old columns
            data.drop(cols, axis=1, inplace=True)

    # Prevent division by zero
    data['game_time'] = data['game_time'].replace(0, 1e-4)

    cols_per_second = ['gold', 'xp', 'kills', 'deaths', 'lh',
                       'creeps_stacked', 'camps_stacked', 'rune_pickups',
                       'towers_killed', 'roshans_killed', 'obs_placed', 'sen_placed']

    # Calculate per second stats for each team
    for team in ['r', 'd']:
        for col in cols_per_second:
            new_col = f'{team}_{col}_per_sec'
            calc_col = f'{team}_total_{col}'
            data[new_col] = data[calc_col] / (data['game_time'])

    ratio_cols = ['total_deaths', 'total_lh', 'total_gold', 'total_xp', 'total_stuns',
                  'total_denies', 'total_creeps_stacked', 'total_camps_stacked',
                  'total_rune_pickups', 'total_towers_killed', 'total_roshans_killed',
                  'total_obs_placed', 'total_sen_placed']

    # Calcuate ratios for each team
    for team in ['r', 'd']:
        for col in ratio_cols:
            new_col = f'{team}_kills_per_{col}'
            calc_col = f'{team}_{col}'
            data[new_col] = (data[f'{team}_{col}'] + 1e-4) / \
                (data[f'{team}_total_kills'] + 1e-4)

    both_team_cols = ['total_kills', 'total_deaths', 'total_lh', 'total_gold', 'total_xp',
                      'total_stuns', 'total_denies', 'total_creeps_stacked',
                      'total_camps_stacked', 'total_rune_pickups', 'total_towers_killed',
                      'total_roshans_killed', 'total_obs_placed', 'total_sen_placed']
    # Add total stats for both teams
    for col in both_team_cols:
        newcol = f'add_{col}'
        data[newcol] = data[f'r_{col}'] + data[f'd_{col}']

    # Subtract total stats for both teams
    for col in both_team_cols:
        newcol = f'sub_{col}'
        data[f'r_{newcol}'] = data[f'r_{col}'] - data[f'd_{col}']
        data[f'd_{newcol}'] = data[f'd_{col}'] - data[f'r_{col}']

    data["xp_to_gold_ratio"] = data["r_total_xp"] / (data["r_total_gold"] + 1e-6)
    data["resource_per_health"] = (data["r_total_gold"] + data["r_total_xp"]) / (data["r_total_health"] + 1e-6)

    data["r_kill_participation"] = data["r_total_assists"] / (data["add_total_kills"] + 1e-6)
    data["d_kill_participation"] = data["d_total_assists"] / (data["add_total_kills"] + 1e-6)

    data["r_damage_per_death"] = data["r_total_stuns"] / (data["r_total_deaths"] + 1e-6)
    data["d_damage_per_death"] = data["d_total_stuns"] / (data["d_total_deaths"] + 1e-6)

    data["r_aggression_ratio"] = data["r_total_kills"] / (data["r_total_towers_killed"] + 1e-6)
    data["d_aggression_ratio"] = data["d_total_kills"] / (data["d_total_towers_killed"] + 1e-6)

    data["gold_x_kills"] = data["r_total_gold"] * data["r_total_kills"]
    data["xp_x_deaths"] = data["r_total_xp"] * data["r_total_deaths"]

    data["log_r_gold"] = np.log1p(data["r_total_gold"])
    data["log_d_gold"] = np.log1p(data["d_total_gold"])

    return data.copy()

In [None]:
filtered_data = feature_selector(data)

filtered_data.head()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(filtered_data.drop(columns=['radiant_win', 'match_id_hash']),
                                                    filtered_data['radiant_win'],
                                                    test_size=0.3,
                                                    random_state=86
                                                    )

In [263]:
lscaler = StandardScaler()
tmp_X_train = lscaler.fit_transform(X_train)
tmp_X_train = pd.DataFrame(X_train, columns=filtered_data.drop(
    columns=['radiant_win', 'match_id_hash']).columns)

lmodel = LogisticRegression(max_iter=1000, random_state=86)
lmodel.fit(tmp_X_train, y_train)

pd.DataFrame(lmodel.coef_, columns=tmp_X_train.columns).T.sort_values(
    by=0, ascending=False)

Unnamed: 0,0
gold_x_kills,1.721700e-06
r5_kd_ratio,4.924542e-07
r4_kd_ratio,4.097548e-07
r1_kd_ratio,3.781210e-07
r2_kd_ratio,3.742282e-07
...,...
d1_kd_ratio,-3.322799e-07
d2_kd_ratio,-3.494988e-07
d4_kd_ratio,-3.546098e-07
d3_kd_ratio,-4.158785e-07


In [264]:
# Feature selection
# selector = RFECV(
#    XGBRFClassifier(),
#    step=1,
#    cv=StratifiedKFold(shuffle=True, random_state=86),
#    scoring='roc_auc',
#    n_jobs=-1
#    )
#
# selected_features = selector.fit(X_train, y_train)
#
# features_to_keep = X_train.columns[selected_features.support_]

features_to_keep = X_train.columns

In [265]:
# Create XGBRFClassifier model
hyper_params = {
    'n_estimators': randint(50, 1000),
    'max_depth': randint(2, 10),
    'learning_rate': uniform(0.01, 0.5),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

xgbrf_model = RandomizedSearchCV(
    XGBRFClassifier(),
    param_distributions=hyper_params,
    n_iter=10,
    scoring='f1',
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=86),
    n_jobs=-1,
    random_state=86
)

xgbrf_model.fit(X_train[features_to_keep], y_train)
print(f"Best Hyperparameters: {xgbrf_model.best_params_}")

# Print the F1 score for test and training data
best_model_train_pred = xgbrf_model.predict(X_train[features_to_keep])
print(f"Best Model Training F1 Score: {f1_score(y_train, best_model_train_pred)}")
best_model_test_pred = xgbrf_model.predict(X_test[features_to_keep])
print(f"Best Model Testing F1 Score: {f1_score(y_test, best_model_test_pred)}")

# Save the best model
with open('xgbrf_model.pkl', 'wb') as f:
    pickle.dump(xgbrf_model, f)

Best Hyperparameters: {'colsample_bytree': 0.6437645151243164, 'learning_rate': 0.3498089083467899, 'max_depth': 6, 'n_estimators': 204, 'subsample': 0.6381542262777193}
Best Model Training F1 Score: 0.7780183180682765
Best Model Testing F1 Score: 0.7540437212038036


In [None]:
# Import the validation data
validation_data = pd.read_csv('test_features.csv')

processed_validation_data = feature_selector(validation_data)
processed_validation_data = processed_validation_data.drop(columns=[
                                                           'match_id_hash'])

# Make predictions on the validation data
validation_predictions = xgbrf_model.predict(
    processed_validation_data[features_to_keep])

results = pd.DataFrame({
    'match_id_hash': validation_data['match_id_hash'],
    'radiant_win': label_encoder.inverse_transform(validation_predictions)
})
results.to_csv('submission.csv', index=False)

results.info()