In [39]:
%matplotlib inline
import ast
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()  # for plot styling
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import silhouette_score, silhouette_samples, davies_bouldin_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

In [40]:
# Daily battle-log exports to combine into one DataFrame.
# The 01022021 export used to be listed twice, which loaded (and therefore
# counted) every battle in that file two times; each path now appears once.
# NOTE(review): there is no 01032021 entry in the sequence — the duplicate was
# presumably meant to be that file. Confirm it exists and add it here.
files = [
    './data/BattlesStaging_01012021_WL_tagged.csv',
    './data/BattlesStaging_01022021_WL_tagged.csv',
    './data/BattlesStaging_01042021_WL_tagged.csv',
    './data/battlesStaging_12272020_WL_tagged.csv',
    './data/battlesStaging_12282020_WL_tagged.csv',
    './data/BattlesStaging_12292020_WL_tagged.csv',
    './data/BattlesStaging_12302020_WL_tagged.csv',
    './data/BattlesStaging_12312020_WL_tagged.csv',
]

# Fail fast if a path is ever listed twice again: a repeated file silently
# inflates the win/loss counts computed later in the notebook.
assert len(files) == len(set(files)), "duplicate path in `files`"

# Read every export and stack them with a fresh 0..n-1 index.
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
df.head()

KeyboardInterrupt: 

In [None]:
def canonical_deck(deck):
    """Return an order-independent, hashable key for a deck.

    Parameters
    ----------
    deck : str, iterable, or missing scalar
        Either the text of a Python literal holding the card ids
        (e.g. ``"[3, 1, 2]"``), an iterable of values convertible with
        ``int()``, or a missing value (``None``, ``NaN``, ``pd.NA``).

    Returns
    -------
    tuple of int or None
        The card ids converted to ``int`` and sorted ascending, or ``None``
        when ``deck`` is a missing value.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a string cannot be parsed.
    TypeError, ValueError
        Propagated from iteration / ``int()`` when the contents are not
        card ids.
    """
    if deck is None:
        return None

    if isinstance(deck, str):
        # literal_eval only evaluates Python literals, never arbitrary code.
        deck = ast.literal_eval(deck)
    elif pd.api.types.is_scalar(deck) and pd.isna(deck):
        # Only scalars can be "missing". Calling pd.isna on a list/tuple/array
        # returns an element-wise boolean array, and using that array in an
        # `if` raises "truth value of an array is ambiguous".
        return None

    # Sorting makes decks with the same cards in a different order compare
    # equal; the tuple makes the result usable as a dict key.
    return tuple(sorted(int(card) for card in deck))

In [None]:
# Tally wins, losses and games played for every distinct deck.
# Keys are the canonical (sorted) card-id tuples returned by canonical_deck;
# values are dicts with 'wins', 'losses' and 'games' counters.
deck_stats = {}
df_final = df[['winner.cards.list', 'loser.cards.list']]

# Walk the two columns in lockstep instead of DataFrame.iterrows(), which
# builds a new Series object for every row and is far slower on large frames.
for winner_raw, loser_raw in zip(df_final['winner.cards.list'],
                                 df_final['loser.cards.list']):
    wdeck = canonical_deck(winner_raw)
    ldeck = canonical_deck(loser_raw)

    # Skip battles where either deck is missing.
    if wdeck is None or ldeck is None:
        continue

    # setdefault creates the counter record the first time a deck is seen.
    # The winner is registered first so dict insertion order is unchanged.
    winner_stats = deck_stats.setdefault(wdeck, {'wins': 0, 'losses': 0, 'games': 0})
    loser_stats = deck_stats.setdefault(ldeck, {'wins': 0, 'losses': 0, 'games': 0})

    # In a mirror match both names refer to the same record, which then
    # receives one win, one loss and two games.
    winner_stats['wins'] += 1
    winner_stats['games'] += 1
    loser_stats['losses'] += 1
    loser_stats['games'] += 1

In [None]:
# Flatten the per-deck counters into one record per distinct deck.
rows = []
for deck, vals in deck_stats.items():
    wins, losses, games = (vals.get(field, 0) for field in ('wins', 'losses', 'games'))
    # Guard against division by zero for a deck with no recorded games.
    winrate = wins / games if games > 0 else 0
    rows.append({
        'deck': deck,
        'wins': wins,
        'losses': losses,
        'games': games,
        'winrate': winrate,
    })

# One row per deck
deck_stats_df = pd.DataFrame(rows)

# Render each deck tuple as a comma-separated string so it prints compactly
deck_stats_df['deck'] = deck_stats_df['deck'].map(
    lambda cards: ','.join(str(card) for card in cards)
)

# Best win rate first
deck_stats_df = deck_stats_df.sort_values(by='winrate', ascending=False).reset_index(drop=True)

# Drop decks that were played fewer than min_games times
min_games = 75
original_count = len(deck_stats_df)
enough_games = deck_stats_df['games'].ge(min_games)
filtered = deck_stats_df.loc[enough_games].reset_index(drop=True).copy()
filtered_count = len(filtered)
print(f"Filtered out {original_count - filtered_count} decks with fewer than {min_games} games; {filtered_count} decks remain.")

# Work on an independent copy so the column assignments below never write
# into a view of `filtered`
base_cols = ['deck', 'winrate', 'wins', 'losses', 'games']
df_final = filtered[base_cols].copy()

# Expand the deck string into one column per card position
split_cols = df_final['deck'].str.split(',', expand=True)
num_cards = split_cols.shape[1]
card_col_names = [f'card_{i+1}' for i in range(num_cards)]

# Store each card position as a nullable integer column; values that do not
# parse as numbers become <NA>
for card_name, col in zip(card_col_names, split_cols.columns):
    card_values = pd.to_numeric(split_cols[col].str.strip(), errors='coerce')
    df_final.loc[:, card_name] = card_values.astype('Int64')

# Summary columns first, then the card columns (only those that exist)
col_order = base_cols + card_col_names
col_order = [c for c in col_order if c in df_final.columns]

df_final = df_final[col_order]

# Show the ten decks with the highest win rate
print('\nTop 10 decks (after filtering):')
print(df_final.head(10).to_string(index=False))

# Keep df_final available for later use


In [None]:
# Preview the first rows of the filtered per-deck table.
df_final.head()

In [None]:
# (rows, columns) of the filtered per-deck table.
df_final.shape

In [None]:


# Build the feature matrix and labels from the per-deck table, then split.
# NOTE(review): no visible cell defined `X`, `y` or `mlb`, so this cell (and the
# prediction helper that calls `mlb.transform`) raised NameError on a fresh
# kernel. The definitions below are a reconstruction — confirm them against
# the original intent, in particular the label rule.

# df_final['deck'] holds each deck as a comma-separated string of card ids.
deck_cards = [[int(card) for card in deck_str.split(',')] for deck_str in df_final['deck']]

# Multi-hot encoding: one indicator column per distinct card id.
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(deck_cards)

# ASSUMPTION (TODO confirm): a deck is labelled 1 ("winning") when it won more
# than half of its games, 0 otherwise.
y = (df_final['winrate'] > 0.5).astype(int)

# Hold out 20% of the decks for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter caps the number of solver iterations.
model_ngram = LogisticRegression(max_iter=100)
model_ngram.fit(X_train, y_train)

# `score` returns the mean accuracy on the given data — here the held-out split.
accuracy = model_ngram.score(X_test, y_test)
print(f"Model Accuracy: {accuracy:.2%}")

In [None]:
def predict_win_probability(card_ids):
    """Return the model's predicted probability for the second class of a deck.

    Parameters
    ----------
    card_ids : iterable
        Card ids of one deck. They are encoded with the notebook-level
        ``mlb`` encoder (presumably a fitted MultiLabelBinarizer — confirm).

    Returns
    -------
    float
        ``model_ngram.predict_proba(...)[0][1]``: the probability of the
        second entry of ``model_ngram.classes_`` for this single deck.
        NOTE(review): treating this as the win probability assumes that
        class is the "winning" label — verify against how ``y`` was built.
    """
    # transform expects a collection of samples, hence the one-element list.
    x_input = mlb.transform([card_ids])
    # Row 0 is the only sample; column 1 is the second class.
    return model_ngram.predict_proba(x_input)[0][1]


# Backward-compatible alias: the helper was originally defined under this
# misspelled name, which existing cells still call.
predict_win_pobability = predict_win_probability

In [None]:
# Two hand-picked example decks (eight card ids each) to sanity-check the helper.
good_deck = [
    26000003, 26000014, 26000021, 26000037,
    26000005, 26000041, 28000008, 26000054,
]
bad_deck = [
    27000015, 27000005, 27000001, 26000008,
    26000002, 26000019, 26000012, 28000003,
]

# Print the predicted probability for each deck, good deck first.
for example_deck in (good_deck, bad_deck):
    print(predict_win_pobability(example_deck))