In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import re
from PIL import Image
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
import clip
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from autogluon.features.generators import AutoMLPipelineFeatureGenerator
from autogluon.tabular.models.tabular_nn.torch.tabular_nn_torch import TabularNeuralNetTorchModel
from autogluon.core.constants import MULTICLASS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

In [3]:
torch.backends.cuda.sdp_kernel()

<contextlib._GeneratorContextManager at 0x1cddfc39510>

In [4]:
torch.backends.cuda.enable_flash_sdp(True)

In [5]:
# paths 

# hearthstone
DATA_PATH_HEARTHSTONE_RACE = '../dataset/Hearthstone-Minion-race/' #label: race
DATA_PATH_HEARTHSTONE_CARDCLASS = '../dataset/Hearthstone-All-cardClass/' # label: card class
DATA_PATH_HEARTHSTONE_ALLSET = '../dataset/Hearthstone-All-set/' # label: set
DATA_PATH_HEARTHSTONE_SPELLSCHOOL = '../dataset/Hearthstone-Spell-spellSchool/' # label: set

# pokemon
DATA_PATH_POKEMON_SECONDARY = '../dataset/Pokemon-secondary_type/'

In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, clip_preprocess = clip.load("ViT-B/32", device=device)

# Helper Functions

In [7]:
def preprocess_df(df, id):
    df['text'] = df['text'].fillna('unknown')
    df['artist'] = df['artist'].fillna('unknown')
    df['mechanics'] = df['mechanics'].fillna('unknown')
    if id:
        df['combined_text'] = df['name'].str.lower() + ' ' + df['id'].str.lower()  + ' ' + df['artist'].str.lower()  + ' ' + df['text'].str.lower()  + ' ' + df['mechanics'].str.lower() 
    else:
        df['combined_text'] = df['name'].str.lower() + ' ' + df['artist'].str.lower()  + ' ' + df['text'].str.lower()  + ' ' + df['mechanics'].str.lower() 
    return df

In [8]:
def combine_image_text(texts, images, tab_features, clip_model, clip_preprocess):
    image_text = []
    label_list = []
    tab_features = tab_features.to(device)
    for idx in tqdm(range(len(texts))):
        text = texts[idx]
        image = images[idx]
        text = clip.tokenize(text, truncate=True).to(device)
        image = clip_preprocess(Image.open(image)).unsqueeze(0).to(device)
        tab_feature = tab_features[idx].unsqueeze(0).to(device)
        with torch.no_grad():
            text_features = clip_model.encode_text(text)
            image_features = clip_model.encode_image(image)
        combined_features = torch.cat((text_features, image_features, tab_feature), 1)
        image_text.append(combined_features)
    return image_text

In [9]:
def process_df(df, data_path, tab_features, clip_model, clip_preprocess, id):
    df = preprocess_df(df, id)
    texts = list(df['combined_text'])
    images = [data_path + img for img in list(df['Image Path'])]
    data = combine_image_text(texts, images, tab_features, clip_model, clip_preprocess)
    return data

# Model Training and evaluation

In [10]:
class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features[idx].clone().detach().to(torch.float32)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature, label

In [11]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Model, self).__init__()
        self.fc_1 = nn.Linear(input_size, hidden_size)
        self.fc_2 = nn.Linear(hidden_size, hidden_size)
        self.fc_3 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)
        self.batch_norm_1 = nn.BatchNorm1d(hidden_size)
        self.batch_norm_2 = nn.BatchNorm1d(hidden_size)
        
    def forward(self, x):
        x = x.view(x.size(0), -1) 
        x = self.fc_1(x)
        x = self.batch_norm_1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.fc_2(x)
        x = self.batch_norm_2(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.fc_3(x)
        return x

In [12]:
def train(model, train_loader, dev_loader):
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5, factor=0.5, verbose=True)
    patience = 5
    best_val_loss = np.inf
    early_stop_counter = 0
    for epoch in range(num_epochs):
        train_loss = 0.0
        val_loss = 0.0
        model.train()
        for feature, label in train_loader:
            feature = feature.to(device)
            label = label.to(device)
            output = model(feature)
            loss = criterion(output, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * feature.size(0)
            
        model.eval()
        with torch.no_grad():
            for dev_feature, dev_labels in dev_loader:
                dev_feature = dev_feature.to(device)
                dev_labels = dev_labels.to(device)
                output = model(dev_feature)
                loss = criterion(output, dev_labels)
                val_loss += loss.item() * dev_feature.size(0)
                    
        train_loss /= len(train_loader.dataset)
        val_loss /= len(dev_loader.dataset)
        scheduler.step(val_loss)
        print(f"Epoch [{epoch+1}/{num_epochs}]")
        print(f"  Train Loss: {train_loss:.4f} - Validation Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stop_counter = 0
        else:
            early_stop_counter += 1

        if early_stop_counter >= patience:
            print("Early stopping.")
            break

In [13]:
def evaluate_accuracy(model, test_loader):
    total_accuracy = 0.0
    total_samples = 0
    model.eval()
    with torch.no_grad():
        for feature, label in test_loader:
            feature = feature.to(device)
            label = label.to(device)
            output = model(feature)
            _, predicted = torch.max(output, 1)
            correct = (predicted == label).sum().item()
            total_accuracy += correct
            total_samples += label.size(0)
    accuracy = total_accuracy / total_samples
    print(f"Accuracy: {accuracy:.3f}")

In [14]:
def evaluate_log_loss(model, test_loader, labels):
    model.eval()
    log = 0.0
    with torch.no_grad():
        for feature, label in test_loader:
            feature = feature.to(device)
            label = label.to(device)
            output = model(feature)
            probabilities = F.softmax(output, dim=1)
            log += log_loss(label.cpu().numpy(), probabilities.cpu().numpy(), labels=labels)
    log = log / len(test_loader)
    print(f"Log Loss: {log:.3f}")

In [15]:
def get_tab_features(df, labels):
    auto_ml = AutoMLPipelineFeatureGenerator(
    enable_text_special_features=False, 
    enable_text_ngram_features=False, 
    enable_vision_features=False,
    )
    auto_ml.fit(df)
    #auto_ml.print_feature_metadata_info(log_level=40)

    features = auto_ml.transform(df)

    tab_model = TabularNeuralNetTorchModel()
    tab_model.problem_type = MULTICLASS
    tab_model.quantile_levels = None
    tab_model._set_default_params()
    params = tab_model._get_model_params()
    processor_kwargs, optimizer_kwargs, fit_kwargs, loss_kwargs, params = tab_model._prepare_params(params)
    tab_model._preprocess_set_features(features)
    dataset, _ = tab_model._generate_datasets(features, labels, processor_kwargs)
    return torch.tensor(tab_model.processor.transform(features), dtype=torch.float32)

# Set

In [16]:
df_train = pd.read_csv(DATA_PATH_HEARTHSTONE_ALLSET + "/train.csv")
df_test = pd.read_csv(DATA_PATH_HEARTHSTONE_ALLSET + "/test.csv")
df_dev = pd.read_csv(DATA_PATH_HEARTHSTONE_ALLSET + "/dev.csv")

In [17]:
tab_data_train = df_train[['cardClass', 'health', 'attack', 'cost', 'rarity', 'type', 'collectible', 'spellSchool', 'race', 'durability', 'overload', 'spellDamage']].copy()
tab_data_test = df_train[['cardClass', 'health', 'attack', 'cost', 'rarity', 'type', 'collectible', 'spellSchool', 'race', 'durability', 'overload', 'spellDamage']].copy()
tab_data_dev = df_train[['cardClass', 'health', 'attack', 'cost', 'rarity', 'type', 'collectible', 'spellSchool', 'race', 'durability', 'overload', 'spellDamage']].copy()

In [18]:
le = LabelEncoder()
train_labels = le.fit_transform(df_train['set'])
test_labels = le.transform(df_test['set'])
dev_labels = le.transform(df_dev['set'])

In [19]:
tab_features_train = get_tab_features(tab_data_train, train_labels)
tab_features_test = get_tab_features(tab_data_test, train_labels)
tab_features_dev = get_tab_features(tab_data_dev, train_labels)

In [20]:
train_data = process_df(df_train, DATA_PATH_HEARTHSTONE_ALLSET, tab_features_train, clip_model, clip_preprocess, False)
test_data = process_df(df_test, DATA_PATH_HEARTHSTONE_ALLSET, tab_features_test, clip_model, clip_preprocess, False)
dev_data = process_df(df_dev, DATA_PATH_HEARTHSTONE_ALLSET, tab_features_dev, clip_model, clip_preprocess, False)

100%|██████████████████████████████████████████████████████████████████████████████| 8548/8548 [12:41<00:00, 11.22it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1603/1603 [02:18<00:00, 11.58it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 532/532 [00:53<00:00, 10.02it/s]


In [21]:
# initialized model
input_size = train_data[0].size(1)
hidden_size = 128
num_epochs = 30
num_classes = len(le.classes_)
lr = 0.0002
batch_size = 8
    
model = Model(input_size, hidden_size, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [22]:
# initialize data loader
train_dataset = CustomDataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_data, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
dev_dataset = CustomDataset(dev_data, dev_labels)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

In [23]:
# train model and evaluation
train(model, train_loader, dev_loader)
evaluate_accuracy(model, test_loader)
evaluate_log_loss(model, test_loader, np.unique(test_labels))

Epoch [1/30]
  Train Loss: 3.1638 - Validation Loss: 3.1629
Epoch [2/30]
  Train Loss: 2.7841 - Validation Loss: 2.9769
Epoch [3/30]
  Train Loss: 2.5876 - Validation Loss: 2.8535
Epoch [4/30]
  Train Loss: 2.4450 - Validation Loss: 2.6927
Epoch [5/30]
  Train Loss: 2.3098 - Validation Loss: 2.6129
Epoch [6/30]
  Train Loss: 2.2220 - Validation Loss: 2.5681
Epoch [7/30]
  Train Loss: 2.1579 - Validation Loss: 2.4915
Epoch [8/30]
  Train Loss: 2.0882 - Validation Loss: 2.4267
Epoch [9/30]
  Train Loss: 2.0371 - Validation Loss: 2.4278
Epoch [10/30]
  Train Loss: 2.0003 - Validation Loss: 2.4236
Epoch [11/30]
  Train Loss: 1.9480 - Validation Loss: 2.3686
Epoch [12/30]
  Train Loss: 1.9039 - Validation Loss: 2.4396
Epoch [13/30]
  Train Loss: 1.8616 - Validation Loss: 2.3264
Epoch [14/30]
  Train Loss: 1.8323 - Validation Loss: 2.3489
Epoch [15/30]
  Train Loss: 1.7889 - Validation Loss: 2.3025
Epoch [16/30]
  Train Loss: 1.7726 - Validation Loss: 2.2800
Epoch [17/30]
  Train Loss: 1.785

In [198]:
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, encoding_dim)
        )

        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, 128),
            nn.ReLU(True),
            nn.Linear(128, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [188]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
td = scaler.fit_transform(tab_data)

In [192]:
td = torch.tensor(td, dtype=torch.float32)

In [199]:
input_dim = td.shape[1]
encoding_dim = 32
num_epochs = 50
batch_size = 32

model = AutoEncoder(input_dim, encoding_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [267]:
for epoch in range(num_epochs):
    for i in range(0, len(td), batch_size):
        inputs = td[i:i+batch_size].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()
    print(f"Epoch: {epoch+1}/{num_epochs} - Loss: {loss.item():.4f}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x12 and 1056x128)

In [225]:
encoder = model.encoder.to(device)
tab_features = encoder(td.to(device))
tab_features

tensor([[ 0.8379,  0.1642, -0.7523,  ..., -1.2896,  1.2782,  3.5073],
        [-2.4339,  0.9879,  0.4982,  ..., -2.3031,  0.5396, -0.4554],
        [ 0.0853,  1.5901, -0.7655,  ...,  3.1013, -1.0537,  1.5004],
        ...,
        [-0.2787,  3.8619,  0.1558,  ..., -0.0656,  0.1702,  1.7958],
        [ 0.5740,  1.6517, -0.1291,  ..., -5.8594,  1.3804, -0.7474],
        [-1.1256,  3.7655,  0.2812,  ...,  0.7918,  0.1051,  1.8746]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

# Pokemon secondary type

In [24]:
df_train = pd.read_csv(DATA_PATH_POKEMON_SECONDARY + "/train.csv")
df_test = pd.read_csv(DATA_PATH_POKEMON_SECONDARY + "/test.csv")
df_dev = pd.read_csv(DATA_PATH_POKEMON_SECONDARY + "/dev.csv")

In [25]:
df_train.columns

Index(['name', 'generation', 'status', 'species', 'type_2', 'height_m',
       'weight_kg', 'abilities_number', 'ability_1', 'ability_2',
       'ability_hidden', 'total_points', 'hp', 'attack', 'defense',
       'sp_attack', 'sp_defense', 'speed', 'catch_rate', 'base_friendship',
       'base_experience', 'growth_rate', 'percentage_male', 'Image Path'],
      dtype='object')

In [26]:
tab_data_train = df_train[['generation', 'status', 'height_m', 'weight_kg', 'abilities_number', 'total_points', 'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed', 'catch_rate', 'base_friendship', 'base_experience', 'growth_rate', 'percentage_male']].copy()
tab_data_test = df_train[['generation', 'status', 'height_m', 'weight_kg', 'abilities_number', 'total_points', 'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed', 'catch_rate', 'base_friendship', 'base_experience', 'growth_rate', 'percentage_male']].copy()
tab_data_dev = df_train[['generation', 'status', 'height_m', 'weight_kg', 'abilities_number', 'total_points', 'hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed', 'catch_rate', 'base_friendship', 'base_experience', 'growth_rate', 'percentage_male']].copy()

In [27]:
le = LabelEncoder()
train_labels = le.fit_transform(df_train['type_2'])
test_labels = le.transform(df_test['type_2'])
dev_labels = le.transform(df_dev['type_2'])

In [28]:
tab_features_train = get_tab_features(tab_data_train, train_labels)
tab_features_test = get_tab_features(tab_data_test, train_labels)
tab_features_dev = get_tab_features(tab_data_dev, train_labels)

In [29]:
df_train['combined'] = df_train['name'] + ' ' + df_train['species']  + ' ' + df_train['ability_1'] + ' ' + df_train['ability_2'] + ' ' + df_train['ability_hidden']
df_test['combined'] = df_train['name'] + ' ' + df_train['species']  + ' ' + df_train['ability_1'] + ' ' + df_train['ability_2'] + ' ' + df_train['ability_hidden']
df_dev['combined'] = df_train['name'] + ' ' + df_train['species']  + ' ' + df_train['ability_1'] + ' ' + df_train['ability_2'] + ' ' + df_train['ability_hidden']

In [30]:
texts_train = list(df_train['combined'])
images_train = [DATA_PATH_POKEMON_SECONDARY + img for img in list(df_train['Image Path'])]
texts_test = list(df_test['combined'])
images_test = [DATA_PATH_POKEMON_SECONDARY + img for img in list(df_test['Image Path'])]
texts_dev = list(df_dev['combined'])
images_dev = [DATA_PATH_POKEMON_SECONDARY + img for img in list(df_test['Image Path'])]

In [31]:
train_data = combine_image_text(texts_train, images_train, tab_features_train, clip_model, clip_preprocess)
test_data = combine_image_text(texts_test, images_test, tab_features_test, clip_model, clip_preprocess)
dev_data = combine_image_text(texts_dev, images_dev, tab_features_dev, clip_model, clip_preprocess)

100%|████████████████████████████████████████████████████████████████████████████████| 719/719 [01:07<00:00, 10.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 133/133 [00:11<00:00, 11.17it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [00:03<00:00, 11.97it/s]


In [32]:
# initialized model
input_size = train_data[0].size(1)
hidden_size = 128
num_epochs = 30
num_classes = len(le.classes_)
lr = 0.00003
batch_size = 8
    
model = Model(input_size, hidden_size, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [33]:
# initialize data loader
train_dataset = CustomDataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_data, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
dev_dataset = CustomDataset(dev_data, dev_labels)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

In [34]:
# train model and evaluation
train(model, train_loader, dev_loader)
evaluate_accuracy(model, test_loader)
evaluate_log_loss(model, test_loader, np.unique(test_labels))

Epoch [1/30]
  Train Loss: 3.1404 - Validation Loss: 3.0287
Epoch [2/30]
  Train Loss: 2.9921 - Validation Loss: 2.9797
Epoch [3/30]
  Train Loss: 2.9318 - Validation Loss: 2.9519
Epoch [4/30]
  Train Loss: 2.8017 - Validation Loss: 2.8913
Epoch [5/30]
  Train Loss: 2.7166 - Validation Loss: 2.8460
Epoch [6/30]
  Train Loss: 2.6257 - Validation Loss: 2.8328
Epoch [7/30]
  Train Loss: 2.5312 - Validation Loss: 2.7861
Epoch [8/30]
  Train Loss: 2.4804 - Validation Loss: 2.7527
Epoch [9/30]
  Train Loss: 2.3750 - Validation Loss: 2.7347
Epoch [10/30]
  Train Loss: 2.3077 - Validation Loss: 2.6979
Epoch [11/30]
  Train Loss: 2.2343 - Validation Loss: 2.7142
Epoch [12/30]
  Train Loss: 2.1562 - Validation Loss: 2.6674
Epoch [13/30]
  Train Loss: 2.0867 - Validation Loss: 2.6609
Epoch [14/30]
  Train Loss: 2.0472 - Validation Loss: 2.6344
Epoch [15/30]
  Train Loss: 1.9929 - Validation Loss: 2.6308
Epoch [16/30]
  Train Loss: 1.9688 - Validation Loss: 2.6274
Epoch [17/30]
  Train Loss: 1.882

# Race

In [89]:
# load data
df_train = pd.read_csv(DATA_PATH_HEARTHSTONE_RACE + "/train.csv")
df_test = pd.read_csv(DATA_PATH_HEARTHSTONE_RACE + "/test.csv")
df_dev = pd.read_csv(DATA_PATH_HEARTHSTONE_RACE + "/dev.csv")

In [90]:
tab_data_train = df_train[['cardClass', 'health', 'attack', 'cost', 'rarity', 'set', 'collectible']].copy()
tab_data_test = df_train[['cardClass', 'health', 'attack', 'cost', 'rarity', 'set', 'collectible']].copy()
tab_data_dev = df_train[['cardClass', 'health', 'attack', 'cost', 'rarity', 'set', 'collectible']].copy()

In [93]:
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(df_train['race'])
test_labels = label_encoder.transform(df_test['race'])
dev_labels = label_encoder.transform(df_dev['race'])

In [94]:
tab_features_train = get_tab_features(tab_data_train, train_labels)
tab_features_test = get_tab_features(tab_data_test, train_labels)
tab_features_dev = get_tab_features(tab_data_dev, train_labels)

In [97]:
train_data = process_df(df_train, DATA_PATH_HEARTHSTONE_RACE, tab_features_train, clip_model, clip_preprocess, True)
test_data = process_df(df_test, DATA_PATH_HEARTHSTONE_RACE, tab_features_test, clip_model, clip_preprocess, True)
dev_data = process_df(df_dev, DATA_PATH_HEARTHSTONE_RACE, tab_features_dev, clip_model, clip_preprocess, True)

100%|██████████████████████████████████████████████████████████████████████████████| 5398/5398 [07:18<00:00, 12.30it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1012/1012 [01:22<00:00, 12.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 337/337 [00:27<00:00, 12.32it/s]


In [98]:
# initialized model
input_size = train_data[0].size(1)
hidden_size = 128
num_epochs = 20
num_classes = len(label_encoder.classes_)
lr = 0.0002
batch_size = 16
    
model = Model(input_size, hidden_size, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [99]:
# initialize data loader
train_dataset = CustomDataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_data, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
dev_dataset = CustomDataset(dev_data, dev_labels)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

In [100]:
# train model and evaluation
train(model, train_loader, dev_loader)
evaluate_accuracy(model, test_loader)
evaluate_log_loss(model, test_loader, np.unique(test_labels))

Epoch [1/20]
  Train Loss: 1.7962 - Validation Loss: 1.4458
Epoch [2/20]
  Train Loss: 1.1733 - Validation Loss: 1.1246
Epoch [3/20]
  Train Loss: 0.9680 - Validation Loss: 0.9347
Epoch [4/20]
  Train Loss: 0.8458 - Validation Loss: 0.8620
Epoch [5/20]
  Train Loss: 0.7260 - Validation Loss: 0.7612
Epoch [6/20]
  Train Loss: 0.6425 - Validation Loss: 0.7364
Epoch [7/20]
  Train Loss: 0.5781 - Validation Loss: 0.6983
Epoch [8/20]
  Train Loss: 0.5305 - Validation Loss: 0.6417
Epoch [9/20]
  Train Loss: 0.4793 - Validation Loss: 0.6202
Epoch [10/20]
  Train Loss: 0.4579 - Validation Loss: 0.6092
Epoch [11/20]
  Train Loss: 0.4144 - Validation Loss: 0.5714
Epoch [12/20]
  Train Loss: 0.3907 - Validation Loss: 0.5621
Epoch [13/20]
  Train Loss: 0.3579 - Validation Loss: 0.5595
Epoch [14/20]
  Train Loss: 0.3387 - Validation Loss: 0.5740
Epoch [15/20]
  Train Loss: 0.3173 - Validation Loss: 0.5794
Epoch [16/20]
  Train Loss: 0.2902 - Validation Loss: 0.5752
Epoch [17/20]
  Train Loss: 0.279



0.4686131321889779

# Card Class

In [68]:
df_train = pd.read_csv(DATA_PATH_HEARTHSTONE_CARDCLASS + "/train.csv")
df_test = pd.read_csv(DATA_PATH_HEARTHSTONE_CARDCLASS + "/test.csv")
df_dev = pd.read_csv(DATA_PATH_HEARTHSTONE_CARDCLASS + "/dev.csv")

In [69]:
label_encoder = LabelEncoder()

In [70]:
train_data = process_df(df_train, DATA_PATH_HEARTHSTONE_CARDCLASS, clip_model, clip_preprocess, True)
train_labels = label_encoder.fit_transform(df_train['cardClass'])
test_data = process_df(df_test, DATA_PATH_HEARTHSTONE_CARDCLASS, clip_model, clip_preprocess, True)
test_labels = label_encoder.fit_transform(df_test['cardClass'])
dev_data = process_df(df_dev, DATA_PATH_HEARTHSTONE_CARDCLASS, clip_model, clip_preprocess, True)
dev_labels = label_encoder.fit_transform(df_dev['cardClass'])

100%|██████████████████████████████████████████████████████████████████████████████| 8561/8561 [08:51<00:00, 16.10it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1603/1603 [01:34<00:00, 16.96it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 536/536 [00:32<00:00, 16.51it/s]


In [71]:
class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
class_mapping

{'DEATHKNIGHT': 0,
 'DEMONHUNTER': 1,
 'DRUID': 2,
 'HUNTER': 3,
 'MAGE': 4,
 'NEUTRAL': 5,
 'NONE_cardClass': 6,
 'PALADIN': 7,
 'PRIEST': 8,
 'ROGUE': 9,
 'SHAMAN': 10,
 'WARLOCK': 11,
 'WARRIOR': 12}

In [72]:
# initialized model
input_size = train_data[0].size(1)
hidden_size = 128
num_epochs = 20
num_classes = len(label_encoder.classes_)
lr = 0.0002
batch_size = 16
    
model = Model(input_size, hidden_size, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [73]:
# initialize data loader
train_dataset = CustomDataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_data, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
dev_dataset = CustomDataset(dev_data, dev_labels)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

In [74]:
# train model and evaluation
train(model, train_loader, dev_loader)
evaluate_accuracy(model, test_loader)
evaluate_log_loss(model, test_loader)

Epoch [1/20]
  Train Loss: 1.9485 - Validation Loss: 1.7714
Epoch [2/20]
  Train Loss: 1.7056 - Validation Loss: 1.5524
Epoch [3/20]
  Train Loss: 1.5532 - Validation Loss: 1.4228
Epoch [4/20]
  Train Loss: 1.4379 - Validation Loss: 1.3060
Epoch [5/20]
  Train Loss: 1.3421 - Validation Loss: 1.2318
Epoch [6/20]
  Train Loss: 1.2709 - Validation Loss: 1.1834
Epoch [7/20]
  Train Loss: 1.2020 - Validation Loss: 1.1418
Epoch [8/20]
  Train Loss: 1.1479 - Validation Loss: 1.1007
Epoch [9/20]
  Train Loss: 1.1073 - Validation Loss: 1.0693
Epoch [10/20]
  Train Loss: 1.0538 - Validation Loss: 1.0493
Epoch [11/20]
  Train Loss: 1.0113 - Validation Loss: 1.0362
Epoch [12/20]
  Train Loss: 0.9733 - Validation Loss: 1.0018
Epoch [13/20]
  Train Loss: 0.9300 - Validation Loss: 0.9919
Epoch [14/20]
  Train Loss: 0.8939 - Validation Loss: 0.9684
Epoch [15/20]
  Train Loss: 0.8541 - Validation Loss: 0.9799
Epoch [16/20]
  Train Loss: 0.8358 - Validation Loss: 0.9913
Epoch [17/20]
  Train Loss: 0.807

# Spellschool

In [75]:
df_train = pd.read_csv(DATA_PATH_HEARTHSTONE_SPELLSCHOOL + "/train.csv")
df_test = pd.read_csv(DATA_PATH_HEARTHSTONE_SPELLSCHOOL + "/test.csv")
df_dev = pd.read_csv(DATA_PATH_HEARTHSTONE_SPELLSCHOOL + "/dev.csv")

In [76]:
label_encoder = LabelEncoder()

In [77]:
train_data = process_df(df_train, DATA_PATH_HEARTHSTONE_SPELLSCHOOL, clip_model, clip_preprocess, True)
train_labels = label_encoder.fit_transform(df_train['spellSchool'])
test_data = process_df(df_test, DATA_PATH_HEARTHSTONE_SPELLSCHOOL, clip_model, clip_preprocess, True)
test_labels = label_encoder.fit_transform(df_test['spellSchool'])
dev_data = process_df(df_dev, DATA_PATH_HEARTHSTONE_SPELLSCHOOL, clip_model, clip_preprocess, True)
dev_labels = label_encoder.fit_transform(df_dev['spellSchool'])

100%|██████████████████████████████████████████████████████████████████████████████| 2715/2715 [02:43<00:00, 16.65it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 508/508 [00:30<00:00, 16.70it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 170/170 [00:10<00:00, 16.39it/s]


In [78]:
class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
class_mapping

{'ARCANE': 0,
 'FEL': 1,
 'FIRE': 2,
 'FROST': 3,
 'HOLY': 4,
 'NATURE': 5,
 'NONE_spellSchool': 6,
 'SHADOW': 7}

In [327]:
test_labels[:100]

array([29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
       29, 29, 29, 29, 29, 29,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 30, 30, 30, 30, 30,
       30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30])

In [79]:
# initialized model
input_size = train_data[0].size(1)
hidden_size = 128
num_epochs = 20
num_classes = len(label_encoder.classes_)
lr = 0.0002
batch_size = 16
    
model = Model(input_size, hidden_size, num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [80]:
# initialize data loader
train_dataset = CustomDataset(train_data, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_data, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
    
dev_dataset = CustomDataset(dev_data, dev_labels)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

In [81]:
# train model and evaluation
train(model, train_loader, dev_loader)
evaluate_accuracy(model, test_loader)
evaluate_log_loss(model, test_loader)

Epoch [1/20]
  Train Loss: 1.3241 - Validation Loss: 1.1467
Epoch [2/20]
  Train Loss: 1.1155 - Validation Loss: 1.0641
Epoch [3/20]
  Train Loss: 1.0132 - Validation Loss: 0.9729
Epoch [4/20]
  Train Loss: 0.9169 - Validation Loss: 0.8972
Epoch [5/20]
  Train Loss: 0.8298 - Validation Loss: 0.8168
Epoch [6/20]
  Train Loss: 0.7530 - Validation Loss: 0.8205
Epoch [7/20]
  Train Loss: 0.6885 - Validation Loss: 0.7211
Epoch [8/20]
  Train Loss: 0.6185 - Validation Loss: 0.6820
Epoch [9/20]
  Train Loss: 0.5871 - Validation Loss: 0.6805
Epoch [10/20]
  Train Loss: 0.5267 - Validation Loss: 0.6220
Epoch [11/20]
  Train Loss: 0.4939 - Validation Loss: 0.6016
Epoch [12/20]
  Train Loss: 0.4635 - Validation Loss: 0.5671
Epoch [13/20]
  Train Loss: 0.4389 - Validation Loss: 0.5519
Epoch [14/20]
  Train Loss: 0.4091 - Validation Loss: 0.5563
Epoch [15/20]
  Train Loss: 0.3669 - Validation Loss: 0.5244
Epoch [16/20]
  Train Loss: 0.3503 - Validation Loss: 0.5503
Epoch [17/20]
  Train Loss: 0.332

# autogluon

In [72]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [77]:
train = TabularDataset(f'{DATA_PATH_HEARTHSTONE_RACE}/train.csv')
label = 'race'

In [78]:
predictor = TabularPredictor(label=label).fit(train)

No path specified. Models will be saved in: "AutogluonModels\ag-20240429_150521"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240429_150521"
AutoGluon Version:  1.1.0
Python Version:     3.11.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Memory Avail:       22.86 GB / 31.95 GB (71.5%)
Disk Space Avail:   226

In [79]:
y_pred = predictor.predict(df_test.drop(columns=[label]))

In [80]:
predictor.evaluate(df_test, silent=True)

{'accuracy': 0.7766798418972332,
 'balanced_accuracy': 0.6358084823002375,
 'mcc': 0.6556862418092518}