In [1]:
import os
import sys
import random
import datetime
import itertools
import numpy as np
import pandas as pd
from zipUtil import zip_write, zip_read
from collections import defaultdict

In [2]:
from dataset_config import decode_type, PROTOSS_ACTIONS, PROTOSS_ACTIONS_TYPE, TERRAN_ACTIONS, TERRAN_ACTIONS_TYPE, ZERG_ACTIONS, ZERG_ACTIONS_TYPE
ACTIONS = {'P': PROTOSS_ACTIONS, 'T': TERRAN_ACTIONS, 'Z': ZERG_ACTIONS}
rev_ACTIONS = {race: {v:k for k,v in ACTIONS[race].items()} for race in 'PTZ'}
ACTIONS_TYPE = {'P': PROTOSS_ACTIONS_TYPE, 'T': TERRAN_ACTIONS_TYPE, 'Z': ZERG_ACTIONS_TYPE}
GATHERER_NAMES = {'P': 'Probe', 'T': 'SCV', 'Z': 'Drone'}

In [3]:
# read the dataset dictionaries
con_dfs = zip_read('dataframes_continuous')
dis_dfs = zip_read('dataframes_discrete')
target_dfs = zip_read('dataframes_target')

## Dataset Cleanup

There are a few 0-valued constant columns we can simply drop (no info) as well as a sanity check for 'incorrect' labels. Finally, some erronously 'continuous' features are moved to the discrete dataset.

In [4]:
# Dataset cleanup
for race in "PTZ":
    # Clean up some useless columns
    try:
        con_dfs[race] = con_dfs[race].drop(columns=['vespene_queued_economic','vespene_total_economic','vespene_value_current_economic'])
    except:
        pass
    # Clean up incorrectly labeled columns
    incorrect_labels = target_dfs[race][~target_dfs[race]['Target'].isin(rev_ACTIONS[race])].index.tolist()
    if (len(incorrect_labels) > 0):
        print(f"{race} dropping {len(incorrect_labels)} rows due to incorrect labelling")
        print(target_dfs[race]['Target'][incorrect_labels])
        con_dfs[race] = con_dfs[race].drop(incorrect_labels)
        dis_dfs[race] = dis_dfs[race].drop(incorrect_labels)
        target_dfs[race] = target_dfs[race].drop(incorrect_labels)
    # Move ['Timestamp','supply_available','supply_consumed'] fields from continuous dataset into discrete dataset
    discrete_features = ['Timestamp','supply_available','supply_consumed','workers_active']
    dis_dfs[race] = pd.concat([dis_dfs[race],con_dfs[race][discrete_features]],axis=1)
    con_dfs[race] = con_dfs[race].drop(columns=discrete_features)

## Side-Task: Building Predictions ONLY

As a large portion of the labels are made up of Worker units and Army units, I wanted to poke around with a limited dataset focused on next BUILDING predictions only.

**Setting the `BUILDINGS_ONLY` flag to True and re-running the notebook results in predictions on the limited BUILDINGS only label subset of our dataset**

In [5]:
# Remove Worker and Army info from dataset
BUILDINGS_ONLY = False
if (BUILDINGS_ONLY):
    for race in "PTZ":
        army_worker_bits = set([0b010000,0b000100])
        army_worker_ids = set()
        for key,name in rev_ACTIONS[race].items():
            if (ACTIONS_TYPE[race][name] in army_worker_bits):
                army_worker_ids.add(key)
        army_worker_rows = target_dfs[race][target_dfs[race]['Target'].isin(army_worker_ids)].index.tolist()
        if (len(army_worker_rows) > 0):
            print(f"{race} dropping {len(army_worker_rows)} rows describing Worker or Army_Units")
            con_dfs[race] = con_dfs[race].drop(army_worker_rows)
            dis_dfs[race] = dis_dfs[race].drop(army_worker_rows)
            target_dfs[race] = target_dfs[race].drop(army_worker_rows)
        # Drop army features columns from dis_dfs
        dropped_columns = []
        for name in ACTIONS[race].keys():
            if (ACTIONS_TYPE[race][name] in army_worker_bits):
                dropped_columns.append(name)
        dis_dfs[race] = dis_dfs[race].drop(columns=dropped_columns)

## Imports and Helper Functions

`map_to_action_type`: remaps labels from fine-grained to coarse-grained.

`mix_naive_bayes`: helper function to multiplicatively mix prediction probabilities as per independence assumption from different modalities.

`score_individual`: helper function to examine performance on a label-by-label basis rather than aggregate

`top_k_score`: helper function to use model to make a top-k prediction for matching against the labels instead of top-1 (default)

`dataset_processing`: helper function to split dataset into Training/Testing data (80/20)

In [6]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KernelDensity
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic

In [7]:
# Remap fine-grained action labels into broader action-class labels (as defined in dataset_config.py)
# 0b[UABETD] 6-bit encoding
# U: is_upgrade?
# A: is_army?
# B: is_building?
# E: for_economy?
# T: for_tech?
# D: for_defense? (static defenses only)
def map_to_action_type(Y_labels,race):
    return np.array([ACTIONS_TYPE[race][rev_ACTIONS[race][lbl]] for lbl in Y_labels])
    # Truncated bits
    #return np.array([ACTIONS_TYPE[race][rev_ACTIONS[race][lbl]]&(7<<3) for lbl in Y_labels])

In [8]:
# Dual-modality NB mixture model
class MixedNB(BaseEstimator, ClassifierMixin):
    def __init__(self, clf_con, clf_dis):
        self.clf_con = clf_con
        self.clf_dis = clf_dis
        
    def fit(self, X1, X2, y):
        self.clf_con.fit(X1,y)
        self.clf_dis.fit(X2,y)
        self.classes_ = self.clf_con.classes_
    
    def predict_proba(self, X1, X2):
        return self.clf_con.predict_proba(X1)*self.clf_dis.predict_proba(X2)
        
    def predict(self, X1, X2):
        return self.classes_[np.argmax(self.predict_proba(X1,X2),1)]
        
    def score(self, X1, X2, y):
        return np.mean(self.predict(X1,X2) == y)

In [9]:
#https://jakevdp.github.io/PythonDataScienceHandbook/05.13-kernel-density-estimation.html#Example:-Not-So-Naive-Bayes
class KDEClassifier(BaseEstimator, ClassifierMixin):
    """Bayesian generative classification based on KDE
    
    Parameters
    ----------
    bandwidth : float
        the kernel bandwidth within each class
    kernel : str
        the kernel name, passed to KernelDensity
    """
    def __init__(self, bandwidth=1.0, kernel='gaussian'):
        self.bandwidth = bandwidth
        self.kernel = kernel
        
    def fit(self, X, y):
        self.classes_ = np.sort(np.unique(y))
        training_sets = [X[y == yi] for yi in self.classes_]
        self.models_ = [KernelDensity(bandwidth=self.bandwidth,
                                      kernel=self.kernel).fit(Xi)
                        for Xi in training_sets]
        self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0])
                           for Xi in training_sets]
        return self
        
    def predict_proba(self, X):
        logprobs = np.array([model.score_samples(X)
                             for model in self.models_]).T
        result = np.exp(logprobs + self.logpriors_)
        return result / result.sum(1, keepdims=True)
        
    def predict(self, X):
        return self.classes_[np.argmax(self.predict_proba(X), 1)]

In [10]:
# Helper function to independently score each label (to see which labels we do poorly on)
# when passing scored matches array, set `match_literal` to True
def score_individual(p_x,y,mapping=None,match_literal=False):
    matches = np.array(p_x == y) if not match_literal else p_x
    classes = list(set(y))
    indiv_scores = []
    for c in classes:
        idx = np.where(y == c)
        score = np.mean(matches[idx]) # accuracy
        num_matches = np.sum(matches[idx]) # total correct prediction count
        p_accuracy = None
        if (not match_literal):
            num_label_predictions = len(np.where(p_x == c)[0]) # total predictions made with this label
            p_accuracy = num_matches/num_label_predictions if num_label_predictions > 0 else None # what % accuracy when we actually predict this label
        c_size = len(idx[0]) # number of such labels in y
        c_magnitude = c_size/len(y) # proportion of label class c in y
        indiv_scores.append((mapping(c) if mapping is not None else c,score,c_size,c_magnitude,p_accuracy))
    # Column Labels
    indiv_cols = ['Label','Accuracy','Label_Frequency','Percentage_Frequency','Label_Prediction_Accuracy']
    # Data per column sorted by decreasing accuracy
    indiv_scores = sorted(indiv_scores,key=lambda x: x[1],reverse=True)
    overall_score = np.mean(matches)
    return indiv_scores,indiv_cols,overall_score

In [11]:
# Top-k prediction accuracy
# x: given as [] array for compatibility with MixedNB
def top_k_score(clf,x,y,k=3):
    prediction_proba = clf.predict_proba(*x)
    top_k_labels = []
    top_label = []
    # Sort and select top k labels
    for row in prediction_proba:
        sorted_proba = sorted([(s,i) for i,s in enumerate(row)],reverse=True)[:k]
        top_k_labels.append(set([clf.classes_[i] for s,i in sorted_proba]))
        top_label.append(clf.classes_[sorted_proba[0][1]])
    # Score by membership
    matches = np.zeros(len(y))
    for i in range(len(y)):
        if (y[i] in top_k_labels[i]):
            matches[i] = 1
    score = np.mean(matches)
    return score, matches, top_label

In [12]:
# Process dataset into Train/Test 80/20 split
def dataset_processing(con_dfs, dis_dfs, target_dfs, race, con_features=None, dis_features=None, normalize=True):
    # Selected features only
    if (con_features is not None):
        con_dfs[race] = con_dfs[race][con_features]
    if (dis_features is not None):
        dis_dfs[race] = dis_dfs[race][dis_features]

#     # Full dataset
#     X_con = con_dfs[race]
#     X_dis = dis_dfs[race]
    Y = target_dfs[race]['Target'].to_numpy()

    # Combining the two modalities
    full_index = target_dfs[race].index.tolist()

    # Shuffle into random order
    random.seed(0)
    random.shuffle(full_index)

    # Cut into 75/5/20 split (Training,Validation,Test)
    cutoff_train = int(0.75*len(full_index))
    cutoff_valid = int(0.80*len(full_index))
    train_index = full_index[:cutoff_train]
    valid_index = full_index[cutoff_train:cutoff_valid]
    test_index = full_index[cutoff_valid:]

    # Individuated Continuous/Discrete Features
    X_train_con, X_train_dis = con_dfs[race].loc[train_index], dis_dfs[race].loc[train_index]
    X_valid_con, X_valid_dis = con_dfs[race].loc[valid_index], dis_dfs[race].loc[valid_index]
    X_test_con, X_test_dis = con_dfs[race].loc[test_index], dis_dfs[race].loc[test_index]
    # Aggregated Features
    X_train_all = pd.concat([X_train_con,X_train_dis],axis=1)
    X_valid_all = pd.concat([X_valid_con,X_valid_dis],axis=1)
    X_test_all = pd.concat([X_test_con,X_test_dis],axis=1)

    # Normalize features
    if (normalize):
        con_scaler = MinMaxScaler()
        X_train_con = pd.DataFrame(con_scaler.fit_transform(X_train_con),columns=list(X_train_con))
        X_valid_con = pd.DataFrame(con_scaler.transform(X_valid_con),columns=list(X_valid_con))
        X_test_con = pd.DataFrame(con_scaler.transform(X_test_con),columns=list(X_test_con))
#         dis_scaler = MinMaxScaler()
#         X_train_dis = dis_scaler.fit_transform(X_train_dis)
#         X_test_dis = dis_scaler.transform(X_test_dis)

    # Transform target labels into numpy array
    Y_train = target_dfs[race].loc[train_index]
    Y_valid = target_dfs[race].loc[valid_index]
    Y_test = target_dfs[race].loc[test_index]
    Y_train = Y_train['Target'].to_numpy()
    Y_valid = Y_valid['Target'].to_numpy()
    Y_test = Y_test['Target'].to_numpy()

    # Remap to coarse label classes
    Y_actions = map_to_action_type(Y,race)
    Y_train_types = map_to_action_type(Y_train,race)
    Y_valid_types = map_to_action_type(Y_valid,race)
    Y_test_types = map_to_action_type(Y_test,race)
    
    X_con = (X_train_con,X_valid_con,X_test_con)
    X_dis = (X_train_dis,X_valid_dis,X_test_dis)
    X_all = (X_train_all,X_valid_all,X_test_all)
    Y_fine = (Y_train,Y_valid,Y_test)
    Y_types = (Y_train_types,Y_valid_types,Y_test_types)
    
    fine_frequencies = sorted([np.sum(Y==yi)/len(Y) for yi in np.unique(Y)],reverse=True)
    print(fine_frequencies[0],sum(fine_frequencies[:3]))
    coarse_frequencies = sorted([np.sum(Y_actions==yi)/len(Y_actions) for yi in np.unique(Y_actions)],reverse=True)
    print(coarse_frequencies[0],sum(coarse_frequencies[:3]))
    
    return X_con, X_dis, X_all, Y_actions, Y_fine, Y_types

## Data Summary (Run after all 3 races cells)

Aggregate results into a json object for further visualization & analysis.

In [13]:
# Aggregate Summary Information
summary = {
    race: {
        # Coarse | Fine
        "Coarse": {
            "baseline": {},
            # Gaussian | Multinomial | Complement
            "continuous": {},
            # Gaussian | Multinomial | Complement
            "discrete": {},
            # Multinomial | Complement
            "all": {},
            # Gaussian & Complement | Multinomial & Multinomial
            "mixture": {},
        },
        "Fine": {
            "baseline": {},
            # Gaussian | Multinomial | Complement
            "continuous": {},
            # Gaussian | Multinomial | Complement
            "discrete": {},
            # Multinomial | Complement
            "all": {},
            # Gaussian & Complement | Multinomial & Multinomial
            "mixture": {},
        }
    } \
    for race in "PTZ"
}

# keep a copy of the data for replicability
data = {
    race: {} for race in "PTZ"
}

In [14]:
import json
# print(json.dumps(
#     summary,
#     indent=4,
#     separators=(',', ': ')
# ))
with open('naive_bayes.json', 'w+') as fout:
    fout.write(json.dumps(summary,indent=4,separators=(',',': ')))
    
zip_write('naive_bayes_dataset',data)

# Naive Bayes Approach

Below, I explore the Naive Bayes approach with the `sklearn.naive_bayes` package, focusing on using three likelihoods, Gaussian, Multinomial, and 'Complement'. The labels in our dataset are further broken down to Coarse/Fine-grained labels as described in the `dataset_config.py` file.

We first run ANOVA analysis on the large set of features to identify pertinent ones, select the top k (variable) before running classification. Note that 'Timestamp', 'supply_available', and 'supply_consumed' are in fact integer-valued and moved from the continuous feature columns to the discrete feature columns.

Furthermore, we also benchmark against Random Forest Classifier and AdaBoost Classifier.

Comparison is finally made between Naive_Bayes (continuous, discrete, mixture) and KDE. KDE outperforms all other models including both the Random Forest Classifier and AdaBoost Classifier.

In [15]:
# Control Variables
race = 'P' # Change to run for a different race
k_con_features = 10
k_dis_features = 30

In [16]:
print("===== CONTINUOUS =====")
X, y = con_dfs[race], target_dfs[race]['Target'].values.tolist()
X_new = SelectKBest().fit(X,y)
feature_names = list(con_dfs[race])
feature_scores = sorted([(feature_names[i], X_new.scores_[i]) for i in range(len(feature_names))],key=lambda x:x[1],reverse=True)
for name, score in feature_scores[:k_con_features]:
    print(f"{name}: {score:.3f}")
selected_con_features = [name for name,score in feature_scores[:k_con_features]]
print("===== DISCRETE =====")
X, y = dis_dfs[race], target_dfs[race]['Target'].values.tolist()
X_new = SelectKBest().fit(X,y)
feature_names = list(X)
feature_scores = sorted([(feature_names[i], X_new.scores_[i]) for i in range(len(feature_names))],key=lambda x:x[1],reverse=True)
for name, score in feature_scores[:k_dis_features]:
    print(f"{name}: {score:.3f}")
selected_dis_features = [name for name,score in feature_scores[:k_dis_features]]

===== CONTINUOUS =====
mineral_value_current_technology: 777.047
mineral_total_technology: 776.617
mineral_spend: 767.711
mineral_value_current_economic: 734.693
mineral_total_economic: 705.477
mineral_value_current_army: 688.730
mineral_total_army: 687.209
mineral_collection_rate: 639.224
worker_supply_ratio: 626.728
vespene_spend: 624.743
===== DISCRETE =====
Gateway: 864.500
Pylon: 817.839
Timestamp: 785.746
supply_available: 773.011
supply_consumed: 704.380
Charge: 651.725
workers_active: 648.809
Probe: 635.693
Assimilator: 606.976
TemplarArchive: 598.791
Zealot: 534.771
RoboticsFacility: 522.920
Nexus: 513.238
Forge: 449.942
ProtossGroundWeaponsLevel1: 446.409
TwilightCouncil: 423.232
HighTemplar: 401.186
Warp Gate: 386.556
ShieldBattery: 340.888
Stalker: 339.126
WarpPrism: 330.464
Archon: 321.299
Observer: 308.140
Sentry: 299.211
CyberneticsCore: 297.268
RoboticsBay: 276.463
Immortal: 276.271
Blink: 273.105
PhotonCannon: 270.274
ProtossGroundWeaponsLevel2: 251.672


In [17]:
# Grab Train/Test data split
X_con, X_dis, X_all, Y_actions, Y, Y_types = dataset_processing(con_dfs, dis_dfs, target_dfs, race, con_features=selected_con_features, dis_features=selected_dis_features)
X_train_con, X_valid_con, X_test_con = X_con
X_train_dis, X_valid_dis, X_test_dis = X_dis
X_train_all, X_valid_all, X_test_all = X_all
Y_train, Y_valid, Y_test = Y
Y_train_types, Y_valid_types, Y_test_types = Y_types

# Keep a copy of the data for replicability
data[race]["X_train_con"], data[race]["X_valid_con"], data[race]["X_test_con"] = X_con
data[race]["X_train_dis"], data[race]["X_valid_dis"], data[race]["X_test_dis"] = X_dis
data[race]["X_train_all"], data[race]["X_valid_all"], data[race]["X_test_all"] = X_all
data[race]["Y_train"], data[race]["Y_valid"], data[race]["Y_test"] = Y
data[race]["Y_train_types"], data[race]["Y_valid_types"], data[race]["Y_test_types"] = Y

0.39938894344750964 0.5864059756569354
0.39938894344750964 0.8377228223670505


In [18]:
# Some prior occurance stats for each label type
lbl_count = defaultdict(int)
for i in Y_actions:
    lbl_count[i] += 1
lbl_stats = sorted([(lbl_count[i]/len(Y_actions),i) for i in lbl_count.keys()],reverse=True)
for ratio,lbl in lbl_stats:
    decode_type(lbl)
    print(f"\t{ratio:.3f}, {lbl_count[lbl]}")

000100
	Worker
	0.399, 40262
010000
	Army
	0.276, 27789
001100
	Economy
	Building
	0.163, 16399
011000
	Building
	Army
	0.063, 6316
001001
	Static Defense
	Building
	0.041, 4091
001010
	Technology
	Building
	0.035, 3513
100000
	Upgrade
	0.024, 2439


In [19]:
# Kernel Density Estimate
kde_sample = X_train_con.loc[random.sample(X_train_con.index.tolist(),1000)]
for k_name in ['gaussian','tophat','exponential','linear']:
    kde = KernelDensity(kernel=k_name, bandwidth=0.3).fit(kde_sample)
    print(k_name, kde.score(kde_sample))

gaussian 1380.8487709370697
tophat 9281.851080421697
exponential -5716.98157483753
linear 10866.124028567112


In [20]:
# Random Forest classifier (control?)
print("Random Forest Classification")
rand_forest_perf = [defaultdict(list),defaultdict(list)]
for i in range(2,3):
    top_k = 3
    print(f"max_depth: {i}")
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    
    # Fit on Coarse labels
    clf.fit(X_train_all,Y_train_types)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test_types,k=top_k)
    coarse_score = np.mean(top_label == Y_test_types)
    print("Coarse:",coarse_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    rand_forest_perf[0]["Score"].append(coarse_score)
    rand_forest_perf[0][f"Top-{top_k}"].append(top_k_s)
    
    # Fit on Fine labels
    clf.fit(X_train_all,Y_train)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test,k=top_k)
    fine_score = np.mean(top_label == Y_test)
    print("Fine:",fine_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    rand_forest_perf[1]["Score"].append(fine_score)
    rand_forest_perf[1][f"Top-{top_k}"].append(top_k_s)
    
summary[race]["Coarse"]["baseline"][f"{clf}"] = rand_forest_perf[0]
summary[race]["Fine"]["baseline"][f"{clf}"] = rand_forest_perf[1]

Random Forest Classification
max_depth: 2
Coarse: 0.49895843666302947
	Top-3: 0.8488245213768475
Fine: 0.4115167146116457
	Top-3: 0.6260291637734352


In [21]:
# AdaBoost Classifier
print("AdaBoost Classification")
ada_boost_perf = [defaultdict(list),defaultdict(list)]
for n in range(50,51,50):
    top_k = 3
    print(f"n_estimators: {n}")
    clf = AdaBoostClassifier(n_estimators=n, random_state=0)
    
    # Fit on Coarse labels
    clf.fit(X_train_all,Y_train_types)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test_types,k=top_k)
    coarse_score = np.mean(top_label == Y_test_types)
    print("Coarse:",coarse_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    ada_boost_perf[0]["Score"].append(coarse_score)
    ada_boost_perf[0][f"Top-{top_k}"].append(top_k_s)
    
    # Fit on Fine labels
    clf.fit(X_train_all,Y_train)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test,k=top_k)
    fine_score = np.mean(top_label == Y_test)
    print("Fine:",fine_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    ada_boost_perf[1]["Score"].append(fine_score)
    ada_boost_perf[1][f"Top-{top_k}"].append(top_k_s)
    
summary[race]["Coarse"]["baseline"][f"{clf}"] = ada_boost_perf[0]
summary[race]["Fine"]["baseline"][f"{clf}"] = ada_boost_perf[1]

AdaBoost Classification
n_estimators: 50
Coarse: 0.45620474159309593
	Top-3: 0.8589921634758456
Fine: 0.25369506993353835
	Top-3: 0.575339748040869


## GPC Analysis

Basic grid search over hyperparameters for RBF, Matern, RationalQuadratic does not turn up good results, neither are they that different between each other. Having a top-1 accuracy aking to the most frequent and top-3 accuracy 3-most frequent... (naive guessing results).

Problem appears to be the lack of good strategy for finding an informative inducing set to condition the GPC upon.

In [22]:
# Uniformly Sample Inducing Set from each possible labels with some budget
# budget = 300
# inducing_set = []

# y_labels = np.unique(Y_train)
# y = np.array(Y_train)
# indexes = {yi: np.where(y == yi)[0].tolist() for yi in y_labels}

# # Evenly distribute budget into each label
# each = {yi: min(len(indexes[yi]),int(budget/len(y_labels))) for yi in y_labels}
# print(each)

# # Round-Robin remainder
# cur = int(budget/len(y_labels))*len(y_labels)
# random.shuffle(y_labels)
# l = 0
# while (cur < budget):
#     lbl = y_labels[l]
#     l = (l+1)%len(y_labels)
#     if (each[lbl] < len(indexes[lbl])):
#         each[lbl] += 1
#         cur += 1
# print(each)

# for yi, num in each.items():
#     inducing_set.extend(random.sample(indexes[yi],num))
#     # Verify inducing set extraction accuracy
#     verify=True
#     for i in indexes[yi]:
#         verify &= (Y_train[i] == yi)
#     if (not verify):
#         print(f"Error on label: {yi}")

In [23]:
# GP Classification with RBF, Matern kernels
inducing_set = random.sample(range(len(Y_train)), 300)
top_k_model = None
top_k_config = 0
# kernel_params = {
#     "RBF_l": np.linspace(1,20,10),
#     "Matern_nu": [1.5,2.5],
#     "RQ_alpha": np.linspace(0.5,2.5,10),
# }
# for k_name, param_li in kernel_params.items():
#     for p in param_li:
#         if (k_name == "RBF_l"):
#             kernel = RBF(p)
#         elif (k_name == "Matern_nu"):
#             kernel = Matern(nu=p)
#         elif (k_name == "RQ_alpha"):
#             kernel = RationalQuadratic(alpha=p)
#         else:
#             continue
#         gpc = GaussianProcessClassifier(kernel=kernel).fit(X_train_all.iloc[inducing_set],[Y_train[i] for i in inducing_set])
#         print(f"Trained GPC: {kernel}")
#         top_k = 3
#         top_k_s, top_k_matches, top_label = top_k_score(gpc,[X_valid_all.head(1000)],Y_valid[:1000],k=top_k)
#         score = np.mean(top_label == Y_valid[:1000])
#         print(f"{gpc}:",score)
#         print(f"\tTop-{top_k}: {top_k_s}")
#         if (top_k_s > top_k_config):
#             top_k_config = top_k_s
#             top_k_model = gpc
# print(top_k_model)

In [24]:
# No Good results with GPC
# kernel = RationalQuadratic(alpha=2.5, length_scale=5)
# gpc = GaussianProcessClassifier(kernel=kernel).fit(X_train_all.iloc[inducing_set],[Y_train[i] for i in inducing_set])
# print(f"Trained GPC: {kernel}")
# top_k = 3
# top_k_s, top_k_matches, top_label = top_k_score(gpc,[X_valid_all.head(1000)],Y_valid[:1000],k=top_k)
# score = np.mean(top_label == Y_valid[:1000])
# print(f"{gpc}:",score)
# print(f"\tTop-{top_k}: {top_k_s}")
# if (top_k_s > top_k_config):
#     top_k_config = top_k_s
#     top_k_model = gpc

## KDE Naive Bayes

Use Kernel Density Estimates as the likelihood of data given a label. Better results than using combination of Naive Bayes classifiers.

### Coarse label prediction with KDE

In [25]:
print("Discrete Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_dis = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_dis.fit(X_train_dis, Y_train_types)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_dis = KDEClassifier(kernel='linear')
# KDE_linear_model_dis.fit(X_train_dis, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_dis  = KDEClassifier(kernel='exponential')
KDE_exp_model_dis.fit(X_train_dis, Y_train_types)

print("All Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_all = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_all.fit(X_train_all, Y_train_types)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_all = KDEClassifier(kernel='linear')
# KDE_linear_model_all.fit(X_train_all, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_all  = KDEClassifier(kernel='exponential')
KDE_exp_model_all.fit(X_train_all, Y_train_types)

Discrete Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...
All Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...


KDEClassifier(kernel='exponential')

In [26]:
top_k = 3
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_dis,[X_valid_dis],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_gaussian_model_dis:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['discrete'][f"{KDE_gaussian_model_dis}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_dis: 0.5010910533624281
	Top-3: 0.8272168220591153


In [27]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_exp_model_dis,[X_valid_dis],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_exp_model_dis:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['discrete'][f"{KDE_exp_model_dis}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_exp_model_dis: 0.5320372941876612
	Top-3: 0.8585598095615949


In [28]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_all,[X_valid_all],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_gaussian_model_all:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['all'][f"{KDE_gaussian_model_all}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_all: 0.4848244395953184
	Top-3: 0.8214639952390399


  return result / result.sum(1, keepdims=True)


In [29]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_exp_model_all,[X_valid_all],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_exp_model_all:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['all'][f"{KDE_exp_model_all}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_exp_model_all: 0.5272763340607023
	Top-3: 0.8504265026780401


### Fine label predictions with KDE

In [30]:
print("Discrete Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_dis = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_dis.fit(X_train_dis, Y_train)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_dis = KDEClassifier(kernel='linear')
# KDE_linear_model_dis.fit(X_train_dis, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_dis  = KDEClassifier(kernel='exponential')
KDE_exp_model_dis.fit(X_train_dis, Y_train)

print("All Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_all = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_all.fit(X_train_all, Y_train)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_all = KDEClassifier(kernel='linear')
# KDE_linear_model_all.fit(X_train_all, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_all  = KDEClassifier(kernel='exponential')
KDE_exp_model_all.fit(X_train_all, Y_train)

Discrete Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...
All Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...


KDEClassifier(kernel='exponential')

In [31]:
top_k = 3
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_dis,[X_valid_dis],Y_valid,k=top_k)
score = np.mean(top_label == Y_valid)
print("KDE_gaussian_model_dis:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Fine']['discrete'][f"{KDE_gaussian_model_dis}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_dis: 0.43126363816703034
	Top-3: 0.6538385241023607


In [32]:
top_k = 3
top_k_s, matches, top_label = top_k_score(KDE_exp_model_dis,[X_valid_dis],Y_valid,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_valid,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
top_1_score = np.mean(top_label == Y_valid)
summary[race]['Fine']['discrete'][f"{KDE_exp_model_dis}"] = {"Score": top_1_score, f"Top-{top_k}": top_k_s}
print("KDE_exp_model_dis:",top_1_score)
print(f"\tTop-{top_k}: {top_k_s}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

KDE_exp_model_dis: 0.4540765721087086
	Top-3: 0.689744098393176


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Probe,0.972387,2028,0.402301,
1,CyberneticsCore,0.876543,81,0.016068,
2,Pylon,0.783362,577,0.114461,
3,Adept,0.731343,134,0.026582,
4,Stalker,0.720513,390,0.077366,
5,Carrier,0.666667,12,0.00238,
6,Zealot,0.660714,280,0.055545,
7,Gateway,0.567901,243,0.048205,
8,AssimilatorRich,0.536765,136,0.026979,
9,Warp Gate,0.434783,46,0.009125,


In [33]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_all,[X_valid_all],Y_valid,k=top_k)
score = np.mean(top_label == Y_valid)
print("KDE_gaussian_model_all:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Fine']['all'][f"{KDE_gaussian_model_all}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_all: 0.40527673080737947
	Top-3: 0.623884149970244


  return result / result.sum(1, keepdims=True)


In [34]:
top_k = 3
top_k_s, matches, top_label = top_k_score(KDE_exp_model_all,[X_valid_all],Y_valid,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_valid,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
top_1_score = np.mean(top_label == Y_valid)
summary[race]['Fine']['all'][f"{KDE_exp_model_all}"] = {"Score": top_1_score, f"Top-{top_k}": top_k_s}
print("KDE_exp_model_all:",top_1_score)
print(f"\tTop-{top_k}: {top_k_s}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
# formatted_data #(hide)

KDE_exp_model_all: 0.4395953183892085
	Top-3: 0.660979964292799


## Naive Bayes (Gaussian, Multinomial, Complement)

**Gaussian likelihood** assuming feature-value given label is normally distributed.

**Multinomial likelihood** for discrete and fractional valued features, problematic when dataset label proportion is imbalanced.

**Complement likelihood** which assigns likelihood based on how disimilar the features are for other labels instead, fixing the imbalanced dataset issue for multinomial.

In [35]:
print("Prediction using ONLY Continuous Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    top_k = 3
    print(f"\n{name}")
    
    # Fit on Coarse labels
    clf.fit(X_train_con,Y_train_types)
    s = clf.score(X_test_con,Y_test_types)
    print(f"Coarse Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_con],Y_test_types,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Coarse"]["continuous"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}
    
    # Fit on Fine labels
    clf.fit(X_train_con,Y_train)
    s = clf.score(X_test_con,Y_test)
    print(f"Fine Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_con],Y_test,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Fine"]["continuous"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}

Prediction using ONLY Continuous Features

Gaussian
Coarse Predictions: 0.4980656680884833
	Top-3: 0.6974506497371292
Fine Predictions: 0.07598452534470787
	Top-3: 0.3694573950996925

Multinomial
Coarse Predictions: 0.497817676817776
	Top-3: 0.838508084515425
Fine Predictions: 0.3990675528221407
	Top-3: 0.6040075389346296

Complement
Coarse Predictions: 0.4745064973712925
	Top-3: 0.6274179148893959
Fine Predictions: 0.3158416823727805
	Top-3: 0.45724630493006646


In [36]:
print("Prediction using ONLY Discrete Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    top_k = 3
    print(f"\n{name}")
    
    # Fit on Coarse labels
    clf.fit(X_train_dis,Y_train_types)
    s = clf.score(X_test_dis,Y_test_types)
    print(f"Coarse Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_dis],Y_test_types,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Coarse"]["discrete"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}
    
    # Fit on Fine labels
    clf.fit(X_train_dis,Y_train)
    s = clf.score(X_test_dis,Y_test)
    print(f"Fine Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_dis],Y_test,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Fine"]["discrete"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}

Prediction using ONLY Discrete Features

Gaussian
Coarse Predictions: 0.47252256720563435
	Top-3: 0.6437853387560758
Fine Predictions: 0.020880864993552226
	Top-3: 0.20712230929471281

Multinomial
Coarse Predictions: 0.4765896240452336
	Top-3: 0.7064775319908739
Fine Predictions: 0.24799127070727112
	Top-3: 0.5001983930165658

Complement
Coarse Predictions: 0.49047713520484076
	Top-3: 0.7073207023112786
Fine Predictions: 0.42054359686539033
	Top-3: 0.5867473464934034


In [37]:
def single_model_all(clf,coarse=False):
    if (coarse):
        train_y = Y_train_types
        test_y = Y_test_types
    else:
        train_y = Y_train
        test_y = Y_test
    clf.fit(X_train_all,train_y)
    score = clf.score(X_test_all,test_y)
    print(f"ALL Model ({clf}):",score)
    top_k = 3
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],test_y,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Coarse" if coarse else "Fine"]["all"][f"{clf}"] = {"Score": score, f"Top-{top_k}": top_k_s}
    
models = [lambda: GaussianNB(),lambda: MultinomialNB(),lambda: ComplementNB()]

print("Coarse Predictions:")
top_coarse_mixture = None
top_coarse_k_score = 0
for i in range(len(models)):
    for j in range(len(models)):
        model = MixedNB(models[i](),models[j]())
        model.fit(X_train_con, X_train_dis, Y_train_types)
        top_k = 3
        top_k_s, top_k_matches, top_label = top_k_score(model,[X_test_con,X_test_dis],Y_test_types,k=top_k)
        score = np.mean(top_label == Y_test_types)
        print(f"Mixture Model ({model.clf_con} + {model.clf_dis}):", score)
        print(f"\tTop-{top_k}: {top_k_s}")
        if (top_k_s > top_coarse_k_score):
            top_coarse_mixture = model
            top_coarse_k_score = top_k_s
        summary[race]["Coarse"]["all"][f"{model}"] = {"Score": score, f"Top-{top_k}": top_k_s}
single_model_all(ComplementNB(),coarse=True)
single_model_all(MultinomialNB(),coarse=True)

print("\nFine Predictions:")
top_fine_mixture = None
top_fine_k_score = 0
for i in range(len(models)):
    for j in range(len(models)):
        model = MixedNB(models[i](),models[j]())
        model.fit(X_train_con, X_train_dis, Y_train)
        top_k = 3
        top_k_s, top_k_matches, top_label = top_k_score(model,[X_test_con,X_test_dis],Y_test,k=top_k)
        score = np.mean(top_label == Y_test)
        print(f"Mixture Model ({model.clf_con} + {model.clf_dis}):", score)
        print(f"\tTop-{top_k}: {top_k_s}")
        if (top_k_s > top_fine_k_score):
            top_fine_mixture = model
            top_fine_k_score = top_k_s
        summary[race]["Fine"]["all"][f"{model}"] = {"Score": score, f"Top-{top_k}": top_k_s}
single_model_all(ComplementNB())
single_model_all(MultinomialNB())

Coarse Predictions:
Mixture Model (GaussianNB() + GaussianNB()): 0.48402936216645176
	Top-3: 0.6597063783354826
Mixture Model (GaussianNB() + MultinomialNB()): 0.4901299474258506
	Top-3: 0.7206626326753298
Mixture Model (GaussianNB() + ComplementNB()): 0.49756968554706876
	Top-3: 0.7010217240353139
Mixture Model (MultinomialNB() + GaussianNB()): 0.4907747247296895
	Top-3: 0.6921436365439937
Mixture Model (MultinomialNB() + MultinomialNB()): 0.4847237377244321
	Top-3: 0.7361372879674636
Mixture Model (MultinomialNB() + ComplementNB()): 0.4903283404424164
	Top-3: 0.7662434282313263
Mixture Model (ComplementNB() + GaussianNB()): 0.4732169427636147
	Top-3: 0.6441821247892074
Mixture Model (ComplementNB() + MultinomialNB()): 0.4773831961114969
	Top-3: 0.706824719769864
Mixture Model (ComplementNB() + ComplementNB()): 0.4914195020335284
	Top-3: 0.7080646761234004
ALL Model (ComplementNB()): 0.4745064973712925
	Top-3: 0.6627814700922527
ALL Model (MultinomialNB()): 0.40992957047911915
	Top-3:

## Visualizing Prediction accuracies per label

Interesting to dig into just what are getting predicted correctly/wrongly and whether it is performing above naive levels of guessing 3-most frequent items

In [38]:
# Individual class prediction accuracies
indiv, indiv_cols, score = score_individual(top_coarse_mixture.predict(X_test_con, X_test_dis),Y_test_types,mapping=lambda x: decode_type(x,suppress=True))
print(f"Coarse Predictions {top_coarse_mixture}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Coarse Predictions MixedNB(clf_con=MultinomialNB(), clf_dis=ComplementNB()): 0.4903283404424164


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Worker,0.908614,7999,0.396736,0.468058
1,Army,0.468588,5587,0.277105,0.564955
2,Upgrade,0.0,482,0.023906,
3,"Static Defense,Building",0.0,806,0.039976,
4,"Technology,Building",0.0,693,0.034372,
5,"Economy,Building",0.0,3320,0.164666,
6,"Building,Army",0.0,1275,0.063238,


In [39]:
indiv, indiv_cols, score = score_individual(top_fine_mixture.predict(X_test_con, X_test_dis),Y_test,mapping=lambda x: rev_ACTIONS[race][x])
print(f"Fine Predictions {top_fine_mixture}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions MixedNB(clf_con=MultinomialNB(), clf_dis=ComplementNB()): 0.4197996230532685


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Probe,0.924991,7999,0.396736,0.459024
1,Zealot,0.589858,1124,0.055748,0.256281
2,Stalker,0.243932,1648,0.081738,0.276669
3,Adept,0.0,546,0.027081,
4,Archon,0.0,160,0.007936,
5,AssimilatorRich,0.0,562,0.027874,
6,Blink,0.0,64,0.003174,
7,Carrier,0.0,42,0.002083,
8,Charge,0.0,51,0.00253,
9,Colossus,0.0,103,0.005109,


In [40]:
top_k = 3
top_k_s, matches, top_label = top_k_score(top_fine_mixture,[X_test_con,X_test_dis],Y_test,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_test,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
print(f"Fine Predictions {top_fine_mixture} Top-{top_k}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions MixedNB(clf_con=MultinomialNB(), clf_dis=ComplementNB()) Top-3: 0.6210693383592898


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Pylon,0.962511,2214,0.109811,
1,Probe,0.948994,7999,0.396736,
2,Stalker,0.790049,1648,0.081738,
3,Zealot,0.739324,1124,0.055748,
4,Gateway,0.523759,947,0.04697,
5,HighTemplar,0.484,250,0.0124,
6,Archon,0.30625,160,0.007936,
7,Observer,0.003663,273,0.01354,
8,Adept,0.0,546,0.027081,
9,AssimilatorRich,0.0,562,0.027874,


In [41]:
clf_con = GaussianNB()
clf_con.fit(X_train_con,Y_train_types)
indiv, indiv_cols, score = score_individual(clf_con.predict(X_test_con),Y_test_types,mapping=lambda x: decode_type(x,suppress=True))
print(f"Coarse Predictions Continuous {clf_con}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Coarse Predictions Continuous GaussianNB(): 0.4980656680884833


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Worker,0.83673,7999,0.396736,0.496219
1,Army,0.599427,5587,0.277105,0.501798
2,Upgrade,0.0,482,0.023906,
3,"Static Defense,Building",0.0,806,0.039976,
4,"Technology,Building",0.0,693,0.034372,
5,"Economy,Building",0.0,3320,0.164666,
6,"Building,Army",0.0,1275,0.063238,


In [42]:
clf_dis = ComplementNB()
clf_dis.fit(X_train_dis,Y_train_types)
indiv, indiv_cols, score = score_individual(clf_dis.predict(X_test_dis),Y_test_types,mapping=lambda x: decode_type(x,suppress=True))
print(f"Coarse Predictions Discrete {clf_dis}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Coarse Predictions Discrete ComplementNB(): 0.49047713520484076


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Worker,0.909739,7999,0.396736,0.467854
1,Army,0.465187,5587,0.277105,0.566972
2,"Static Defense,Building",0.016129,806,0.039976,0.541667
3,Upgrade,0.0,482,0.023906,
4,"Technology,Building",0.0,693,0.034372,
5,"Economy,Building",0.0,3320,0.164666,
6,"Building,Army",0.0,1275,0.063238,


In [43]:
clf_con = GaussianNB()
clf_con.fit(X_train_con,Y_train)
indiv, indiv_cols, score = score_individual(clf_con.predict(X_test_con),Y_test,mapping=lambda x: rev_ACTIONS[race][x])
print(f"Fine Predictions Continuous {clf_con}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Continuous GaussianNB(): 0.07598452534470787


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Warp Gate,0.909639,166,0.008233,0.066549
1,CyberneticsCore,0.885932,263,0.013044,0.063195
2,Oracle,0.816327,98,0.004861,0.022734
3,ProtossGroundWeaponsLevel2,0.733333,30,0.001488,0.007161
4,ProtossGroundWeaponsLevel1,0.367647,68,0.003373,0.013736
5,Disruptor,0.346457,127,0.006299,0.047059
6,Charge,0.196078,51,0.00253,0.006725
7,ProtossGroundWeaponsLevel3,0.142857,7,0.000347,0.007692
8,Probe,0.11814,7999,0.396736,0.554903
9,HighTemplar,0.036,250,0.0124,0.06338


In [44]:
top_k = 3
clf_con = GaussianNB()
clf_con.fit(X_train_con,Y_train)
top_k_s, matches, top_label = top_k_score(clf_con,[X_test_con],Y_test,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_test,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
print(f"Fine Predictions Continuous {clf_con} Top-{top_k}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Continuous GaussianNB() Top-3: 0.3694573950996925


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Gravitic Boosters,1.0,1,5e-05,
1,Warp Gate,0.951807,166,0.008233,
2,CyberneticsCore,0.939163,263,0.013044,
3,Oracle,0.897959,98,0.004861,
4,Probe,0.780348,7999,0.396736,
5,ProtossGroundWeaponsLevel2,0.766667,30,0.001488,
6,ProtossGroundWeaponsLevel1,0.573529,68,0.003373,
7,Charge,0.490196,51,0.00253,
8,TwilightCouncil,0.475177,141,0.006993,
9,Disruptor,0.433071,127,0.006299,


In [45]:
clf_dis = ComplementNB()
clf_dis.fit(X_train_dis,Y_train)
indiv, indiv_cols, score = score_individual(clf_dis.predict(X_test_dis),Y_test,mapping=lambda x: rev_ACTIONS[race][x])
print(f"Fine Predictions Discrete {clf_dis}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Discrete ComplementNB(): 0.42054359686539033


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Probe,0.897612,7999,0.396736,0.474146
1,Zealot,0.6379,1124,0.055748,0.253715
2,Stalker,0.346481,1648,0.081738,0.271388
3,PhotonCannon,0.031621,253,0.012548,0.153846
4,HighTemplar,0.008,250,0.0124,0.25
5,VoidRay,0.005236,191,0.009473,0.0625
6,Adept,0.0,546,0.027081,0.0
7,Archon,0.0,160,0.007936,
8,AssimilatorRich,0.0,562,0.027874,
9,Blink,0.0,64,0.003174,


In [46]:
top_k = 3
clf_dis = ComplementNB()
clf_dis.fit(X_train_dis,Y_train)
top_k_s, matches, top_label = top_k_score(clf_dis,[X_test_dis],Y_test,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_test,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
print(f"Fine Predictions Discrete {clf_dis} Top-{top_k}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Discrete ComplementNB() Top-3: 0.5867473464934034


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Probe,0.91974,7999,0.396736,
1,Adept,0.915751,546,0.027081,
2,HighTemplar,0.808,250,0.0124,
3,Archon,0.74375,160,0.007936,
4,Zealot,0.733096,1124,0.055748,
5,Gateway,0.639916,947,0.04697,
6,Stalker,0.632888,1648,0.081738,
7,Pylon,0.445348,2214,0.109811,
8,Disruptor,0.385827,127,0.006299,
9,VoidRay,0.303665,191,0.009473,


# TERRAN

In [47]:
# Control Variables
race = 'T' # Change to run for a different race
k_con_features = 10
k_dis_features = 30

In [48]:
print("===== CONTINUOUS =====")
X, y = con_dfs[race], target_dfs[race]['Target'].values.tolist()
X_new = SelectKBest().fit(X,y)
feature_names = list(con_dfs[race])
feature_scores = sorted([(feature_names[i], X_new.scores_[i]) for i in range(len(feature_names))],key=lambda x:x[1],reverse=True)
for name, score in feature_scores[:k_con_features]:
    print(f"{name}: {score:.3f}")
selected_con_features = [name for name,score in feature_scores[:k_con_features]]
print("===== DISCRETE =====")
X, y = dis_dfs[race], target_dfs[race]['Target'].values.tolist()
X_new = SelectKBest().fit(X,y)
feature_names = list(X)
feature_scores = sorted([(feature_names[i], X_new.scores_[i]) for i in range(len(feature_names))],key=lambda x:x[1],reverse=True)
for name, score in feature_scores[:k_dis_features]:
    print(f"{name}: {score:.3f}")
selected_dis_features = [name for name,score in feature_scores[:k_dis_features]]

===== CONTINUOUS =====
mineral_value_current_technology: 550.929
mineral_total_technology: 548.076
mineral_spend: 541.893
mineral_total_economic: 528.978
mineral_value_current_economic: 520.374
worker_supply_ratio: 509.133
vespene_spend: 481.874
mineral_value_current_army: 481.724
mineral_total_army: 468.525
vespene_value_current_army: 466.016
===== DISCRETE =====
SupplyDepot: 603.448
Timestamp: 586.358
Barracks: 556.069
supply_available: 541.990
TerranInfantryWeaponsLevel1: 524.355
Combat Shield: 519.359
SCV: 509.206
supply_consumed: 504.091
Stimpack: 487.613
workers_active: 473.471
Refinery: 469.745
Marine: 456.200
Medivac: 449.946
EngineeringBay: 423.579
OrbitalCommand: 403.101
TerranInfantryArmorsLevel1: 373.465
BarracksReactor: 360.421
CommandCenter: 356.678
Starport: 353.804
Marauder: 290.529
StarportReactor: 258.922
SiegeTank: 246.674
BarracksTechLab: 230.386
Factory: 218.535
Armory: 208.333
TerranInfantryWeaponsLevel2: 202.034
Concussive Shells: 201.937
WidowMine: 178.058
Facto

In [49]:
# Grab Train/Test data split
X_con, X_dis, X_all, Y_actions, Y, Y_types = dataset_processing(con_dfs, dis_dfs, target_dfs, race, con_features=selected_con_features, dis_features=selected_dis_features)
X_train_con, X_valid_con, X_test_con = X_con
X_train_dis, X_valid_dis, X_test_dis = X_dis
X_train_all, X_valid_all, X_test_all = X_all
Y_train, Y_valid, Y_test = Y
Y_train_types, Y_valid_types, Y_test_types = Y_types

# Keep a copy of the data for replicability
data[race]["X_train_con"], data[race]["X_valid_con"], data[race]["X_test_con"] = X_con
data[race]["X_train_dis"], data[race]["X_valid_dis"], data[race]["X_test_dis"] = X_dis
data[race]["X_train_all"], data[race]["X_valid_all"], data[race]["X_test_all"] = X_all
data[race]["Y_train"], data[race]["Y_valid"], data[race]["Y_test"] = Y
data[race]["Y_train_types"], data[race]["Y_valid_types"], data[race]["Y_test_types"] = Y

0.351846452866861 0.6439504373177842
0.3686953352769679 0.877089407191448


In [50]:
# Some prior occurance stats for each label type
lbl_count = defaultdict(int)
for i in Y_actions:
    lbl_count[i] += 1
lbl_stats = sorted([(lbl_count[i]/len(Y_actions),i) for i in lbl_count.keys()],reverse=True)
for ratio,lbl in lbl_stats:
    decode_type(lbl)
    print(f"\t{ratio:.3f}, {lbl_count[lbl]}")

010000
	Army
	0.369, 30351
000100
	Worker
	0.352, 28964
001100
	Economy
	Building
	0.157, 12887
011000
	Building
	Army
	0.042, 3490
011010
	Technology
	Building
	Army
	0.033, 2696
100000
	Upgrade
	0.019, 1569
001001
	Static Defense
	Building
	0.017, 1416
001010
	Technology
	Building
	0.012, 947


In [51]:
# Kernel Density Estimate
kde_sample = X_train_con.loc[random.sample(X_train_con.index.tolist(),1000)]
for k_name in ['gaussian','tophat','exponential','linear']:
    kde = KernelDensity(kernel=k_name, bandwidth=0.3).fit(kde_sample)
    print(k_name, kde.score(kde_sample))

gaussian 1364.1610437439879
tophat 9281.94793749947
exponential -5732.513651841835
linear 10874.333054024417


In [52]:
# Random Forest classifier (control?)
print("Random Forest Classification")
rand_forest_perf = [defaultdict(list),defaultdict(list)]
for i in range(2,3):
    top_k = 3
    print(f"max_depth: {i}")
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    
    # Fit on Coarse labels
    clf.fit(X_train_all,Y_train_types)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test_types,k=top_k)
    coarse_score = np.mean(top_label == Y_test_types)
    print("Coarse:",coarse_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    rand_forest_perf[0]["Score"].append(coarse_score)
    rand_forest_perf[0][f"Top-{top_k}"].append(top_k_s)
    
    # Fit on Fine labels
    clf.fit(X_train_all,Y_train)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test,k=top_k)
    fine_score = np.mean(top_label == Y_test)
    print("Fine:",fine_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    rand_forest_perf[1]["Score"].append(fine_score)
    rand_forest_perf[1][f"Top-{top_k}"].append(top_k_s)
    
summary[race]["Coarse"]["baseline"][f"{clf}"] = rand_forest_perf[0]
summary[race]["Fine"]["baseline"][f"{clf}"] = rand_forest_perf[1]

Random Forest Classification
max_depth: 2
Coarse: 0.5066812439261419
	Top-3: 0.8830782312925171
Fine: 0.42608114674441205
	Top-3: 0.6678814382896016


In [53]:
# AdaBoost Classifier
print("AdaBoost Classification")
ada_boost_perf = [defaultdict(list),defaultdict(list)]
for n in range(50,51,50):
    top_k = 3
    print(f"n_estimators: {n}")
    clf = AdaBoostClassifier(n_estimators=n, random_state=0)
    
    # Fit on Coarse labels
    clf.fit(X_train_all,Y_train_types)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test_types,k=top_k)
    coarse_score = np.mean(top_label == Y_test_types)
    print("Coarse:",coarse_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    ada_boost_perf[0]["Score"].append(coarse_score)
    ada_boost_perf[0][f"Top-{top_k}"].append(top_k_s)
    
    # Fit on Fine labels
    clf.fit(X_train_all,Y_train)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test,k=top_k)
    fine_score = np.mean(top_label == Y_test)
    print("Fine:",fine_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    ada_boost_perf[1]["Score"].append(fine_score)
    ada_boost_perf[1][f"Top-{top_k}"].append(top_k_s)
    
summary[race]["Coarse"]["baseline"][f"{clf}"] = ada_boost_perf[0]
summary[race]["Fine"]["baseline"][f"{clf}"] = ada_boost_perf[1]

AdaBoost Classification
n_estimators: 50
Coarse: 0.4603377065111759
	Top-3: 0.8626093294460642
Fine: 0.42419825072886297
	Top-3: 0.6519071914480078


## KDE Naive Bayes

Use Kernel Density Estimates as the likelihood of data given a label. Better results than using combination of Naive Bayes classifiers.

### Coarse label prediction with KDE

In [54]:
print("Discrete Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_dis = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_dis.fit(X_train_dis, Y_train_types)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_dis = KDEClassifier(kernel='linear')
# KDE_linear_model_dis.fit(X_train_dis, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_dis  = KDEClassifier(kernel='exponential')
KDE_exp_model_dis.fit(X_train_dis, Y_train_types)

print("All Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_all = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_all.fit(X_train_all, Y_train_types)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_all = KDEClassifier(kernel='linear')
# KDE_linear_model_all.fit(X_train_all, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_all  = KDEClassifier(kernel='exponential')
KDE_exp_model_all.fit(X_train_all, Y_train_types)

Discrete Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...
All Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...


KDEClassifier(kernel='exponential')

In [55]:
top_k = 3
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_dis,[X_valid_dis],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_gaussian_model_dis:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['discrete'][f"{KDE_gaussian_model_dis}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_dis: 0.5017006802721088
	Top-3: 0.8493683187560739


In [56]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_exp_model_dis,[X_valid_dis],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_exp_model_dis:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['discrete'][f"{KDE_exp_model_dis}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_exp_model_dis: 0.543974732750243
	Top-3: 0.8853255587949466


In [57]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_all,[X_valid_all],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_gaussian_model_all:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['all'][f"{KDE_gaussian_model_all}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_all: 0.5034013605442177
	Top-3: 0.8483965014577259


  return result / result.sum(1, keepdims=True)


In [58]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_exp_model_all,[X_valid_all],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_exp_model_all:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['all'][f"{KDE_exp_model_all}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_exp_model_all: 0.5483479105928085
	Top-3: 0.879980563654033


### Fine label predictions with KDE

In [59]:
print("Discrete Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_dis = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_dis.fit(X_train_dis, Y_train)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_dis = KDEClassifier(kernel='linear')
# KDE_linear_model_dis.fit(X_train_dis, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_dis  = KDEClassifier(kernel='exponential')
KDE_exp_model_dis.fit(X_train_dis, Y_train)

print("All Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_all = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_all.fit(X_train_all, Y_train)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_all = KDEClassifier(kernel='linear')
# KDE_linear_model_all.fit(X_train_all, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_all  = KDEClassifier(kernel='exponential')
KDE_exp_model_all.fit(X_train_all, Y_train)

Discrete Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...
All Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...


KDEClassifier(kernel='exponential')

In [60]:
top_k = 3
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_dis,[X_valid_dis],Y_valid,k=top_k)
score = np.mean(top_label == Y_valid)
print("KDE_gaussian_model_dis:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Fine']['discrete'][f"{KDE_gaussian_model_dis}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_dis: 0.4035471331389699
	Top-3: 0.6669096209912536


In [61]:
top_k = 3
top_k_s, matches, top_label = top_k_score(KDE_exp_model_dis,[X_valid_dis],Y_valid,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_valid,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
top_1_score = np.mean(top_label == Y_valid)
summary[race]['Fine']['discrete'][f"{KDE_exp_model_dis}"] = {"Score": top_1_score, f"Top-{top_k}": top_k_s}
print("KDE_exp_model_dis:",top_1_score)
print(f"\tTop-{top_k}: {top_k_s}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

KDE_exp_model_dis: 0.446064139941691
	Top-3: 0.7050534499514092


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Thor,1.0,1,0.000243,
1,SCV,0.96261,1471,0.357386,
2,Marine,0.9274,854,0.207483,
3,Reaper,0.803571,56,0.013605,
4,Hellion,0.741176,85,0.020651,
5,SupplyDepot,0.670927,313,0.076045,
6,Factory,0.666667,60,0.014577,
7,Starport,0.62069,29,0.007046,
8,RefineryRich,0.540541,111,0.026968,
9,CommandCenter,0.494949,99,0.024052,


In [62]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_all,[X_valid_all],Y_valid,k=top_k)
score = np.mean(top_label == Y_valid)
print("KDE_gaussian_model_all:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Fine']['all'][f"{KDE_gaussian_model_all}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_all: 0.4098639455782313
	Top-3: 0.663022351797862


  return result / result.sum(1, keepdims=True)


In [63]:
top_k = 3
top_k_s, matches, top_label = top_k_score(KDE_exp_model_all,[X_valid_all],Y_valid,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_valid,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
top_1_score = np.mean(top_label == Y_valid)
summary[race]['Fine']['all'][f"{KDE_exp_model_all}"] = {"Score": top_1_score, f"Top-{top_k}": top_k_s}
print("KDE_exp_model_all:",top_1_score)
print(f"\tTop-{top_k}: {top_k_s}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
# formatted_data #(hide)

KDE_exp_model_all: 0.44436345966958213
	Top-3: 0.6919339164237124


## Naive Bayes (Gaussian, Multinomial, Complement)

**Gaussian likelihood** assuming feature-value given label is normally distributed.

**Multinomial likelihood** for discrete and fractional valued features, problematic when dataset label proportion is imbalanced.

**Complement likelihood** which assigns likelihood based on how disimilar the features are for other labels instead, fixing the imbalanced dataset issue for multinomial.

In [64]:
print("Prediction using ONLY Continuous Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    top_k = 3
    print(f"\n{name}")
    
    # Fit on Coarse labels
    clf.fit(X_train_con,Y_train_types)
    s = clf.score(X_test_con,Y_test_types)
    print(f"Coarse Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_con],Y_test_types,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Coarse"]["continuous"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}
    
    # Fit on Fine labels
    clf.fit(X_train_con,Y_train)
    s = clf.score(X_test_con,Y_test)
    print(f"Fine Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_con],Y_test,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Fine"]["continuous"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}

Prediction using ONLY Continuous Features

Gaussian
Coarse Predictions: 0.40992468415937805
	Top-3: 0.6655126336248786
Fine Predictions: 0.04980563654033042
	Top-3: 0.22284985422740525

Multinomial
Coarse Predictions: 0.49252915451895046
	Top-3: 0.8779154518950437
Fine Predictions: 0.40992468415937805
	Top-3: 0.6487487852283771

Complement
Coarse Predictions: 0.49386540330417883
	Top-3: 0.6686103012633625
Fine Predictions: 0.38411078717201164
	Top-3: 0.5202259475218659


In [65]:
print("Prediction using ONLY Discrete Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    top_k = 3
    print(f"\n{name}")
    
    # Fit on Coarse labels
    clf.fit(X_train_dis,Y_train_types)
    s = clf.score(X_test_dis,Y_test_types)
    print(f"Coarse Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_dis],Y_test_types,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Coarse"]["discrete"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}
    
    # Fit on Fine labels
    clf.fit(X_train_dis,Y_train)
    s = clf.score(X_test_dis,Y_test)
    print(f"Fine Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_dis],Y_test,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Fine"]["discrete"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}

Prediction using ONLY Discrete Features

Gaussian
Coarse Predictions: 0.4416302235179786
	Top-3: 0.6526967930029155
Fine Predictions: 0.02520651117589893
	Top-3: 0.05885568513119534

Multinomial
Coarse Predictions: 0.4823250728862974
	Top-3: 0.6769314868804664
Fine Predictions: 0.17784256559766765
	Top-3: 0.4074344023323615

Complement
Coarse Predictions: 0.49690233236151604
	Top-3: 0.6928449951409135
Fine Predictions: 0.423955296404276
	Top-3: 0.5835762876579204


In [66]:
def single_model_all(clf,coarse=False):
    if (coarse):
        train_y = Y_train_types
        test_y = Y_test_types
    else:
        train_y = Y_train
        test_y = Y_test
    clf.fit(X_train_all,train_y)
    score = clf.score(X_test_all,test_y)
    print(f"ALL Model ({clf}):",score)
    top_k = 3
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],test_y,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Coarse" if coarse else "Fine"]["all"][f"{clf}"] = {"Score": score, f"Top-{top_k}": top_k_s}
    
models = [lambda: GaussianNB(),lambda: MultinomialNB(),lambda: ComplementNB()]

print("Coarse Predictions:")
top_coarse_mixture = None
top_coarse_k_score = 0
for i in range(len(models)):
    for j in range(len(models)):
        model = MixedNB(models[i](),models[j]())
        model.fit(X_train_con, X_train_dis, Y_train_types)
        top_k = 3
        top_k_s, top_k_matches, top_label = top_k_score(model,[X_test_con,X_test_dis],Y_test_types,k=top_k)
        score = np.mean(top_label == Y_test_types)
        print(f"Mixture Model ({model.clf_con} + {model.clf_dis}):", score)
        print(f"\tTop-{top_k}: {top_k_s}")
        if (top_k_s > top_coarse_k_score):
            top_coarse_mixture = model
            top_coarse_k_score = top_k_s
        summary[race]["Coarse"]["all"][f"{model}"] = {"Score": score, f"Top-{top_k}": top_k_s}
single_model_all(ComplementNB(),coarse=True)
single_model_all(MultinomialNB(),coarse=True)

print("\nFine Predictions:")
top_fine_mixture = None
top_fine_k_score = 0
for i in range(len(models)):
    for j in range(len(models)):
        model = MixedNB(models[i](),models[j]())
        model.fit(X_train_con, X_train_dis, Y_train)
        top_k = 3
        top_k_s, top_k_matches, top_label = top_k_score(model,[X_test_con,X_test_dis],Y_test,k=top_k)
        score = np.mean(top_label == Y_test)
        print(f"Mixture Model ({model.clf_con} + {model.clf_dis}):", score)
        print(f"\tTop-{top_k}: {top_k_s}")
        if (top_k_s > top_fine_k_score):
            top_fine_mixture = model
            top_fine_k_score = top_k_s
        summary[race]["Fine"]["all"][f"{model}"] = {"Score": score, f"Top-{top_k}": top_k_s}
single_model_all(ComplementNB())
single_model_all(MultinomialNB())

Coarse Predictions:
Mixture Model (GaussianNB() + GaussianNB()): 0.4463678328474247
	Top-3: 0.6611394557823129
Mixture Model (GaussianNB() + MultinomialNB()): 0.4730928085519922
	Top-3: 0.6785714285714286
Mixture Model (GaussianNB() + ComplementNB()): 0.4665330417881438
	Top-3: 0.6684280855199223
Mixture Model (MultinomialNB() + GaussianNB()): 0.4888848396501458
	Top-3: 0.6779033041788144
Mixture Model (MultinomialNB() + MultinomialNB()): 0.49295432458697763
	Top-3: 0.7072400388726919
Mixture Model (MultinomialNB() + ComplementNB()): 0.4983600583090379
	Top-3: 0.748117103984451
Mixture Model (ComplementNB() + GaussianNB()): 0.4446064139941691
	Top-3: 0.6534256559766763
Mixture Model (ComplementNB() + MultinomialNB()): 0.48390427599611274
	Top-3: 0.6778425655976676
Mixture Model (ComplementNB() + ComplementNB()): 0.4982385811467444
	Top-3: 0.6930272108843537
ALL Model (ComplementNB()): 0.48760932944606417
	Top-3: 0.6811831875607386
ALL Model (MultinomialNB()): 0.11649659863945579
	Top-3

## Visualizing Prediction accuracies per label

Interesting to dig into just what are getting predicted correctly/wrongly and whether it is performing above naive levels of guessing 3-most frequent items

In [67]:
# Individual class prediction accuracies
indiv, indiv_cols, score = score_individual(top_coarse_mixture.predict(X_test_con, X_test_dis),Y_test_types,mapping=lambda x: decode_type(x,suppress=True))
print(f"Coarse Predictions {top_coarse_mixture}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Coarse Predictions MixedNB(clf_con=MultinomialNB(), clf_dis=ComplementNB()): 0.4983600583090379


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Worker,0.860782,5732,0.348154,0.434637
1,Army,0.53439,6121,0.371781,0.639867
2,Upgrade,0.0,311,0.01889,
3,"Static Defense,Building",0.0,273,0.016582,
4,"Technology,Building",0.0,207,0.012573,
5,"Economy,Building",0.0,2601,0.157981,
6,"Building,Army",0.0,650,0.03948,
7,"Technology,Building,Army",0.0,569,0.03456,


In [68]:
indiv, indiv_cols, score = score_individual(top_fine_mixture.predict(X_test_con, X_test_dis),Y_test,mapping=lambda x: rev_ACTIONS[race][x])
print(f"Fine Predictions {top_fine_mixture}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions MixedNB(clf_con=MultinomialNB(), clf_dis=ComplementNB()): 0.4266885325558795


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,SCV,0.883112,5732,0.348154,0.427534
1,Marine,0.562178,3490,0.211978,0.425412
2,Marauder,0.001802,555,0.03371,0.083333
3,Advanced Ballistics,0.0,1,6.1e-05,
4,Armory,0.0,52,0.003158,
5,Banshee,0.0,29,0.001761,
6,Barracks,0.0,350,0.021259,
7,BarracksReactor,0.0,189,0.01148,
8,BarracksTechLab,0.0,107,0.006499,
9,Battlecruiser,0.0,9,0.000547,


In [69]:
top_k = 3
top_k_s, matches, top_label = top_k_score(top_fine_mixture,[X_test_con,X_test_dis],Y_test,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_test,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
print(f"Fine Predictions {top_fine_mixture} Top-{top_k}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions MixedNB(clf_con=MultinomialNB(), clf_dis=ComplementNB()) Top-3: 0.623117103984451


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,SCV,0.932484,5732,0.348154,
1,SupplyDepot,0.910714,1400,0.085034,
2,Marine,0.767335,3490,0.211978,
3,Marauder,0.727928,555,0.03371,
4,RefineryRich,0.662577,489,0.029701,
5,SiegeTank,0.352941,459,0.027879,
6,Medivac,0.107463,335,0.020347,
7,WidowMine,0.091743,218,0.013241,
8,Hellion,0.044643,336,0.020408,
9,Advanced Ballistics,0.0,1,6.1e-05,


In [70]:
clf_con = GaussianNB()
clf_con.fit(X_train_con,Y_train_types)
indiv, indiv_cols, score = score_individual(clf_con.predict(X_test_con),Y_test_types,mapping=lambda x: decode_type(x,suppress=True))
print(f"Coarse Predictions Continuous {clf_con}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Coarse Predictions Continuous GaussianNB(): 0.40992468415937805


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Worker,0.695918,5732,0.348154,0.466223
1,Upgrade,0.488746,311,0.01889,0.038895
2,Army,0.426074,6121,0.371781,0.652
3,"Static Defense,Building",0.0,273,0.016582,
4,"Technology,Building",0.0,207,0.012573,
5,"Economy,Building",0.0,2601,0.157981,
6,"Building,Army",0.0,650,0.03948,
7,"Technology,Building,Army",0.0,569,0.03456,


In [71]:
clf_dis = ComplementNB()
clf_dis.fit(X_train_dis,Y_train_types)
indiv, indiv_cols, score = score_individual(clf_dis.predict(X_test_dis),Y_test_types,mapping=lambda x: decode_type(x,suppress=True))
print(f"Coarse Predictions Discrete {clf_dis}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Coarse Predictions Discrete ComplementNB(): 0.49690233236151604


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Worker,0.865492,5732,0.348154,0.433048
1,Army,0.526058,6121,0.371781,0.642971
2,Upgrade,0.0,311,0.01889,
3,"Static Defense,Building",0.0,273,0.016582,
4,"Technology,Building",0.0,207,0.012573,
5,"Economy,Building",0.0,2601,0.157981,
6,"Building,Army",0.0,650,0.03948,
7,"Technology,Building,Army",0.0,569,0.03456,


In [72]:
clf_con = GaussianNB()
clf_con.fit(X_train_con,Y_train)
indiv, indiv_cols, score = score_individual(clf_con.predict(X_test_con),Y_test,mapping=lambda x: rev_ACTIONS[race][x])
print(f"Fine Predictions Continuous {clf_con}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Continuous GaussianNB(): 0.04980563654033042


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Reaper,0.917031,229,0.013909,0.05299
1,Cloaking Field,0.846154,13,0.00079,0.006138
2,TerranInfantryWeaponsLevel1,0.492308,65,0.003948,0.012992
3,Raven,0.482143,56,0.003401,0.013946
4,Starport,0.288462,104,0.006317,0.079576
5,TerranInfantryWeaponsLevel2,0.238095,21,0.001276,0.006443
6,TerranInfantryArmorsLevel1,0.216216,37,0.002247,0.007866
7,Ghost,0.214286,42,0.002551,0.014423
8,SCV,0.084962,5732,0.348154,0.997951
9,TerranInfantryArmorsLevel2,0.071429,14,0.00085,0.001198


In [73]:
top_k = 3
clf_con = GaussianNB()
clf_con.fit(X_train_con,Y_train)
top_k_s, matches, top_label = top_k_score(clf_con,[X_test_con],Y_test,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_test,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
print(f"Fine Predictions Continuous {clf_con} Top-{top_k}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Continuous GaussianNB() Top-3: 0.22284985422740525


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,TerranInfantryArmorsLevel3,1.0,2,0.000121,
1,TerranShipWeaponsLevel1,1.0,1,6.1e-05,
2,Reaper,0.930131,229,0.013909,
3,Cloaking Field,0.923077,13,0.00079,
4,Raven,0.910714,56,0.003401,
5,Factory,0.785714,196,0.011905,
6,TerranInfantryWeaponsLevel2,0.761905,21,0.001276,
7,Starport,0.75,104,0.006317,
8,Banshee,0.586207,29,0.001761,
9,Stimpack,0.586207,58,0.003523,


In [74]:
clf_dis = ComplementNB()
clf_dis.fit(X_train_dis,Y_train)
indiv, indiv_cols, score = score_individual(clf_dis.predict(X_test_dis),Y_test,mapping=lambda x: rev_ACTIONS[race][x])
print(f"Fine Predictions Discrete {clf_dis}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Discrete ComplementNB(): 0.423955296404276


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,SCV,0.873168,5732,0.348154,0.429172
1,Marine,0.546132,3490,0.211978,0.433773
2,Marauder,0.124324,555,0.03371,0.175127
3,Advanced Ballistics,0.0,1,6.1e-05,
4,Armory,0.0,52,0.003158,
5,Banshee,0.0,29,0.001761,
6,Barracks,0.0,350,0.021259,
7,BarracksReactor,0.0,189,0.01148,
8,BarracksTechLab,0.0,107,0.006499,
9,Battlecruiser,0.0,9,0.000547,


In [75]:
top_k = 3
clf_dis = ComplementNB()
clf_dis.fit(X_train_dis,Y_train)
top_k_s, matches, top_label = top_k_score(clf_dis,[X_test_dis],Y_test,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_test,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
print(f"Fine Predictions Discrete {clf_dis} Top-{top_k}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Discrete ComplementNB() Top-3: 0.5835762876579204


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Hellion,0.994048,336,0.020408,
1,SCV,0.897592,5732,0.348154,
2,Marauder,0.85045,555,0.03371,
3,SupplyDepot,0.751429,1400,0.085034,
4,Marine,0.602006,3490,0.211978,
5,WidowMine,0.568807,218,0.013241,
6,RefineryRich,0.335378,489,0.029701,
7,Medivac,0.304478,335,0.020347,
8,SiegeTank,0.244009,459,0.027879,
9,Ghost,0.02381,42,0.002551,


# Zerg

In [76]:
# Control Variables
race = 'Z' # Change to run for a different race
k_con_features = 10
k_dis_features = 30

In [77]:
print("===== CONTINUOUS =====")
X, y = con_dfs[race], target_dfs[race]['Target'].values.tolist()
X_new = SelectKBest().fit(X,y)
feature_names = list(con_dfs[race])
feature_scores = sorted([(feature_names[i], X_new.scores_[i]) for i in range(len(feature_names))],key=lambda x:x[1],reverse=True)
for name, score in feature_scores[:k_con_features]:
    print(f"{name}: {score:.3f}")
selected_con_features = [name for name,score in feature_scores[:k_con_features]]
print("===== DISCRETE =====")
X, y = dis_dfs[race], target_dfs[race]['Target'].values.tolist()
X_new = SelectKBest().fit(X,y)
feature_names = list(X)
feature_scores = sorted([(feature_names[i], X_new.scores_[i]) for i in range(len(feature_names))],key=lambda x:x[1],reverse=True)
for name, score in feature_scores[:k_dis_features]:
    print(f"{name}: {score:.3f}")
selected_dis_features = [name for name,score in feature_scores[:k_dis_features]]

===== CONTINUOUS =====
mineral_value_current_technology: 765.635
mineral_spend: 753.835
mineral_total_technology: 752.671
mineral_total_economic: 731.802
mineral_value_current_economic: 727.286
worker_supply_ratio: 716.572
vespene_spend: 664.375
mineral_value_current_army: 658.207
vespene_value_current_army: 644.681
mineral_total_army: 640.846
===== DISCRETE =====
Timestamp: 820.575
Overlord: 810.407
supply_available: 761.603
supply_consumed: 700.996
Extractor: 688.132
workers_active: 660.226
Drone: 657.311
HydraliskDen: 531.017
Hatchery: 513.850
Lair: 499.449
GlialReconstitution: 495.703
Roach: 477.839
RoachWarren: 466.857
Hydralisk: 462.017
ZergMissileWeaponsLevel1: 450.144
Zergling: 446.403
EvolveGroovedSpines: 431.216
SpawningPool: 388.465
EvolveMuscularAugments: 388.450
Queen: 383.122
CentrificalHooks: 351.088
Spire: 345.107
zerglingmovementspeed: 339.122
Mutalisk: 338.502
ZergMeleeWeaponsLevel1: 314.778
EvolutionChamber: 313.067
Corruptor: 310.680
BanelingNest: 304.230
LurkerDenM

In [78]:
# Grab Train/Test data split
X_con, X_dis, X_all, Y_actions, Y, Y_types = dataset_processing(con_dfs, dis_dfs, target_dfs, race, con_features=selected_con_features, dis_features=selected_dis_features)
X_train_con, X_valid_con, X_test_con = X_con
X_train_dis, X_valid_dis, X_test_dis = X_dis
X_train_all, X_valid_all, X_test_all = X_all
Y_train, Y_valid, Y_test = Y
Y_train_types, Y_valid_types, Y_test_types = Y_types

# Keep a copy of the data for replicability
data[race]["X_train_con"], data[race]["X_valid_con"], data[race]["X_test_con"] = X_con
data[race]["X_train_dis"], data[race]["X_valid_dis"], data[race]["X_test_dis"] = X_dis
data[race]["X_train_all"], data[race]["X_valid_all"], data[race]["X_test_all"] = X_all
data[race]["Y_train"], data[race]["Y_valid"], data[race]["Y_test"] = Y
data[race]["Y_train_types"], data[race]["Y_valid_types"], data[race]["Y_test_types"] = Y

0.3216570624801488 0.5959208675529742
0.3339307591088525 0.8537819320295839


In [79]:
# Some prior occurance stats for each label type
lbl_count = defaultdict(int)
for i in Y_actions:
    lbl_count[i] += 1
lbl_stats = sorted([(lbl_count[i]/len(Y_actions),i) for i in lbl_count.keys()],reverse=True)
for ratio,lbl in lbl_stats:
    decode_type(lbl)
    print(f"\t{ratio:.3f}, {lbl_count[lbl]}")

010000
	Army
	0.334, 29438
000100
	Worker
	0.322, 28356
001100
	Economy
	Building
	0.198, 17472
001010
	Technology
	Building
	0.050, 4429
010100
	Economy
	Army
	0.045, 3956
100000
	Upgrade
	0.030, 2671
001001
	Static Defense
	Building
	0.021, 1834


In [80]:
# Kernel Density Estimate
kde_sample = X_train_con.loc[random.sample(X_train_con.index.tolist(),1000)]
for k_name in ['gaussian','tophat','exponential','linear']:
    kde = KernelDensity(kernel=k_name, bandwidth=0.3).fit(kde_sample)
    print(k_name, kde.score(kde_sample))

gaussian 1355.8917603656027
tophat 9274.897807359863
exponential -5740.220150919038
linear 10872.757727754699


In [81]:
# Random Forest classifier (control?)
print("Random Forest Classification")
rand_forest_perf = [defaultdict(list),defaultdict(list)]
for i in range(2,3):
    top_k = 3
    print(f"max_depth: {i}")
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    
    # Fit on Coarse labels
    clf.fit(X_train_all,Y_train_types)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test_types,k=top_k)
    coarse_score = np.mean(top_label == Y_test_types)
    print("Coarse:",coarse_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    rand_forest_perf[0]["Score"].append(coarse_score)
    rand_forest_perf[0][f"Top-{top_k}"].append(top_k_s)
    
    # Fit on Fine labels
    clf.fit(X_train_all,Y_train)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test,k=top_k)
    fine_score = np.mean(top_label == Y_test)
    print("Fine:",fine_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    rand_forest_perf[1]["Score"].append(fine_score)
    rand_forest_perf[1][f"Top-{top_k}"].append(top_k_s)
    
summary[race]["Coarse"]["baseline"][f"{clf}"] = rand_forest_perf[0]
summary[race]["Fine"]["baseline"][f"{clf}"] = rand_forest_perf[1]

Random Forest Classification
max_depth: 2
Coarse: 0.5014178765880217
	Top-3: 0.8588929219600726
Fine: 0.39575771324863884
	Top-3: 0.6381011796733213


In [82]:
# AdaBoost Classifier
print("AdaBoost Classification")
ada_boost_perf = [defaultdict(list),defaultdict(list)]
for n in range(50,51,50):
    top_k = 3
    print(f"n_estimators: {n}")
    clf = AdaBoostClassifier(n_estimators=n, random_state=0)
    
    # Fit on Coarse labels
    clf.fit(X_train_all,Y_train_types)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test_types,k=top_k)
    coarse_score = np.mean(top_label == Y_test_types)
    print("Coarse:",coarse_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    ada_boost_perf[0]["Score"].append(coarse_score)
    ada_boost_perf[0][f"Top-{top_k}"].append(top_k_s)
    
    # Fit on Fine labels
    clf.fit(X_train_all,Y_train)
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],Y_test,k=top_k)
    fine_score = np.mean(top_label == Y_test)
    print("Fine:",fine_score)
    print(f"\tTop-{top_k}: {top_k_s}")
    ada_boost_perf[1]["Score"].append(fine_score)
    ada_boost_perf[1][f"Top-{top_k}"].append(top_k_s)
    
summary[race]["Coarse"]["baseline"][f"{clf}"] = ada_boost_perf[0]
summary[race]["Fine"]["baseline"][f"{clf}"] = ada_boost_perf[1]

AdaBoost Classification
n_estimators: 50
Coarse: 0.519850272232305
	Top-3: 0.8787431941923775
Fine: 0.14694872958257713
	Top-3: 0.34647232304900183


## KDE Naive Bayes

Use Kernel Density Estimates as the likelihood of data given a label. Better results than using combination of Naive Bayes classifiers.

### Coarse label prediction with KDE

In [83]:
print("Discrete Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_dis = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_dis.fit(X_train_dis, Y_train_types)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_dis = KDEClassifier(kernel='linear')
# KDE_linear_model_dis.fit(X_train_dis, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_dis  = KDEClassifier(kernel='exponential')
KDE_exp_model_dis.fit(X_train_dis, Y_train_types)

print("All Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_all = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_all.fit(X_train_all, Y_train_types)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_all = KDEClassifier(kernel='linear')
# KDE_linear_model_all.fit(X_train_all, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_all  = KDEClassifier(kernel='exponential')
KDE_exp_model_all.fit(X_train_all, Y_train_types)

Discrete Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...
All Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...


KDEClassifier(kernel='exponential')

In [84]:
top_k = 3
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_dis,[X_valid_dis],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_gaussian_model_dis:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['discrete'][f"{KDE_gaussian_model_dis}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_dis: 0.5130474245518494
	Top-3: 0.8282278193782618


In [85]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_exp_model_dis,[X_valid_dis],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_exp_model_dis:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['discrete'][f"{KDE_exp_model_dis}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_exp_model_dis: 0.5321080099841162
	Top-3: 0.8561379623326526


In [86]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_all,[X_valid_all],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_gaussian_model_all:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['all'][f"{KDE_gaussian_model_all}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_all: 0.4824143408214205
	Top-3: 0.8148400272294077


  return result / result.sum(1, keepdims=True)


In [87]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_exp_model_all,[X_valid_all],Y_valid_types,k=top_k)
score = np.mean(top_label == Y_valid_types)
print("KDE_exp_model_all:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Coarse']['all'][f"{KDE_exp_model_all}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_exp_model_all: 0.515543453596551
	Top-3: 0.8459269344225097


### Fine label predictions with KDE

In [88]:
print("Discrete Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_dis = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_dis.fit(X_train_dis, Y_train)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_dis = KDEClassifier(kernel='linear')
# KDE_linear_model_dis.fit(X_train_dis, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_dis  = KDEClassifier(kernel='exponential')
KDE_exp_model_dis.fit(X_train_dis, Y_train)

print("All Features:")
print("Fitting KDE Gaussian Kernel Model...")
KDE_gaussian_model_all = KDEClassifier(kernel='gaussian')
KDE_gaussian_model_all.fit(X_train_all, Y_train)
# print("Fitting KDE Linear Kernel Model...")
# KDE_linear_model_all = KDEClassifier(kernel='linear')
# KDE_linear_model_all.fit(X_train_all, Y_train)
print("Fitting KDE Exponential Kernel Model...")
KDE_exp_model_all  = KDEClassifier(kernel='exponential')
KDE_exp_model_all.fit(X_train_all, Y_train)

Discrete Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...
All Features:
Fitting KDE Gaussian Kernel Model...
Fitting KDE Exponential Kernel Model...


KDEClassifier(kernel='exponential')

In [89]:
top_k = 3
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_dis,[X_valid_dis],Y_valid,k=top_k)
score = np.mean(top_label == Y_valid)
print("KDE_gaussian_model_dis:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Fine']['discrete'][f"{KDE_gaussian_model_dis}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_dis: 0.42205582028590877
	Top-3: 0.6848196051735874


In [90]:
top_k = 3
top_k_s, matches, top_label = top_k_score(KDE_exp_model_dis,[X_valid_dis],Y_valid,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_valid,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
top_1_score = np.mean(top_label == Y_valid)
summary[race]['Fine']['discrete'][f"{KDE_exp_model_dis}"] = {"Score": top_1_score, f"Top-{top_k}": top_k_s}
print("KDE_exp_model_dis:",top_1_score)
print(f"\tTop-{top_k}: {top_k_s}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

KDE_exp_model_dis: 0.4388472884048105
	Top-3: 0.7120490129339687


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,TunnelingClaws,1.0,1,0.000227,
1,SpawningPool,0.962264,53,0.012026,
2,Drone,0.92092,1391,0.315634,
3,Roach,0.830729,384,0.087134,
4,Hydralisk,0.78125,96,0.021784,
5,Zergling,0.772231,641,0.14545,
6,Overlord,0.771231,577,0.130928,
7,SwarmHost,0.75,4,0.000908,
8,Corruptor,0.733333,15,0.003404,
9,Mutalisk,0.685714,105,0.023826,


In [91]:
top_k_s, top_k_matches, top_label = top_k_score(KDE_gaussian_model_all,[X_valid_all],Y_valid,k=top_k)
score = np.mean(top_label == Y_valid)
print("KDE_gaussian_model_all:",score)
print(f"\tTop-{top_k}: {top_k_s}")
summary[race]['Fine']['all'][f"{KDE_gaussian_model_all}"] = {"Score": score, f"Top-{top_k}": top_k_s}

KDE_gaussian_model_all: 0.3759927388245972
	Top-3: 0.6367143181302474


  return result / result.sum(1, keepdims=True)


In [92]:
top_k = 3
top_k_s, matches, top_label = top_k_score(KDE_exp_model_all,[X_valid_all],Y_valid,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_valid,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
top_1_score = np.mean(top_label == Y_valid)
summary[race]['Fine']['all'][f"{KDE_exp_model_all}"] = {"Score": top_1_score, f"Top-{top_k}": top_k_s}
print("KDE_exp_model_all:",top_1_score)
print(f"\tTop-{top_k}: {top_k_s}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
# formatted_data #(hide)

KDE_exp_model_all: 0.403902881778988
	Top-3: 0.6709779895620603


## Naive Bayes (Gaussian, Multinomial, Complement)

**Gaussian likelihood** assuming feature-value given label is normally distributed.

**Multinomial likelihood** for discrete and fractional valued features, problematic when dataset label proportion is imbalanced.

**Complement likelihood** which assigns likelihood based on how disimilar the features are for other labels instead, fixing the imbalanced dataset issue for multinomial.

In [93]:
print("Prediction using ONLY Continuous Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    top_k = 3
    print(f"\n{name}")
    
    # Fit on Coarse labels
    clf.fit(X_train_con,Y_train_types)
    s = clf.score(X_test_con,Y_test_types)
    print(f"Coarse Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_con],Y_test_types,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Coarse"]["continuous"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}
    
    # Fit on Fine labels
    clf.fit(X_train_con,Y_train)
    s = clf.score(X_test_con,Y_test)
    print(f"Fine Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_con],Y_test,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Fine"]["continuous"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}

Prediction using ONLY Continuous Features

Gaussian
Coarse Predictions: 0.4952359346642468
	Top-3: 0.7330421960072595
Fine Predictions: 0.05813294010889292
	Top-3: 0.37726860254083483

Multinomial
Coarse Predictions: 0.4951225045372051
	Top-3: 0.8496483666061706
Fine Predictions: 0.34613203266787657
	Top-3: 0.607815335753176

Complement
Coarse Predictions: 0.4951225045372051
	Top-3: 0.6871597096188747
Fine Predictions: 0.3055807622504537
	Top-3: 0.49364791288566245


In [94]:
print("Prediction using ONLY Discrete Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    top_k = 3
    print(f"\n{name}")
    
    # Fit on Coarse labels
    clf.fit(X_train_dis,Y_train_types)
    s = clf.score(X_test_dis,Y_test_types)
    print(f"Coarse Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_dis],Y_test_types,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Coarse"]["discrete"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}
    
    # Fit on Fine labels
    clf.fit(X_train_dis,Y_train)
    s = clf.score(X_test_dis,Y_test)
    print(f"Fine Predictions: {s}")
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_dis],Y_test,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Fine"]["discrete"][f"{clf}"] = {"Score": s, f"Top-{top_k}": top_k_s}

Prediction using ONLY Discrete Features

Gaussian
Coarse Predictions: 0.3651315789473684
	Top-3: 0.7124546279491834
Fine Predictions: 0.03584392014519056
	Top-3: 0.13668330308529947

Multinomial
Coarse Predictions: 0.43818058076225047
	Top-3: 0.7028130671506352
Fine Predictions: 0.19368194192377494
	Top-3: 0.4544010889292196

Complement
Coarse Predictions: 0.48877041742286753
	Top-3: 0.7296960072595281
Fine Predictions: 0.4105603448275862
	Top-3: 0.6481397459165155


In [95]:
def single_model_all(clf,coarse=False):
    if (coarse):
        train_y = Y_train_types
        test_y = Y_test_types
    else:
        train_y = Y_train
        test_y = Y_test
    clf.fit(X_train_all,train_y)
    score = clf.score(X_test_all,test_y)
    print(f"ALL Model ({clf}):",score)
    top_k = 3
    top_k_s, top_k_matches, top_label = top_k_score(clf,[X_test_all],test_y,k=top_k)
    print(f"\tTop-{top_k}: {top_k_s}")
    summary[race]["Coarse" if coarse else "Fine"]["all"][f"{clf}"] = {"Score": score, f"Top-{top_k}": top_k_s}
    
models = [lambda: GaussianNB(),lambda: MultinomialNB(),lambda: ComplementNB()]

print("Coarse Predictions:")
top_coarse_mixture = None
top_coarse_k_score = 0
for i in range(len(models)):
    for j in range(len(models)):
        model = MixedNB(models[i](),models[j]())
        model.fit(X_train_con, X_train_dis, Y_train_types)
        top_k = 3
        top_k_s, top_k_matches, top_label = top_k_score(model,[X_test_con,X_test_dis],Y_test_types,k=top_k)
        score = np.mean(top_label == Y_test_types)
        print(f"Mixture Model ({model.clf_con} + {model.clf_dis}):", score)
        print(f"\tTop-{top_k}: {top_k_s}")
        if (top_k_s > top_coarse_k_score):
            top_coarse_mixture = model
            top_coarse_k_score = top_k_s
        summary[race]["Coarse"]["all"][f"{model}"] = {"Score": score, f"Top-{top_k}": top_k_s}
single_model_all(ComplementNB(),coarse=True)
single_model_all(MultinomialNB(),coarse=True)

print("\nFine Predictions:")
top_fine_mixture = None
top_fine_k_score = 0
for i in range(len(models)):
    for j in range(len(models)):
        model = MixedNB(models[i](),models[j]())
        model.fit(X_train_con, X_train_dis, Y_train)
        top_k = 3
        top_k_s, top_k_matches, top_label = top_k_score(model,[X_test_con,X_test_dis],Y_test,k=top_k)
        score = np.mean(top_label == Y_test)
        print(f"Mixture Model ({model.clf_con} + {model.clf_dis}):", score)
        print(f"\tTop-{top_k}: {top_k_s}")
        if (top_k_s > top_fine_k_score):
            top_fine_mixture = model
            top_fine_k_score = top_k_s
        summary[race]["Fine"]["all"][f"{model}"] = {"Score": score, f"Top-{top_k}": top_k_s}
single_model_all(ComplementNB())
single_model_all(MultinomialNB())

Coarse Predictions:
Mixture Model (GaussianNB() + GaussianNB()): 0.4724364791288566
	Top-3: 0.7189201451905626
Mixture Model (GaussianNB() + MultinomialNB()): 0.4529832123411978
	Top-3: 0.7131919237749547
Mixture Model (GaussianNB() + ComplementNB()): 0.49183303085299457
	Top-3: 0.7189768602540835
Mixture Model (MultinomialNB() + GaussianNB()): 0.48616152450090744
	Top-3: 0.7362749546279492
Mixture Model (MultinomialNB() + MultinomialNB()): 0.4503743194192377
	Top-3: 0.7176724137931034
Mixture Model (MultinomialNB() + ComplementNB()): 0.4896778584392015
	Top-3: 0.7701338475499092
Mixture Model (ComplementNB() + GaussianNB()): 0.37777903811252267
	Top-3: 0.7126247731397459
Mixture Model (ComplementNB() + MultinomialNB()): 0.43976860254083483
	Top-3: 0.7037205081669692
Mixture Model (ComplementNB() + ComplementNB()): 0.4896778584392015
	Top-3: 0.7308303085299456
ALL Model (ComplementNB()): 0.48400635208711434
	Top-3: 0.7184097096188747
ALL Model (MultinomialNB()): 0.2753516333938294
	Top

## Visualizing Prediction accuracies per label

Interesting to dig into just what are getting predicted correctly/wrongly and whether it is performing above naive levels of guessing 3-most frequent items

In [96]:
# Individual class prediction accuracies
indiv, indiv_cols, score = score_individual(top_coarse_mixture.predict(X_test_con, X_test_dis),Y_test_types,mapping=lambda x: decode_type(x,suppress=True))
print(f"Coarse Predictions {top_coarse_mixture}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Coarse Predictions MixedNB(clf_con=MultinomialNB(), clf_dis=ComplementNB()): 0.4896778584392015


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Worker,0.882519,5652,0.320554,0.416917
1,Army,0.630612,5769,0.327189,0.645722
2,"Economy,Building",0.002247,3560,0.201906,0.235294
3,Upgrade,0.0,514,0.029152,
4,"Static Defense,Building",0.0,395,0.022402,
5,"Technology,Building",0.0,932,0.052858,
6,"Economy,Army",0.0,810,0.045939,


In [97]:
indiv, indiv_cols, score = score_individual(top_fine_mixture.predict(X_test_con, X_test_dis),Y_test,mapping=lambda x: rev_ACTIONS[race][x])
print(f"Fine Predictions {top_fine_mixture}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions MixedNB(clf_con=MultinomialNB(), clf_dis=ComplementNB()): 0.4092558983666062


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Drone,0.861819,5652,0.320554,0.434368
1,Roach,0.639587,1551,0.087965,0.381538
2,Zergling,0.470565,2514,0.142582,0.360451
3,Hydralisk,0.372973,370,0.020985,0.298056
4,Corruptor,0.105263,95,0.005388,0.454545
5,Mutalisk,0.058824,374,0.021211,0.431373
6,Adaptive Talons,0.0,6,0.00034,
7,Anabolic Synthesis,0.0,1,5.7e-05,
8,Baneling,0.0,409,0.023196,
9,BanelingNest,0.0,84,0.004764,


In [98]:
top_k = 3
top_k_s, matches, top_label = top_k_score(top_fine_mixture,[X_test_con,X_test_dis],Y_test,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_test,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
print(f"Fine Predictions {top_fine_mixture} Top-{top_k}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions MixedNB(clf_con=MultinomialNB(), clf_dis=ComplementNB()) Top-3: 0.6694078947368421


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Drone,0.935244,5652,0.320554,
1,Overlord,0.880304,2239,0.126985,
2,Roach,0.874275,1551,0.087965,
3,Hydralisk,0.837838,370,0.020985,
4,Mutalisk,0.76738,374,0.021211,
5,Queen,0.751852,810,0.045939,
6,Zergling,0.650358,2514,0.142582,
7,Baneling,0.601467,409,0.023196,
8,Corruptor,0.536842,95,0.005388,
9,Ravager,0.221239,226,0.012818,


In [99]:
clf_con = GaussianNB()
clf_con.fit(X_train_con,Y_train_types)
indiv, indiv_cols, score = score_individual(clf_con.predict(X_test_con),Y_test_types,mapping=lambda x: decode_type(x,suppress=True))
print(f"Coarse Predictions Continuous {clf_con}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Coarse Predictions Continuous GaussianNB(): 0.4952359346642468


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Worker,0.774062,5652,0.320554,0.458932
1,Army,0.702375,5769,0.327189,0.592571
2,"Economy,Building",0.085674,3560,0.201906,0.241872
3,Upgrade,0.0,514,0.029152,
4,"Static Defense,Building",0.0,395,0.022402,
5,"Technology,Building",0.0,932,0.052858,
6,"Economy,Army",0.0,810,0.045939,


In [100]:
clf_dis = ComplementNB()
clf_dis.fit(X_train_dis,Y_train_types)
indiv, indiv_cols, score = score_individual(clf_dis.predict(X_test_dis),Y_test_types,mapping=lambda x: decode_type(x,suppress=True))
print(f"Coarse Predictions Discrete {clf_dis}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Coarse Predictions Discrete ComplementNB(): 0.48877041742286753


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Worker,0.886235,5652,0.320554,0.41693
1,Army,0.623158,5769,0.327189,0.648449
2,"Economy,Building",0.003933,3560,0.201906,0.189189
3,Upgrade,0.0,514,0.029152,
4,"Static Defense,Building",0.0,395,0.022402,
5,"Technology,Building",0.0,932,0.052858,
6,"Economy,Army",0.0,810,0.045939,


In [101]:
clf_con = GaussianNB()
clf_con.fit(X_train_con,Y_train)
indiv, indiv_cols, score = score_individual(clf_con.predict(X_test_con),Y_test,mapping=lambda x: rev_ACTIONS[race][x])
print(f"Fine Predictions Continuous {clf_con}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Continuous GaussianNB(): 0.05813294010889292


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,SpawningPool,0.987603,242,0.013725,0.077749
1,zerglingmovementspeed,0.844961,129,0.007316,0.034298
2,Ultralisk,0.5,8,0.000454,0.008048
3,GlialReconstitution,0.487179,39,0.002212,0.006939
4,Lair,0.478992,119,0.006749,0.024224
5,Mutalisk,0.149733,374,0.021211,0.055118
6,Roach,0.102515,1551,0.087965,0.173014
7,Drone,0.067233,5652,0.320554,0.409483
8,CentrificalHooks,0.054054,37,0.002098,0.002813
9,Adaptive Talons,0.0,6,0.00034,


In [102]:
top_k = 3
clf_con = GaussianNB()
clf_con.fit(X_train_con,Y_train)
top_k_s, matches, top_label = top_k_score(clf_con,[X_test_con],Y_test,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_test,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
print(f"Fine Predictions Continuous {clf_con} Top-{top_k}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Continuous GaussianNB() Top-3: 0.37726860254083483


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Anabolic Synthesis,1.0,1,5.7e-05,
1,SpawningPool,0.987603,242,0.013725,
2,zerglingmovementspeed,0.953488,129,0.007316,
3,Drone,0.774062,5652,0.320554,
4,Ultralisk,0.625,8,0.000454,
5,Lair,0.605042,119,0.006749,
6,CentrificalHooks,0.594595,37,0.002098,
7,BroodLord,0.5,2,0.000113,
8,ZergFlyerWeaponsLevel2,0.5,2,0.000113,
9,ZergMissileWeaponsLevel3,0.5,2,0.000113,


In [103]:
clf_dis = ComplementNB()
clf_dis.fit(X_train_dis,Y_train)
indiv, indiv_cols, score = score_individual(clf_dis.predict(X_test_dis),Y_test,mapping=lambda x: rev_ACTIONS[race][x])
print(f"Fine Predictions Discrete {clf_dis}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Discrete ComplementNB(): 0.4105603448275862


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Drone,0.846249,5652,0.320554,0.444145
1,Roach,0.669246,1551,0.087965,0.370318
2,Zergling,0.472554,2514,0.142582,0.359782
3,Hydralisk,0.464865,370,0.020985,0.302285
4,Corruptor,0.126316,95,0.005388,0.292683
5,Mutalisk,0.122995,374,0.021211,0.310811
6,Adaptive Talons,0.0,6,0.00034,
7,Anabolic Synthesis,0.0,1,5.7e-05,
8,Baneling,0.0,409,0.023196,
9,BanelingNest,0.0,84,0.004764,


In [104]:
top_k = 3
clf_dis = ComplementNB()
clf_dis.fit(X_train_dis,Y_train)
top_k_s, matches, top_label = top_k_score(clf_dis,[X_test_dis],Y_test,k=top_k)
indiv, indiv_cols, score = score_individual(matches,Y_test,mapping=lambda x: rev_ACTIONS[race][x],match_literal=True)
print(f"Fine Predictions Discrete {clf_dis} Top-{top_k}: {score}")
formatted_data = pd.DataFrame(indiv, columns=indiv_cols)
formatted_data

Fine Predictions Discrete ComplementNB() Top-3: 0.6481397459165155


Unnamed: 0,Label,Accuracy,Label_Frequency,Percentage_Frequency,Label_Prediction_Accuracy
0,Drone,0.908882,5652,0.320554,
1,Hydralisk,0.87027,370,0.020985,
2,Mutalisk,0.863636,374,0.021211,
3,Roach,0.852998,1551,0.087965,
4,Queen,0.787654,810,0.045939,
5,Overlord,0.780259,2239,0.126985,
6,Corruptor,0.736842,95,0.005388,
7,Baneling,0.689487,409,0.023196,
8,Zergling,0.58393,2514,0.142582,
9,Ravager,0.433628,226,0.012818,
