In [1]:
import os
import sys
import random
import datetime
import itertools
import numpy as np
import pandas as pd
from zipUtil import zip_write, zip_read
from collections import defaultdict

In [2]:
from dataset_config import decode_type, PROTOSS_ACTIONS, PROTOSS_ACTIONS_TYPE, TERRAN_ACTIONS, TERRAN_ACTIONS_TYPE, ZERG_ACTIONS, ZERG_ACTIONS_TYPE
ACTIONS = {'P': PROTOSS_ACTIONS, 'T': TERRAN_ACTIONS, 'Z': ZERG_ACTIONS}
rev_ACTIONS = {race: {v:k for k,v in ACTIONS[race].items()} for race in 'PTZ'}
ACTIONS_TYPE = {'P': PROTOSS_ACTIONS_TYPE, 'T': TERRAN_ACTIONS_TYPE, 'Z': ZERG_ACTIONS_TYPE}
GATHERER_NAMES = {'P': 'Probe', 'T': 'SCV', 'Z': 'Drone'}

In [3]:
# read the dataset dictionaries
con_dfs = zip_read('dataframes_continuous')
dis_dfs = zip_read('dataframes_discrete')
target_dfs = zip_read('dataframes_target')

In [4]:
# Dataset cleanup
for race in "PTZ":
    # Clean up some useless columns
    try:
        con_dfs[race] = con_dfs[race].drop(columns=['vespene_queued_economic','vespene_total_economic','vespene_value_current_economic'])
    except:
        pass
    # Clean up incorrectly labeled columns
    incorrect_labels = target_dfs[race][~target_dfs[race]['Target'].isin(rev_ACTIONS[race])].index.tolist()
    if (len(incorrect_labels) > 0):
        print(f"{race} dropping {len(incorrect_labels)} rows due to incorrect labelling")
        print(target_dfs[race]['Target'][incorrect_labels])
        con_dfs[race] = con_dfs[race].drop(incorrect_labels)
        dis_dfs[race] = dis_dfs[race].drop(incorrect_labels)
        target_dfs[race] = target_dfs[race].drop(incorrect_labels)
    # Move ['Timestamp','supply_available','supply_consumed'] fields from continuous dataset into discrete dataset
    discrete_features = ['Timestamp','supply_available','supply_consumed']
    dis_dfs[race] = pd.concat([dis_dfs[race],con_dfs[race][discrete_features]],axis=1)
    con_dfs[race] = con_dfs[race].drop(columns=discrete_features)

In [5]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [6]:
# Remap fine-grained action labels into broader action-class labels (as defined in dataset_config.py)
# 0b[UABETD] 6-bit encoding
# U: is_upgrade?
# A: is_army?
# B: is_building?
# E: for_economy?
# T: for_tech?
# D: for_defense? (static defenses only)
def map_to_action_type(Y_labels,race):
    return np.array([ACTIONS_TYPE[race][rev_ACTIONS[race][lbl]] for lbl in Y_labels])

In [7]:
# Helper function to independently (naively) mix prediction probabilities
# from multiple naive bayes classifiers
def mix_naive_bayes(modalities,n_samples,classes):
    probs = np.ones((n_samples,len(classes)))
    for clf, X_test in modalities:
        probs *= clf.predict_proba(X_test)
    result = np.array([classes[i] for i in np.argmax(probs,axis=1)])
    print(result)
    return result

In [8]:
race = 'P'

In [9]:
# Full dataset
X_con = con_dfs[race]
X_dis = dis_dfs[race]
Y = target_dfs[race]['Target'].to_numpy()
# Combining the two modalities
full_index = target_dfs[race].index.tolist()
# Shuffle into random order
random.seed(0)
random.shuffle(full_index)
# Cut into 80/20 split
cutoff = int(0.80*len(full_index))
train_index = full_index[:cutoff]
test_index = full_index[cutoff:]

X_train_con, X_train_dis = con_dfs[race].loc[train_index], dis_dfs[race].loc[train_index]
X_test_con, X_test_dis = con_dfs[race].loc[test_index], dis_dfs[race].loc[test_index]
Y_train = target_dfs[race].loc[train_index]
Y_test = target_dfs[race].loc[test_index]

# Transform target labels into numpy array
Y_train = Y_train['Target'].to_numpy()
Y_test = Y_test['Target'].to_numpy()

# Remap to coarse label classes
Y_types = map_to_action_type(Y,race)
print(Y_types,len(set(Y_types)))
Y_train_types = map_to_action_type(Y_train,race)
Y_test_types = map_to_action_type(Y_test,race)

[ 4  4 12 ... 16 16 16] 7


In [10]:
# Some prior occurance stats for each label type
lbl_count = defaultdict(int)
for i in Y_types:
    lbl_count[i] += 1
lbl_stats = sorted([(lbl_count[i]/len(Y_types),i) for i in lbl_count.keys()],reverse=True)
for ratio,lbl in lbl_stats:
    decode_type(lbl)
    print(f"\t{ratio:.3f}, {lbl_count[lbl]}")

000100
	Worker
	0.399, 40262
010000
	Army
	0.276, 27789
001100
	Economy
	Building
	0.163, 16399
011000
	Building
	Army
	0.063, 6316
001001
	Static Defense
	Building
	0.041, 4091
001010
	Technology
	Building
	0.035, 3513
100000
	Upgrade
	0.024, 2439


In [11]:
# Random Forest classifier (control?)
print("Random Forest Classification")
rand_forest_perf = [[],[]]
for i in range(2,3):
    print(f"max_depth: {i}")
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    clf.fit(X_train_dis,Y_train_types)
    coarse_score = clf.score(X_test_dis,Y_test_types)
    print("Coarse:",coarse_score)
    clf.fit(X_train_dis,Y_train)
    fine_score = clf.score(X_test_dis,Y_test)
    print("Fine:",fine_score)
    rand_forest_perf[0].append(coarse_score)
    rand_forest_perf[1].append(fine_score)
print("AdaBoost Classification")
ada_boost_perf = [[],[]]
for n in range(50,51,50):
    print(f"n_estimators: {n}")
    clf = AdaBoostClassifier(n_estimators=n, random_state=0)
    clf.fit(X_train_dis,Y_train_types)
    coarse_score = clf.score(X_test_dis,Y_test_types)
    print("Coarse:",coarse_score)
    clf.fit(X_train_dis,Y_train)
    fine_score = clf.score(X_test_dis,Y_test)
    print("Fine:",fine_score)
    ada_boost_perf[0].append(coarse_score)
    ada_boost_perf[1].append(fine_score)

Random Forest Classification
max_depth: 2
Coarse: 0.4981648645967662
Fine: 0.4072512647554806
AdaBoost Classification
n_estimators: 50
Coarse: 0.4453923221902589
Fine: 0.3976292034520385


In [12]:
print("Prediction using ONLY Continuous Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    print(name)
    clf.fit(X_train_con,Y_train_types)
    s = clf.score(X_test_con,Y_test_types)
    print(f"Coarse Predictions: {s}")
    clf.fit(X_train_con,Y_train)
    s = clf.score(X_test_con,Y_test)
    print(f"Fine Predictions: {s}")

Prediction using ONLY Continuous Features
Gaussian
Coarse Predictions: 0.44752504711834146
Fine Predictions: 0.06551929372086103
Multinomial
Coarse Predictions: 0.3750619978176768
Fine Predictions: 0.02341037595476639
Complement
Coarse Predictions: 0.4772344013490725
Fine Predictions: 0.3459478226366432


In [13]:
print("Prediction using ONLY Discrete Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    print(name)
    clf.fit(X_train_dis,Y_train_types)
    s = clf.score(X_test_dis,Y_test_types)
    print(f"Coarse Predictions: {s}")
    clf.fit(X_train_dis,Y_train)
    s = clf.score(X_test_dis,Y_test)
    print(f"Fine Predictions: {s}")

Prediction using ONLY Discrete Features
Gaussian
Coarse Predictions: 0.39921634758456503
Fine Predictions: 0.02187283007638131
Multinomial
Coarse Predictions: 0.471282610852098
Fine Predictions: 0.22120821347088582
Complement
Coarse Predictions: 0.4906755282214066
Fine Predictions: 0.41945243527427833


In [14]:
clf_con, clf_dis = GaussianNB(), ComplementNB()
clf_con.fit(X_train_con,Y_train_types)
clf_dis.fit(X_train_dis,Y_train_types)
label_classes = clf_con.classes_
print("Mixture",np.mean(mix_naive_bayes([(clf_con,X_test_con),(clf_dis,X_test_dis)],len(X_test_con),label_classes) == Y_test_types))

[ 4 16 16 ... 16  4  4]
Mixture 0.4772344013490725


In [15]:
race = 'T'

In [16]:
# Full dataset
X_con = con_dfs[race]
X_dis = dis_dfs[race]
Y = target_dfs[race]['Target'].to_numpy()
# Combining the two modalities
full_index = target_dfs[race].index.tolist()
# Shuffle into random order
random.seed(0)
random.shuffle(full_index)
# Cut into 80/20 split
cutoff = int(0.80*len(full_index))
train_index = full_index[:cutoff]
test_index = full_index[cutoff:]

X_train_con, X_train_dis = con_dfs[race].loc[train_index], dis_dfs[race].loc[train_index]
X_test_con, X_test_dis = con_dfs[race].loc[test_index], dis_dfs[race].loc[test_index]
Y_train = target_dfs[race].loc[train_index]
Y_test = target_dfs[race].loc[test_index]

# Transform target labels into numpy array
Y_train = Y_train['Target'].to_numpy()
Y_test = Y_test['Target'].to_numpy()

# Remap to coarse label classes
Y_types = map_to_action_type(Y,race)
print(Y_types,len(set(Y_types)))
Y_train_types = map_to_action_type(Y_train,race)
Y_test_types = map_to_action_type(Y_test,race)

[ 4  4 12 ...  4  4  4] 8


In [17]:
# Some prior occurance stats for each label type
lbl_count = defaultdict(int)
for i in Y_types:
    lbl_count[i] += 1
lbl_stats = sorted([(lbl_count[i]/len(Y_types),i) for i in lbl_count.keys()],reverse=True)
for ratio,lbl in lbl_stats:
    decode_type(lbl)
    print(f"\t{ratio:.3f}, {lbl_count[lbl]}")

010000
	Army
	0.369, 30351
000100
	Worker
	0.352, 28964
001100
	Economy
	Building
	0.157, 12887
011000
	Building
	Army
	0.042, 3490
011010
	Technology
	Building
	Army
	0.033, 2696
100000
	Upgrade
	0.019, 1569
001001
	Static Defense
	Building
	0.017, 1416
001010
	Technology
	Building
	0.012, 947


In [18]:
# Random Forest classifier (control?)
print("Random Forest Classification")
rand_forest_perf = [[],[]]
for i in range(2,3):
    print(f"max_depth: {i}")
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    clf.fit(X_train_dis,Y_train_types)
    coarse_score = clf.score(X_test_dis,Y_test_types)
    print("Coarse:",coarse_score)
    clf.fit(X_train_dis,Y_train)
    fine_score = clf.score(X_test_dis,Y_test)
    print("Fine:",fine_score)
    rand_forest_perf[0].append(coarse_score)
    rand_forest_perf[1].append(fine_score)
print("AdaBoost Classification")
ada_boost_perf = [[],[]]
for n in range(50,51,50):
    print(f"n_estimators: {n}")
    clf = AdaBoostClassifier(n_estimators=n, random_state=0)
    clf.fit(X_train_dis,Y_train_types)
    coarse_score = clf.score(X_test_dis,Y_test_types)
    print("Coarse:",coarse_score)
    clf.fit(X_train_dis,Y_train)
    fine_score = clf.score(X_test_dis,Y_test)
    print("Fine:",fine_score)
    ada_boost_perf[0].append(coarse_score)
    ada_boost_perf[1].append(fine_score)

Random Forest Classification
max_depth: 2
Coarse: 0.5078352769679301
Fine: 0.4253522837706511
AdaBoost Classification
n_estimators: 50
Coarse: 0.4579689018464529
Fine: 0.42128279883381925


In [19]:
print("Prediction using ONLY Continuous Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    print(name)
    clf.fit(X_train_con,Y_train_types)
    s = clf.score(X_test_con,Y_test_types)
    print(f"Coarse Predictions: {s}")
    clf.fit(X_train_con,Y_train)
    s = clf.score(X_test_con,Y_test)
    print(f"Fine Predictions: {s}")

Prediction using ONLY Continuous Features
Gaussian
Coarse Predictions: 0.34566326530612246
Fine Predictions: 0.0614067055393586
Multinomial
Coarse Predictions: 0.17280126336248786
Fine Predictions: 0.01888969873663751
Complement
Coarse Predictions: 0.48226433430515064
Fine Predictions: 0.3923712342079689


In [20]:
print("Prediction using ONLY Discrete Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    print(name)
    clf.fit(X_train_dis,Y_train_types)
    s = clf.score(X_test_dis,Y_test_types)
    print(f"Coarse Predictions: {s}")
    clf.fit(X_train_dis,Y_train)
    s = clf.score(X_test_dis,Y_test)
    print(f"Fine Predictions: {s}")

Prediction using ONLY Discrete Features
Gaussian
Coarse Predictions: 0.05369290573372206
Fine Predictions: 0.024963556851311953
Multinomial
Coarse Predictions: 0.47819484936831874
Fine Predictions: 0.1553692905733722
Complement
Coarse Predictions: 0.4959305150631681
Fine Predictions: 0.42346938775510207


In [21]:
clf_con, clf_dis = GaussianNB(), ComplementNB()
clf_con.fit(X_train_con,Y_train_types)
clf_dis.fit(X_train_dis,Y_train_types)
label_classes = clf_con.classes_
print("Mixture",np.mean(mix_naive_bayes([(clf_con,X_test_con),(clf_dis,X_test_dis)],len(X_test_con),label_classes) == Y_test_types))

[32  4 32 ... 32 32  4]
Mixture 0.42073615160349853


In [22]:
race = 'Z'

In [23]:
# Full dataset
X_con = con_dfs[race]
X_dis = dis_dfs[race]
Y = target_dfs[race]['Target'].to_numpy()
# Combining the two modalities
full_index = target_dfs[race].index.tolist()
# Shuffle into random order
random.seed(0)
random.shuffle(full_index)
# Cut into 80/20 split
cutoff = int(0.80*len(full_index))
train_index = full_index[:cutoff]
test_index = full_index[cutoff:]

X_train_con, X_train_dis = con_dfs[race].loc[train_index], dis_dfs[race].loc[train_index]
X_test_con, X_test_dis = con_dfs[race].loc[test_index], dis_dfs[race].loc[test_index]
Y_train = target_dfs[race].loc[train_index]
Y_test = target_dfs[race].loc[test_index]

# Transform target labels into numpy array
Y_train = Y_train['Target'].to_numpy()
Y_test = Y_test['Target'].to_numpy()

# Remap to coarse label classes
Y_types = map_to_action_type(Y,race)
print(Y_types,len(set(Y_types)))
Y_train_types = map_to_action_type(Y_train,race)
Y_test_types = map_to_action_type(Y_test,race)

[ 4 12  4 ... 16 16 16] 7


In [24]:
# Some prior occurance stats for each label type
lbl_count = defaultdict(int)
for i in Y_types:
    lbl_count[i] += 1
lbl_stats = sorted([(lbl_count[i]/len(Y_types),i) for i in lbl_count.keys()],reverse=True)
for ratio,lbl in lbl_stats:
    decode_type(lbl)
    print(f"\t{ratio:.3f}, {lbl_count[lbl]}")

010000
	Army
	0.334, 29438
000100
	Worker
	0.322, 28356
001100
	Economy
	Building
	0.198, 17472
001010
	Technology
	Building
	0.050, 4429
010100
	Economy
	Army
	0.045, 3956
100000
	Upgrade
	0.030, 2671
001001
	Static Defense
	Building
	0.021, 1834


In [25]:
# Random Forest classifier (control?)
print("Random Forest Classification")
rand_forest_perf = [[],[]]
for i in range(2,3):
    print(f"max_depth: {i}")
    clf = RandomForestClassifier(max_depth=i, random_state=0)
    clf.fit(X_train_dis,Y_train_types)
    coarse_score = clf.score(X_test_dis,Y_test_types)
    print("Coarse:",coarse_score)
    clf.fit(X_train_dis,Y_train)
    fine_score = clf.score(X_test_dis,Y_test)
    print("Fine:",fine_score)
    rand_forest_perf[0].append(coarse_score)
    rand_forest_perf[1].append(fine_score)
print("AdaBoost Classification")
ada_boost_perf = [[],[]]
for n in range(50,51,50):
    print(f"n_estimators: {n}")
    clf = AdaBoostClassifier(n_estimators=n, random_state=0)
    clf.fit(X_train_dis,Y_train_types)
    coarse_score = clf.score(X_test_dis,Y_test_types)
    print("Coarse:",coarse_score)
    clf.fit(X_train_dis,Y_train)
    fine_score = clf.score(X_test_dis,Y_test)
    print("Fine:",fine_score)
    ada_boost_perf[0].append(coarse_score)
    ada_boost_perf[1].append(fine_score)

Random Forest Classification
max_depth: 2
Coarse: 0.5022118874773139
Fine: 0.39462341197822143
AdaBoost Classification
n_estimators: 50
Coarse: 0.5183756805807622
Fine: 0.33325771324863884


In [26]:
print("Prediction using ONLY Continuous Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    print(name)
    clf.fit(X_train_con,Y_train_types)
    s = clf.score(X_test_con,Y_test_types)
    print(f"Coarse Predictions: {s}")
    clf.fit(X_train_con,Y_train)
    s = clf.score(X_test_con,Y_test)
    print(f"Fine Predictions: {s}")

Prediction using ONLY Continuous Features
Gaussian
Coarse Predictions: 0.3626361161524501
Fine Predictions: 0.04469147005444646
Multinomial
Coarse Predictions: 0.30473003629764067
Fine Predictions: 0.030399274047186932
Complement
Coarse Predictions: 0.48179446460980035
Fine Predictions: 0.3169804900181488


In [27]:
print("Prediction using ONLY Discrete Features")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    print(name)
    clf.fit(X_train_dis,Y_train_types)
    s = clf.score(X_test_dis,Y_test_types)
    print(f"Coarse Predictions: {s}")
    clf.fit(X_train_dis,Y_train)
    s = clf.score(X_test_dis,Y_test)
    print(f"Fine Predictions: {s}")

Prediction using ONLY Discrete Features
Gaussian
Coarse Predictions: 0.1847776769509982
Fine Predictions: 0.037942377495462795
Multinomial
Coarse Predictions: 0.42678085299455537
Fine Predictions: 0.16350952813067152
Complement
Coarse Predictions: 0.48060344827586204
Fine Predictions: 0.41129764065335755


In [28]:
clf_con, clf_dis = GaussianNB(), ComplementNB()
clf_con.fit(X_train_con,Y_train_types)
clf_dis.fit(X_train_dis,Y_train_types)
label_classes = clf_con.classes_
print("Mixture",np.mean(mix_naive_bayes([(clf_con,X_test_con),(clf_dis,X_test_dis)],len(X_test_con),label_classes) == Y_test_types))

[ 4  4  4 ... 32  4  4]
Mixture 0.47515880217785844
