In [1]:
import os
import sys
import random
import datetime
import itertools
import numpy as np
import pandas as pd
from zipUtil import zip_write, zip_read
from collections import defaultdict

In [2]:
from dataset_config import decode_type, PROTOSS_ACTIONS, PROTOSS_ACTIONS_TYPE, TERRAN_ACTIONS, ZERG_ACTIONS
ACTIONS = {'P': PROTOSS_ACTIONS, 'T': TERRAN_ACTIONS, 'Z': ZERG_ACTIONS}
rev_ACTIONS = {race: {v:k for k,v in ACTIONS[race].items()} for race in 'PTZ'}
GATHERER_NAMES = {'P': 'Probe', 'T': 'SCV', 'Z': 'Drone'}

In [3]:
# read the dataset dictionaries
con_dfs = zip_read('dataframes_continuous')
dis_dfs = zip_read('dataframes_discrete')
target_dfs = zip_read('dataframes_target')

In [4]:
# Clean up some useless columns
for race in "PTZ":
    con_dfs[race] = con_dfs[race].drop(columns=['vespene_queued_economic','vespene_total_economic','vespene_value_current_economic'])

In [99]:
# Remove worker label datapoints
trimmed_con_dfs = {}
trimmed_dis_dfs = {}
trimmed_target_dfs = {}
for race in "PTZ":
    worker_id = ACTIONS[race][GATHERER_NAMES[race]]
    ignore_rows = target_dfs[race].index[target_dfs[race]['Target'] == worker_id].tolist()
    trimmed_con_dfs[race] = con_dfs[race].drop(ignore_rows)
    trimmed_dis_dfs[race] = dis_dfs[race].drop(ignore_rows)
    trimmed_target_dfs[race] = target_dfs[race].drop(ignore_rows)

In [164]:
# Run classification for race \in {'P','T','Z'}
race = 'T'

In [166]:
# Target Dataset Occurance rates
target_priors = defaultdict(int)
num_entries = len(target_dfs[race])
for idx,row in target_dfs[race].iterrows():
    target_priors[row['Target']] += 1
sorted_actions = []
for k, v in target_priors.items():
    sorted_actions.append((v/num_entries,v,rev_ACTIONS[race][k]))
sorted_actions = sorted(sorted_actions,reverse=True)
print(len(sorted_actions))
for ratio, num, action in sorted_actions:
    print(f"{action}\t{ratio:.5f}\t {num}")
    if (ratio < 0.001):
        break

61
SCV	0.35185	 28964
Marine	0.20933	 17232
SupplyDepot	0.08277	 6814
Marauder	0.03071	 2528
RefineryRich	0.02960	 2437
SiegeTank	0.02884	 2374
CommandCenter	0.02291	 1886
Medivac	0.02253	 1855
Barracks	0.02148	 1768
OrbitalCommand	0.02126	 1750
Hellion	0.02072	 1706
WidowMine	0.01379	 1135
Factory	0.01303	 1073
Reaper	0.01282	 1055
VikingFighter	0.01231	 1013
BarracksReactor	0.01165	 959
MissileTurret	0.00839	 691
Starport	0.00788	 649
EngineeringBay	0.00702	 578
FactoryTechLab	0.00669	 551
BarracksTechLab	0.00637	 524
Bunker	0.00572	 471
Cyclone	0.00496	 408
TerranInfantryWeaponsLevel1	0.00373	 307
Stimpack	0.00355	 292
Raven	0.00340	 280
StarportReactor	0.00333	 274
Armory	0.00327	 269
Combat Shield	0.00301	 248
Liberator	0.00295	 243
Ghost	0.00283	 233
FactoryReactor	0.00237	 195
StarportTechLab	0.00234	 193
TerranInfantryArmorsLevel1	0.00194	 160
SensorTower	0.00192	 158
Banshee	0.00172	 142
Concussive Shells	0.00138	 114
TerranInfantryWeaponsLevel2	0.00123	 101
PlanetaryFortress	

In [19]:
con_dfs[race]

Unnamed: 0,Timestamp,mineral_collection_rate,mineral_per_worker_rate,mineral_queued_army,mineral_queued_economic,mineral_queued_technology,mineral_spend,mineral_total_army,mineral_total_economic,mineral_total_technology,...,vespene_queued_technology,vespene_spend,vespene_total_army,vespene_total_economic,vespene_total_technology,vespene_value_current_army,vespene_value_current_economic,vespene_value_current_technology,worker_supply_ratio,workers_active
2,2,671.0,51.615385,0.0,50.0,0.0,1050.0,0.0,1100.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.928571,13.0
8,9,923.0,51.277778,0.0,200.0,150.0,1400.0,0.0,1600.0,150.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.947368,18.0
10,11,867.0,45.631579,0.0,50.0,300.0,1600.0,0.0,1650.0,300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.950000,19.0
12,13,755.0,35.952381,0.0,50.0,300.0,1850.0,0.0,1750.0,450.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.954545,21.0
14,16,923.0,40.130435,0.0,100.0,150.0,2100.0,0.0,1900.0,450.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100804,85,2071.0,42.265306,25.0,450.0,0.0,9175.0,2200.0,5150.0,2300.0,...,0.0,1300.0,1025.0,0.0,350.0,950.0,0.0,350.0,0.556818,49.0
100805,88,2071.0,38.351852,0.0,400.0,0.0,9800.0,2200.0,5700.0,2300.0,...,0.0,1375.0,1025.0,0.0,350.0,1025.0,0.0,350.0,0.574468,54.0
100806,89,2211.0,40.200000,0.0,400.0,300.0,9850.0,2200.0,5750.0,2600.0,...,0.0,1375.0,1025.0,0.0,350.0,1025.0,0.0,350.0,0.578947,55.0
100807,90,2267.0,39.771930,0.0,125.0,300.0,10125.0,2200.0,5750.0,2600.0,...,0.0,1375.0,1025.0,0.0,350.0,1025.0,0.0,350.0,0.600000,57.0


In [38]:
dis_dfs[race]

Unnamed: 0,P,T,Z,Adept,Anion Pulse-Crystals,Archon,Assimilator,AssimilatorRich,Blink,Carrier,...,ShieldBattery,Stalker,Stargate,Tempest,TemplarArchive,TwilightCouncil,VoidRay,Warp Gate,WarpPrism,Zealot
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,1,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,1,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,1,0,0,0,0,0,2,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100804,0,0,1,2,0,6,4,0,0,0,...,1,1,0,0,0,1,0,1,1,40
100805,0,0,1,2,0,8,4,0,0,0,...,1,1,0,0,0,1,0,1,1,43
100806,0,0,1,2,0,8,4,0,0,0,...,1,1,0,0,0,1,0,1,1,44
100807,0,0,1,2,0,9,4,0,0,0,...,1,1,0,0,0,1,0,1,1,45


In [168]:
from sklearn.feature_selection import SelectKBest, chi2
print("===== CONTINUOUS =====")
X, y = con_dfs[race], target_dfs[race]['Target'].values.tolist()
X_new = SelectKBest().fit(X,y)
feature_names = list(con_dfs[race])
feature_scores = sorted([(feature_names[i], X_new.scores_[i]) for i in range(len(feature_names))],key=lambda x:x[1],reverse=True)
for name, score in feature_scores:
    print(f"{name}: {score:.3f}")
# print("===== DISCRETE =====")
# X, y = dis_dfs['P'], target_dfs['P']['Target'].values.tolist()
# X_new = SelectKBest().fit(X,y)
# feature_names = list(X)
# feature_scores = sorted([(feature_names[i], X_new.scores_[i]) for i in range(len(feature_names))],key=lambda x:x[1],reverse=True)
# for name, score in feature_scores:
#     print(f"{name}: {score:.3f}")

===== CONTINUOUS =====
Timestamp: 586.358
mineral_value_current_technology: 550.929
mineral_total_technology: 548.076
supply_available: 541.990
mineral_spend: 541.893
mineral_total_economic: 528.978
mineral_value_current_economic: 520.374
worker_supply_ratio: 509.133
supply_consumed: 504.091
vespene_spend: 481.874
mineral_value_current_army: 481.724
workers_active: 473.471
mineral_total_army: 468.525
vespene_value_current_army: 466.016
vespene_total_army: 447.461
vespene_value_current_technology: 443.077
mineral_collection_rate: 439.701
vespene_total_technology: 405.485
vespene_collection_rate: 255.925
minerals_available: 228.791
vespene_available: 141.111
supply_utilization: 96.324
vespene_queued_army: 60.151
mineral_queued_technology: 57.260
mineral_queued_army: 40.459
mineral_queued_economic: 39.466
vespene_queued_technology: 39.151
mineral_per_worker_rate: 34.732


In [170]:
print(list(con_dfs[race]))
con_feature_subset = [
    'Timestamp',
    'mineral_spend',
    'mineral_value_current_army',
    'mineral_value_current_economic',
    'mineral_value_current_technology',
    'supply_available',
    'supply_consumed',
    'vespene_collection_rate',
    'vespene_value_current_army',
    'vespene_value_current_technology'
]

['Timestamp', 'mineral_collection_rate', 'mineral_per_worker_rate', 'mineral_queued_army', 'mineral_queued_economic', 'mineral_queued_technology', 'mineral_spend', 'mineral_total_army', 'mineral_total_economic', 'mineral_total_technology', 'mineral_value_current_army', 'mineral_value_current_economic', 'mineral_value_current_technology', 'minerals_available', 'supply_available', 'supply_consumed', 'supply_utilization', 'vespene_available', 'vespene_collection_rate', 'vespene_queued_army', 'vespene_queued_technology', 'vespene_spend', 'vespene_total_army', 'vespene_total_technology', 'vespene_value_current_army', 'vespene_value_current_technology', 'worker_supply_ratio', 'workers_active']


In [171]:
X = pd.concat([trimmed_con_dfs[race][con_feature_subset], trimmed_dis_dfs[race]],axis=1)
Y = trimmed_target_dfs[race]['Target'].values.tolist()
X_full = pd.concat([con_dfs[race][con_feature_subset],dis_dfs[race]],axis=1)
Y_full = target_dfs[race]['Target'].values.tolist()

In [174]:
X

Unnamed: 0,Timestamp,mineral_spend,mineral_value_current_army,mineral_value_current_economic,mineral_value_current_technology,supply_available,supply_consumed,vespene_collection_rate,vespene_value_current_army,vespene_value_current_technology,...,TerranShipWeaponsLevel2,TerranVehicleAndShipArmorsLevel1,TerranVehicleAndShipArmorsLevel2,TerranVehicleWeaponsLevel1,TerranVehicleWeaponsLevel2,TerranVehicleWeaponsLevel3,Thor,VikingFighter,Weapon Refit,WidowMine
2,2,1050.0,0.0,1050.0,0.0,15.0,14.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,6,1250.0,0.0,1250.0,0.0,23.0,16.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,12,1800.0,0.0,1650.0,150.0,23.0,21.0,246.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10,13,1850.0,0.0,1700.0,150.0,23.0,22.0,291.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
11,14,1850.0,0.0,1700.0,150.0,23.0,22.0,313.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82307,37,4175.0,525.0,2650.0,1000.0,54.0,37.0,313.0,300.0,250.0,...,0,0,0,0,0,0,0,1,0,0
82310,40,4250.0,900.0,2350.0,1000.0,47.0,44.0,313.0,450.0,250.0,...,0,0,0,0,0,0,0,1,0,0
82312,42,4300.0,900.0,2400.0,1000.0,47.0,45.0,335.0,450.0,250.0,...,0,0,0,0,0,0,0,1,0,0
82313,43,4400.0,900.0,2500.0,1000.0,55.0,45.0,335.0,450.0,250.0,...,0,0,0,0,0,0,0,1,0,0


In [93]:
# Normalizing continuous data
from sklearn import preprocessing
x = con_dfs['P'].values
min_max_scaler = preprocessing.MinMaxScaler()
x_norm = min_max_scaler.fit_transform(x)
con_dfs_P_norm = pd.DataFrame(x_norm)
X_norm = pd.concat([con_dfs_P_norm, dis_dfs['P']],axis=1)

In [6]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB

In [175]:
print("Trimmed dataset (no Workers):")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    clf.fit(trimmed_con_dfs[race][:-5000],Y[:-5000])
    s = clf.score(trimmed_con_dfs[race][-5000:],Y[-5000:])
    print(f"{name}: {s}")
print("Full dataset (with Workers):")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    clf.fit(con_dfs[race][:-5000],Y_full[:-5000])
    s = clf.score(con_dfs[race][-5000:],Y_full[-5000:])
    print(f"{name}: {s}")

Trimmed dataset (no Workers):
Gaussian: 0.047
Multinomial: 0.0274
Complement: 0.3176
Full dataset (with Workers):
Gaussian: 0.055
Multinomial: 0.0192
Complement: 0.4118


In [176]:
print("Subset on CON_DF")
print("Trimmed dataset (no Workers):")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    clf.fit(trimmed_con_dfs[race][con_feature_subset][:-5000],Y[:-5000])
    s = clf.score(trimmed_con_dfs[race][con_feature_subset][-5000:],Y[-5000:])
    print(f"{name}: {s}")
print("Full dataset (with Workers):")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    clf.fit(con_dfs[race][con_feature_subset][:-5000],Y_full[:-5000])
    s = clf.score(con_dfs[race][con_feature_subset][-5000:],Y_full[-5000:])
    print(f"{name}: {s}")

Subset on CON_DF
Trimmed dataset (no Workers):
Gaussian: 0.0446
Multinomial: 0.0366
Complement: 0.3164
Full dataset (with Workers):
Gaussian: 0.1088
Multinomial: 0.0228
Complement: 0.412


In [177]:
print("Prediction using ONLY Discrete Features")
print("Trimmed dataset (no Workers):")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    clf.fit(trimmed_dis_dfs[race][:-5000],Y[:-5000])
    s = clf.score(trimmed_dis_dfs[race][-5000:],Y[-5000:])
    print(f"{name}: {s}")
print("Full dataset (with Workers):")
for clf, name in zip([GaussianNB(),MultinomialNB(),ComplementNB()], ['Gaussian', 'Multinomial', 'Complement']):
    clf.fit(dis_dfs[race][:-5000],Y_full[:-5000])
    s = clf.score(dis_dfs[race][-5000:],Y_full[-5000:])
    print(f"{name}: {s}")

Prediction using ONLY Discrete Features
Trimmed dataset (no Workers):
Gaussian: 0.0494
Multinomial: 0.2074
Complement: 0.3218
Full dataset (with Workers):
Gaussian: 0.0266
Multinomial: 0.1902
Complement: 0.4428


In [178]:
# Combining the two modalities
full_index = target_dfs[race].index.tolist()
# Shuffle into random order
random.seed(0)
random.shuffle(full_index)
# Cut into 80/20 split
cutoff = int(0.80*len(full_index))
train_index = full_index[:cutoff]
test_index = full_index[cutoff:]

X_train_con, X_train_dis = con_dfs[race].loc[train_index], dis_dfs[race].loc[train_index]
X_test_con, X_test_dis = con_dfs[race].loc[test_index], dis_dfs[race].loc[test_index]
Y_train = target_dfs[race].loc[train_index]
Y_test = target_dfs[race].loc[test_index]

In [203]:
clf_con, clf_dis = ComplementNB(), ComplementNB()
clf_con.fit(X_train_con,Y_train['Target'].values.tolist())
clf_dis.fit(X_train_dis,Y_train['Target'].values.tolist())
# np.set_printoptions(suppress=True)
# print(clf_con.class_prior_)

ComplementNB()

In [188]:
def mix_naive_bayes(modalities,n_samples,classes):
    probs = np.ones((n_samples,len(classes)))
    for clf, X_test in modalities:
        probs *= clf.predict_proba(X_test)
    result = np.array([classes[i] for i in np.argmax(probs,axis=1)])
    print(result)
    return result

In [189]:
label_classes = clf_con.classes_
print(label_classes)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 25 26 27 28 29 30 31 32 33 34 35 36 37 39 40 41 42 43 44 45 46 47 48 49
 50 51 52 53 54 56 57 58 59 61 62 63 64]


In [200]:
# print("Continuous:",clf_con.score(X_test_con,Y_test),np.mean(clf_con.classes_[np.argmax(clf_con._joint_log_likelihood(X_test_con),axis=1)] == Y_test.to_numpy()))
# print("Discrete:",clf_dis.score(X_test_dis,Y_test),np.mean(np.array([label_classes[i] for i in np.argmax(clf_dis.predict_proba(X_test_dis),axis=1)]) == Y_test.to_numpy()))
print(clf_con.score(X_test_con,Y_test),np.mean(clf_con.predict(X_test_con) == Y_test['Target'].to_numpy()))

0.061892614188532556 0.061892614188532556


In [204]:
print("Mixture",np.mean(mix_naive_bayes([(clf_con,X_test_con),(clf_dis,X_test_dis)],len(X_test_con),label_classes) == Y_test['Target'].to_numpy()))
print("Continous",np.mean(mix_naive_bayes([(clf_con,X_test_con)],len(X_test_con),label_classes) == Y_test['Target'].to_numpy()))
print("Discrete",np.mean(mix_naive_bayes([(clf_dis,X_test_dis)],len(X_test_dis),label_classes) == Y_test['Target'].to_numpy()))

[39 39 39 ... 29 39 29]
Mixture 0.3924319727891156
[39 39 39 ... 29 39 29]
Continous 0.3923712342079689
[39 39 39 ... 39 39 39]
Discrete 0.4265063168124393


In [25]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

In [80]:
kernel = 1.0 * RBF(5.0)
train = sorted(random.sample(X_full.index.tolist(),300))
test = sorted(random.sample(X_full.index.tolist(),20))
gpc = GaussianProcessClassifier(kernel=kernel).fit(X_full.loc[train],[Y_full[i] for i in train])
gpc.score(X_full.loc[test],[Y_full[i] for i in test])



0.3