In [1]:
import os
import sys
import random
import datetime
import itertools
import numpy as np
import pandas as pd
from zipUtil import zip_write, zip_read
from collections import defaultdict

In [2]:
compressed_data = 'timeseries_data_fixed2.pbz2'
timeseries_data = zip_read(compressed_data)
compressed_bo_data = 'buildOrder_data.pbz2'
bo_data = zip_read(compressed_bo_data)

In [3]:
sample_key = next(iter(bo_data))
sample_bo = bo_data[sample_key]
sample_timeseries = timeseries_data[sample_key]
print(sample_key)

1597403940_2305463


In [23]:
sample_bo.keys()

dict_keys(['race', 'matchup', 'data'])

In [5]:
print(sample_timeseries['data'].columns)

Index(['mineral_collection_rate', 'mineral_per_worker_rate',
       'mineral_queued_army', 'mineral_queued_economic',
       'mineral_queued_technology', 'mineral_spend', 'mineral_total_army',
       'mineral_total_economic', 'mineral_total_technology',
       'mineral_value_current_army', 'mineral_value_current_economic',
       'mineral_value_current_technology', 'minerals_available',
       'supply_available', 'supply_consumed', 'supply_utilization',
       'vespene_available', 'vespene_collection_rate', 'vespene_queued_army',
       'vespene_queued_economic', 'vespene_queued_technology', 'vespene_spend',
       'vespene_total_army', 'vespene_total_economic',
       'vespene_total_technology', 'vespene_value_current_army',
       'vespene_value_current_economic', 'vespene_value_current_technology',
       'worker_supply_ratio', 'workers_active'],
      dtype='object')


In [6]:
sample_timeseries['data'][["mineral_queued_army",'minerals_available',"vespene_queued_army",'vespene_available']][:20]

Unnamed: 0,mineral_queued_army,minerals_available,vespene_queued_army,vespene_available
0,0,50,0,0
1,0,35,0,0
2,0,70,0,0
3,0,45,0,0
4,0,45,0,0
5,0,140,0,0
6,0,25,0,0
7,0,10,0,0
8,0,40,0,0
9,0,105,0,0


In [7]:
print(sample_timeseries['data'].iloc[0])
print([(k,v) for k,v in sample_timeseries['data'].iloc[0].to_dict().items()])

mineral_collection_rate                0.0
mineral_per_worker_rate                0.0
mineral_queued_army                    0.0
mineral_queued_economic                0.0
mineral_queued_technology              0.0
mineral_spend                       1000.0
mineral_total_army                     0.0
mineral_total_economic              1000.0
mineral_total_technology               0.0
mineral_value_current_army             0.0
mineral_value_current_economic      1000.0
mineral_value_current_technology       0.0
minerals_available                    50.0
supply_available                      15.0
supply_consumed                       12.0
supply_utilization                     0.8
vespene_available                      0.0
vespene_collection_rate                0.0
vespene_queued_army                    0.0
vespene_queued_economic                0.0
vespene_queued_technology              0.0
vespene_spend                          0.0
vespene_total_army                     0.0
vespene_tot

In [8]:
# Pre-process Grab all the valid structure/action keys
# P_actions = set()
# T_actions = set()
# Z_actions = set()
# actions_set = {'P':P_actions,'T':T_actions,'Z':Z_actions}
# for _, data in bo_data.items():
#     for action in data['data']:
#         actions_set[data['race']].add(action['name'])

In [9]:
# Permissible set of actions for each race
# these are the classification target labels
from dataset_config import PROTOSS_ACTIONS, TERRAN_ACTIONS, ZERG_ACTIONS
ACTIONS = {'P': PROTOSS_ACTIONS, 'T': TERRAN_ACTIONS, 'Z': ZERG_ACTIONS}
CON_HEADERS = {race: ['Timestamp']+list(sample_timeseries['data']) for race in "PTZ"}
DIS_HEADERS = {race: ['P','T','Z']+list(ACTIONS[race].keys()) for race in "PTZ"}
STARTING_WORKERS = 12
GATHERER_NAMES = {'P': 'Probe', 'T': 'SCV', 'Z': 'Drone'}

In [10]:
# Iterate over events in bo_data
# 1) Accumulate discrete info on structures built
# 2) Extract nearest tick info from timeseries

'''
Structure:
# Continuous
|   Timestamp  |Timeseries|
|Sec_from_Start|f1,f2,f3..|
# Discrete
|Match|    Discrete    |
|P|T|Z|Structures+Units|
'''

# Helper function to add entries from one dictionary into another
def dictionary_add_to(to_dict, from_dict):
    for k in to_dict.keys():
        if (k not in from_dict):
            raise ValueError(f'{k} not in dictionary to be merged from')
        to_dict[k] += from_dict[k]
    
# We can extract multiple rows of [features]|[label] from a single game
def extract_nearest(key, data_dicts):
    race = bo_data[key]['race']
    matchup_other = [p for p in bo_data[key]['matchup'].split('v') if p != race]
    opponent = matchup_other[0] if len(matchup_other) else race
    bo = bo_data[key]['data']
    timeseries = timeseries_data[key]['data']
    
    # Grab global constants
    cur_con_headers = CON_HEADERS[race]
    cur_dis_headers = DIS_HEADERS[race]
    cur_actions = ACTIONS[race]
    
    # Data lists to hold stuff
    con_data = {k: [] for k in cur_con_headers}
    dis_data = {k: [] for k in cur_dis_headers}
    target_data = {'Target': []}
    
    # Accumulate discrete actions
    taken_actions = {k: 0 if k != GATHERER_NAMES[race] else STARTING_WORKERS for k in cur_actions.keys()}
    
    # Iterate through each action in bo
    last_slice = -1
    for action in bo:
        frame = action['frame']
        name = action['name']
        if (name not in taken_actions):
            continue
        # Check which slice of timeseries to take
        current_slice = int(frame//160) # SLICE SIZE = 16f/s*10s [CONSTANT!]
        if (current_slice == last_slice): # Ignore subsequent actions [predict first only]
            # Accumulate to actions taken
            taken_actions[name] += 1
            continue
        last_slice = current_slice
        # Record one datapoint row in con/dis df
        con_data['Timestamp'].append(current_slice)
        for k, v in sample_timeseries['data'].iloc[current_slice].to_dict().items():
            con_data[k].append(v)
        for p in "PTZ":
            if (p == opponent):
                dis_data[p].append(1)
            else:
                dis_data[p].append(0)
        for k, v in taken_actions.items():
            dis_data[k].append(v)
        label = cur_actions[name]
        target_data['Target'].append(label)
        # Accumulate to actions taken
        taken_actions[name] += 1
    
    # Sanity check our data before merging
    num_datapoints = len(target_data['Target'])
    if (not all(map(lambda k: len(con_data[k]) == num_datapoints,con_data))):
        print(con_data)
        raise ValueError(f"[Extraction: {key}] Dataframe dimensions mismatch in CON_DATA")
    if (not all(map(lambda k: len(dis_data[k]) == num_datapoints,dis_data))):
        print(dis_data)
        raise ValueError(f"[Extraction: {key}] Dataframe dimensions mismatch in DIS_DATA")
    if (not all(map(lambda k: k in con_data,data_dicts[race][0]))):
        print(con_data.keys())
        raise ValueError(f"[Extraction: {key}] Datafrome CON_DATA missing keys")
    if (not all(map(lambda k: k in dis_data,data_dicts[race][1]))):
        print(dis_data.keys())
        raise ValueError(f"[Extraction: {key}] Datafrome DIS_DATA missing keys")
    if (not all(map(lambda k: k in target_data,data_dicts[race][2]))):
        print(target_data.keys())
        raise ValueError(f"[Extraction: {key}] Datafrome TARGET_DATA missing keys")
    
    # Merge with bigger data
    dictionary_add_to(data_dicts[race][0],con_data)
    dictionary_add_to(data_dicts[race][1],dis_data)
    dictionary_add_to(data_dicts[race][2],target_data)
    return race, con_data, dis_data, target_data

In [11]:
# [Continuous, Discrete, Label] dictionaries
df_data = {race: [{k:[] for k in CON_HEADERS[race]},{k:[] for k in DIS_HEADERS[race]},{'Target':[]}] for race in "PTZ"}
race_count = defaultdict(int)
print(len(bo_data.keys()))
# for key in random.sample(list(bo_data.keys()), 100):
#     print(key,bo_data[key]['matchup'])
for key in bo_data.keys():
    sample_extracted = extract_nearest(key, df_data)
    race_count[sample_extracted[0]] += 1
print(race_count)

3958
defaultdict(<class 'int'>, {'P': 1549, 'T': 1135, 'Z': 1274})


In [12]:
def debug_extraction(datapoints):
    print(datapoints[0])
    con_headers = ['Timestamp','minerals_available','vespene_available']
    dis_headers = ['P','T','Z']
    output = {**{k: datapoints[1][k] for k in con_headers}, **{k: datapoints[2][k] for k in dis_headers}, **datapoints[3]}
    print(output)
    return pd.DataFrame(output, columns = con_headers+dis_headers+['Target'])

In [13]:
debug_extraction(sample_extracted)

Z
{'Timestamp': [0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 21, 22, 23, 24, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 68], 'minerals_available': [50.0, 35.0, 70.0, 45.0, 25.0, 10.0, 40.0, 105.0, 10.0, 65.0, 20.0, 50.0, 100.0, 100.0, 95.0, 200.0, 310.0, 270.0, 370.0, 145.0, 260.0, 160.0, 250.0, 155.0, 270.0, 295.0, 370.0, 95.0, 130.0, 245.0, 385.0, 130.0, 130.0, 250.0, 340.0, 465.0, 560.0, 240.0, 70.0, 205.0, 290.0, 390.0, 35.0, 120.0, 105.0, 70.0, 155.0, 250.0, 305.0, 200.0, 245.0, 170.0, 170.0, 115.0, 375.0, 495.0, 385.0, 380.0, 465.0, 740.0], 'vespene_available': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 16.0, 52.0, 88.0, 168.0, 94.0, 30.0, 100.0, 140.0, 26.0, 62.0, 142.0, 182.0, 122.0, 194.0, 234.0, 274.0, 310.0, 350.0, 136.0, 176.0, 216.0, 256.0, 146.0, 178.0, 218.0, 258.0, 298.0, 334.0, 374.0, 110.0, 150.0, 190.0, 230.0, 270.0, 306.0, 346.0, 382.0, 422

Unnamed: 0,Timestamp,minerals_available,vespene_available,P,T,Z,Target
0,0,50.0,0.0,0,0,1,9
1,1,35.0,0.0,0,0,1,31
2,2,70.0,0.0,0,0,1,9
3,4,45.0,0.0,0,0,1,9
4,6,25.0,0.0,0,0,1,17
5,7,10.0,0.0,0,0,1,9
6,8,40.0,0.0,0,0,1,9
7,9,105.0,0.0,0,0,1,13
8,10,10.0,4.0,0,0,1,39
9,11,65.0,16.0,0,0,1,9


In [14]:
con_dfs = {race: pd.DataFrame(df_data[race][0],columns=CON_HEADERS[race]) for race in "PTZ"}
dis_dfs = {race: pd.DataFrame(df_data[race][1],columns=DIS_HEADERS[race]) for race in "PTZ"}
target_dfs = {race: pd.DataFrame(df_data[race][2],columns=['Target']) for race in "PTZ"}

In [15]:
# pickle and save the dictionaries
zip_write('dataframes_continuous',con_dfs)
zip_write('dataframes_discrete',dis_dfs)
zip_write('dataframes_target',target_dfs)

In [16]:
con_dfs['P']

Unnamed: 0,Timestamp,mineral_collection_rate,mineral_per_worker_rate,mineral_queued_army,mineral_queued_economic,mineral_queued_technology,mineral_spend,mineral_total_army,mineral_total_economic,mineral_total_technology,...,vespene_queued_technology,vespene_spend,vespene_total_army,vespene_total_economic,vespene_total_technology,vespene_value_current_army,vespene_value_current_economic,vespene_value_current_technology,worker_supply_ratio,workers_active
0,0,0.0,0.000000,0.0,0.0,0.0,1000.0,0.0,1000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,12.0
1,1,293.0,24.416667,0.0,50.0,0.0,1000.0,0.0,1050.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.923077,12.0
2,2,671.0,51.615385,0.0,50.0,0.0,1050.0,0.0,1100.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.928571,13.0
3,3,671.0,51.615385,0.0,150.0,0.0,1050.0,0.0,1200.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.928571,13.0
4,5,783.0,55.928571,0.0,50.0,0.0,1200.0,0.0,1250.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.933333,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100804,85,2071.0,42.265306,25.0,450.0,0.0,9175.0,2200.0,5150.0,2300.0,...,0.0,1300.0,1025.0,0.0,350.0,950.0,0.0,350.0,0.556818,49.0
100805,88,2071.0,38.351852,0.0,400.0,0.0,9800.0,2200.0,5700.0,2300.0,...,0.0,1375.0,1025.0,0.0,350.0,1025.0,0.0,350.0,0.574468,54.0
100806,89,2211.0,40.200000,0.0,400.0,300.0,9850.0,2200.0,5750.0,2600.0,...,0.0,1375.0,1025.0,0.0,350.0,1025.0,0.0,350.0,0.578947,55.0
100807,90,2267.0,39.771930,0.0,125.0,300.0,10125.0,2200.0,5750.0,2600.0,...,0.0,1375.0,1025.0,0.0,350.0,1025.0,0.0,350.0,0.600000,57.0


In [17]:
con_dfs['T']

Unnamed: 0,Timestamp,mineral_collection_rate,mineral_per_worker_rate,mineral_queued_army,mineral_queued_economic,mineral_queued_technology,mineral_spend,mineral_total_army,mineral_total_economic,mineral_total_technology,...,vespene_queued_technology,vespene_spend,vespene_total_army,vespene_total_economic,vespene_total_technology,vespene_value_current_army,vespene_value_current_economic,vespene_value_current_technology,worker_supply_ratio,workers_active
0,0,0.0,0.000000,0.0,0.0,0.0,1000.0,0.0,1000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,12.0
1,1,293.0,24.416667,0.0,50.0,0.0,1000.0,0.0,1050.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.923077,12.0
2,2,671.0,51.615385,0.0,50.0,0.0,1050.0,0.0,1100.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.928571,13.0
3,3,671.0,51.615385,0.0,150.0,0.0,1050.0,0.0,1200.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.928571,13.0
4,5,783.0,55.928571,0.0,50.0,0.0,1200.0,0.0,1250.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.933333,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82315,45,1119.0,36.096774,0.0,450.0,0.0,4500.0,900.0,3050.0,1000.0,...,0.0,700.0,450.0,0.0,250.0,450.0,0.0,250.0,0.659574,31.0
82316,46,1119.0,36.096774,0.0,550.0,0.0,4650.0,1050.0,3150.0,1000.0,...,0.0,1000.0,750.0,0.0,250.0,750.0,0.0,250.0,0.584906,31.0
82317,47,1091.0,34.093750,0.0,550.0,0.0,4700.0,1050.0,3200.0,1000.0,...,0.0,1000.0,750.0,0.0,250.0,750.0,0.0,250.0,0.592593,32.0
82318,48,1119.0,34.968750,0.0,550.0,0.0,4700.0,1050.0,3200.0,1000.0,...,0.0,1000.0,750.0,0.0,250.0,750.0,0.0,250.0,0.592593,32.0


In [18]:
con_dfs['Z']

Unnamed: 0,Timestamp,mineral_collection_rate,mineral_per_worker_rate,mineral_queued_army,mineral_queued_economic,mineral_queued_technology,mineral_spend,mineral_total_army,mineral_total_economic,mineral_total_technology,...,vespene_queued_technology,vespene_spend,vespene_total_army,vespene_total_economic,vespene_total_technology,vespene_value_current_army,vespene_value_current_economic,vespene_value_current_technology,worker_supply_ratio,workers_active
0,0,0.0,0.000000,0.0,0.0,0.0,1000.0,0.0,1000.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,12.0
1,1,293.0,24.416667,0.0,50.0,0.0,1000.0,0.0,1050.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.923077,12.0
2,2,671.0,51.615385,0.0,50.0,0.0,1050.0,0.0,1100.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.928571,13.0
3,4,755.0,53.928571,0.0,150.0,0.0,1100.0,0.0,1250.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.933333,14.0
4,6,783.0,52.200000,0.0,50.0,150.0,1250.0,0.0,1300.0,150.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.937500,15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88151,63,2267.0,48.234043,0.0,0.0,300.0,7175.0,1425.0,4400.0,1650.0,...,0.0,1150.0,900.0,0.0,250.0,900.0,0.0,250.0,0.635135,47.0
88152,64,2211.0,47.042553,0.0,0.0,450.0,7550.0,1800.0,4400.0,1800.0,...,0.0,1300.0,1050.0,0.0,250.0,1050.0,0.0,250.0,0.587500,47.0
88153,65,2099.0,44.659574,0.0,0.0,700.0,7550.0,1800.0,4400.0,2050.0,...,0.0,1300.0,1050.0,0.0,250.0,1050.0,0.0,250.0,0.587500,47.0
88154,67,2015.0,42.872340,375.0,0.0,700.0,7425.0,2050.0,4400.0,2050.0,...,0.0,1250.0,1150.0,0.0,250.0,1000.0,0.0,250.0,0.559524,47.0
