In [23]:
import numpy as np
import scipy.sparse as sp
import tensorflow as tf
import time
from multiprocessing import Process
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from spektral.data import Dataset, DisjointLoader, Graph
from spektral.layers import GCSConv, GlobalAvgPool
from spektral.transforms.normalize_adj import NormalizeAdj

In [22]:
print("Number of CPU:", multiprocessing.cpu_count())

Number of CPU: 40


In [2]:
import pandas as pd
adj_mat = pd.read_pickle('adj_mat.pkl')

In [3]:
adj_mat[('202109090097', 6.0)].shape

(23, 23)

In [4]:
edge_feat = pd.read_pickle('edge_features.pkl')

In [5]:
edge_feat[('202109090097', 6.0)].shape

(23, 23)

In [6]:
edge_feat = adj_mat * edge_feat

In [7]:
y = pd.read_pickle('y.pkl')

In [8]:
y = pd.Series(y['pff_passCoverage'])

In [9]:
y = y.groupby(['uniqueplayId', 'frameId']).first()

In [10]:
y[('202109090097', 6.0)]

'Cover-1'

In [11]:
node_feat = pd.read_pickle('node_features.pkl')

In [12]:
node_feat = node_feat.reset_index()

In [13]:
node_feat['uniqueplayId'] = node_feat['uniqueplayId'].astype(int)
node_feat = node_feat.set_index(['uniqueplayId','frameId','nflId'])

In [14]:
node_feat.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,quarter,down,yardsToGo,defensiveTeam,yardlineNumber,absoluteYardlineNumber,s,o,new_x,new_y,Defense,score_d,frames_after_snap
uniqueplayId,frameId,nflId,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
202109090097,6.0,25511.0,1,3,2,DAL,33,43.0,0.35,215.58,3.92,-0.34,0,0,0
202109090097,7.0,25511.0,1,3,2,DAL,33,43.0,0.54,222.51,4.0,-0.34,0,0,1
202109090097,8.0,25511.0,1,3,2,DAL,33,43.0,0.8,228.05,4.09,-0.33,0,0,2
202109090097,9.0,25511.0,1,3,2,DAL,33,43.0,0.99,230.15,4.18,-0.32,0,0,3
202109090097,10.0,25511.0,1,3,2,DAL,33,43.0,1.19,236.21,4.29,-0.31,0,0,4


In [15]:
node_feat.query('(uniqueplayId=="202109090097")&(frameId==6.0)').shape

(23, 13)

In [16]:
edge_feat.index[:5]

MultiIndex([('20210909001078',  6.0),
            ('20210909001078',  7.0),
            ('20210909001078',  8.0),
            ('20210909001078',  9.0),
            ('20210909001078', 10.0)],
           names=['uniqueplayId', 'frameId'])

In [28]:
def make_graph(index):
    ## PLAY_ID MUST BE A STRING
    
    play_id = index[0]
    frame_id = index[1]

    # Node features
    ## filter the node features matrix by given play id and frame id
    x_temp = node_feat.query(f'(uniqueplayId=={play_id})&(frameId=={frame_id})')
    x_temp = np.array(x_temp)
    #print(x_temp.shape)

    # Adjacency
    a_temp = adj_mat[(play_id, frame_id)]
    a_temp = np.array(a_temp)
    #print(a_temp.shape)
    
    # Edges
    ## get the correct edge matrix based on play id and frame id
    e_temp = edge_feat[(play_id, frame_id)]
    e_temp = np.array(e_temp)
    #print(e_temp.shape)

    # Labels
    ## get the single label of coverage from y for that play id
    y_temp = y[(play_id, frame_id)]
    y_temp = np.array(y_temp)
    #print(y_temp.shape)

    return Graph(x=x_temp, a=a_temp, e=e_temp, y=y_temp)

In [18]:
make_graph('202109090097', 6.0)

Graph(n_nodes=23, n_node_features=13, n_edge_features=23, n_labels=1)

In [19]:
len(edge_feat)

188724

In [29]:
all_graphs = []
counter = 0

for i in indeces:
    counter += 1
    if counter % 10 == 0:
        print(counter)

    graph = make_graph(i)
    all_graphs.append(graph)

10
20
30


KeyboardInterrupt: 

In [None]:
if __name__ == "__main__":
    indeces = edge_feat.index
    procs = []
    all_graphs = []
    counter = 0
    
    for i in indeces:
        counter += 1
        if counter % 10 == 0:
            print(counter)

        proc = Process(target=make_graph, args=(i[0], i[1],))
        
        all_graphs.append(graph)

In [26]:
indeces = edge_feat.index
indeces

MultiIndex([('20210909001078',  6.0),
            ('20210909001078',  7.0),
            ('20210909001078',  8.0),
            ('20210909001078',  9.0),
            ('20210909001078', 10.0),
            ('20210909001078', 11.0),
            ('20210909001078', 12.0),
            ('20210909001078', 13.0),
            ('20210909001078', 14.0),
            ('20210909001078', 15.0),
            ...
            ( '2021102500933', 29.0),
            ( '2021102500933', 30.0),
            ( '2021102500933', 31.0),
            ( '2021102500933', 32.0),
            ( '2021102500933', 33.0),
            ( '2021102500933', 34.0),
            ( '2021102500933', 35.0),
            ( '2021102500933', 36.0),
            ( '2021102500933', 37.0),
            ( '2021102500933', 38.0)],
           names=['uniqueplayId', 'frameId'], length=188724)

In [30]:
%%time

from concurrent.futures import ProcessPoolExecutor

all_graphs = []
counter = 0
indeces = edge_feat.index

with ProcessPoolExecutor(max_workers=40) as executor:
    for r in executor.map(make_graph, indeces):
        all_graphs.append(r)

CPU times: user 2min 17s, sys: 29.8 s, total: 2min 47s
Wall time: 22min 36s


In [33]:
all_graphs.to_pickle('graph_data.pkl')

AttributeError: 'list' object has no attribute 'to_pickle'

In [37]:
len(all_graphs)

188724

In [40]:
import pickle

In [41]:
with open('blah.pickle', 'wb') as b:
    pickle.dump(all_graphs, b)

In [42]:
graph_data = pd.read_pickle('blah.pickle')

In [None]:
graph_data[]