In [20]:
import re
from collections import defaultdict

In [21]:
KB_ID = 'concepts'

### Setup Text Extraction Method

In [22]:
import nltk
from extr.entities import create_entity_extractor, KnowledgeBaseEntityLinker, EntityAnnotator
from extr.relations import RelationExtractor
from extr.entities.context import ConText

from labels.entity_patterns import patterns
from labels.kb import kb as kb_patterns
from labels.relation_patterns import relation_patterns
from knowledge.kb import kb
from rules.context import rule_grouping

entity_extractor = create_entity_extractor(patterns, kb_patterns)
relation_extractor = RelationExtractor(relation_patterns)
conText = ConText(
    rule_grouping=rule_grouping,
    word_tokenizer=nltk.tokenize.word_tokenize
)
entity_linker = KnowledgeBaseEntityLinker(
    attribute_label=KB_ID,
    kb=kb
)
entity_annotator = EntityAnnotator()

def extract_data(instance: str):
    entities = conText.apply(
        instance,
        entity_linker.link(
            entity_extractor.get_entities(instance)
        ),
        filter_out_rule_labels=True
    )
    
    relations = relation_extractor.extract(
        entity_annotator.annotate(instance, entities),
        entities
    )

    return entities, relations

### Extract Entities / Relations / Identify Context / Link Entities

In [23]:
instances = []
with open('../data/2/2022_w1_g401437632.txt', 'r') as source_file:
    instances = source_file.read().split('\n')

In [24]:
observations = []
for instance in instances:
    entities, relations = extract_data(instance)

    observation = {
        'entities': entities,
        'stats': {
            'n_rels': len(relations),
            'n_entities': len(entities)
        }
    }

    for relation in relations:
        definition = relation.definition
        if not definition in observation:
            observation[definition] = []

        observation[definition].append(relation)

    observations.append(observation)

observations[0]

{'entities': [<Entity label="QUANTITY" text="25" span=(85, 87)>,
  <Entity label="QUANTITY" text="22" span=(78, 80)>,
  <Entity label="TEAM" text="NYJ" span=(74, 77)>,
  <Entity label="QUANTITY" text="-3" span=(57, 59)>,
  <Entity label="TEAM" text="NYJ" span=(53, 56)>,
  <Entity label="QUANTITY" text="35" span=(47, 49)>,
  <Entity label="TEAM" text="BLT" span=(43, 46)>,
  <Entity label="QUANTITY" text="68" span=(29, 31)>,
  <Entity label="PERIOD" text="1st" span=(9, 12)>,
  <Entity label="TIME" text="15:00" span=(1, 6)>],
 'stats': {'n_rels': 4, 'n_entities': 10},
 'r("PERIOD", "TIME")': [<Relation e1="1st" r="is_at" e2="15:00">],
 'r("TEAM", "QUANTITY")': [<Relation e1="BLT" r="is_spot_of_ball" e2="35">,
  <Relation e1="NYJ" r="is_spot_of_ball" e2="-3">,
  <Relation e1="NYJ" r="is_spot_of_ball" e2="22">]}

### Group Events by Period and Time

In [25]:
from collections import defaultdict

group_by_period = defaultdict(list)

is_at_key = 'r("PERIOD", "TIME")'

for observation in observations:
    if not is_at_key in observation:
        continue

    is_at = observation[is_at_key][0]
    e1 = is_at.e1
    e2 = is_at.e2
    key = f'{e1.text}-{e2.text}'

    group_by_period[key].append(observation)

### Build Time-Series

In [35]:
from typing import List
from extr import Entity, Relation

def get_data_from_entities(entities: List[Entity]):
    data = {}

    scores = [
        (entity)
        for entity in entities
        if entity.label=='SCORE'
    ]

    if len(scores) > 0:
        data['score'] = scores[0].text
    
    return data

def get_period_and_time(is_at: Relation):
    def normalize_time(time: str):
        parts = time.split(':')
        mins = (15 - int(parts[0]))
        secs =  int(parts[1])

        return float(f'{mins}.{secs}')

    e1 = is_at.e1
    e2 = is_at.e2
    return e1.text, normalize_time(e2.text)

def get_side_and_spot(is_spot_of_ball: Relation, offset_team: str):
    e1 = is_spot_of_ball.e1
    e2 = is_spot_of_ball.e2

    spot_of_ball = int(e2.text)
    if e1.text == offset_team:
        spot_of_ball = 50 + (50 - spot_of_ball)
    
    return e1.text, spot_of_ball

In [36]:
timeline = []
previous = []

i = 1

is_at_key = 'r("PERIOD", "TIME")'

for kbs in group_by_period.values():
    for seq_p, kb in enumerate(kbs):
        entities = kb['entities']

        extracted_entity_data = get_data_from_entities(kb['entities'])
        period, time = get_period_and_time(kb[is_at_key][0])

        yard_lines = []
        if not 'r("TEAM", "QUANTITY")' in kb:
            yard_lines = previous
        else:
            yard_lines = kb['r("TEAM", "QUANTITY")']
            previous = kb['r("TEAM", "QUANTITY")']

        for seq_l, loc in enumerate(yard_lines):
            side, spot_of_ball = get_side_and_spot(loc, 'BLT')

            data = {
                'period': period,
                'time': time,
                'side': side,
                'spot': spot_of_ball,
                'label': i
            }

            data.update(extracted_entity_data)

            timeline.append(data)

            i += 1

timeline

[{'period': '1st', 'time': 0.0, 'side': 'BLT', 'spot': 65, 'label': 1},
 {'period': '1st', 'time': 0.0, 'side': 'NYJ', 'spot': -3, 'label': 2},
 {'period': '1st', 'time': 0.0, 'side': 'NYJ', 'spot': 22, 'label': 3},
 {'period': '1st', 'time': 1.55, 'side': 'NYJ', 'spot': 41, 'label': 4},
 {'period': '1st', 'time': 1.55, 'side': 'NYJ', 'spot': 41, 'label': 5},
 {'period': '1st', 'time': 1.18, 'side': 'NYJ', 'spot': 46, 'label': 6},
 {'period': '1st', 'time': 1.1, 'side': 'NYJ', 'spot': 46, 'label': 7},
 {'period': '1st', 'time': 2.52, 'side': 'BLT', 'spot': 81, 'label': 8},
 {'period': '1st', 'time': 2.52, 'side': 'BLT', 'spot': 72, 'label': 9},
 {'period': '1st', 'time': 2.42, 'side': 'BLT', 'spot': 68, 'label': 10},
 {'period': '1st', 'time': 2.19, 'side': 'BLT', 'spot': 64, 'label': 11},
 {'period': '1st', 'time': 3.41, 'side': 'BLT', 'spot': 60, 'label': 12},
 {'period': '1st', 'time': 3.1, 'side': 'BLT', 'spot': 60, 'label': 13},
 {'period': '1st', 'time': 4.19, 'side': 'BLT', 'spo

In [37]:
import matplotlib.pyplot as plt

X = list(map(lambda a: a['norm-time'], timeline))
y = list(map(lambda a: a['yard-line'], timeline))

plt.figure(figsize=(10, 30))
plt.plot(y, X, '.-')

for i, obj in enumerate(timeline):
    plt.annotate(obj['label'], (y[i], X[i]))

plt.axvline(0, color='r', linewidth=.5)
plt.axvline(50, color='k', linewidth=.5)
plt.axvline(100, color='r', linewidth=.5)

for row in timeline:
    if len(row['score']) > 0:
        e = row['score'][0]
        ctypes = list(e.get_attributes_by_label('ctypes'))
        
        if len(ctypes) > 0:
            plt.plot(row['yard-line'], row['norm-time'], 'o', c='r')
        else:
            if e.text == 'TOUCHDOWN':
                plt.plot(row['yard-line'], row['norm-time'], 'o', c='pink')
            else:
                plt.plot(row['yard-line'], row['norm-time'], 'o', c='purple')

KeyError: 'norm-time'