In [30]:
from typing import List
from extr import RegEx, RegExLabel, RegExRelationLabelBuilder
from extr.entities import create_entity_extractor, EntityAnnotator
from extr.relations import RelationExtractor

In [31]:
kb = {
    'PERIOD': [
        '1st',
        '2nd',
        '3rd',
        '4th',
        'OT',
    ],
    'TEAM': [
        'ARZ',
        'Arizona',
        'ATL',
        'Atlanta',
        'BLT',
        'Baltimore',
        'BUF',
        'Buffalo',
        'CAR',
        'Carolina',
        'CHI',
        'Chicago',
        'CIN',
        'Cinncinatti',
        'CLV',
        'Cleveland',
        'DAL',
        'Dallas',
        'DEN',
        'Denver',
        'DET',
        'Detroit',
        'GB',
        'Green Bay',
        'HST',
        'Houston',
        'IND',
        'Indianapolis',
        'JAX',
        'Jacksonville',
        'KC',
        'Kansas City',
        'LA',
        'Los Angeles Rams',
        'LAC',
        'Los Angeles Chargers',
        'LV',
        'Las Vegas',
        'MIA',
        'Miami',
        'MIN',
        'Minnesota',
        'NE',
        'New England',
        'NO',
        'New Orleans',
        'NYG',
        'New York Giants',
        'NYJ',
        'New York Jets',
        'PHI',
        'Philadelphia',
        'PIT',
        'Pittsburgh',
        'SEA',
        'Seattle',
        'SF',
        'San Francisco',
        'TB',
        'Tampa Bay',
        'TEN',
        'Tennessee',
        'WAS',
        'Washington',
    ]
}

entity_patterns: List[RegExLabel] = [
    RegExLabel(
        label='TIME',
        regexes=[
            RegEx(expressions=[
                r'\b[0-9]{1,2}:[0-9]{2}\b',
            ]),
        ],
    ),
    RegExLabel(
        label='QUANTITY',
        regexes=[
            RegEx(expressions=[
                r'(?<=[\s\(])-?\d{1,3}(?=\b)',
            ])
        ]
    ),
]

entity_extractor = create_entity_extractor(entity_patterns, kb)
relation_extractor = RelationExtractor(relation_labels=[
    RegExRelationLabelBuilder('is_at') \
        .add_e2_to_e1(
            e2='TIME',
            relation_expressions=[
                r'(\s+-\s+)',
            ],
            e1='PERIOD'
        ) \
        .build(),
    RegExRelationLabelBuilder('is_at') \
        .add_e2_to_e1(
            e2='TIME',
            relation_expressions=[
                r'(\s+-\s+)',
            ],
            e1='PERIOD'
        ) \
        .build(),
    RegExRelationLabelBuilder('is_spot_of_ball') \
        .add_e1_to_e2(
            e1='TEAM',
            relation_expressions=[
                r'\s+',
            ],
            e2='QUANTITY',
        ) \
        .build()
])

In [32]:
instances = []
with open('./data/2/2022_w1_g401437630.txt', 'r') as source_file:
    instances = source_file.read().split('\n')

In [36]:
output = []
for instance in instances:
    entities = entity_extractor.get_entities(instance)
    relations = relation_extractor.extract(
        EntityAnnotator().annotate(instance, entities),
        entities
    )

    if len(relations) > 0:
        kb = {
            'entities': entities
        }

        for relation in relations:
            if not relation.definition in kb:
                kb[relation.definition] = []

            kb[relation.definition].append(relation)

        output.append(kb)

output


[{'entities': [<Entity label="QUANTITY" text="35" span=(48, 50)>,
   <Entity label="TEAM" text="MIA" span=(44, 47)>,
   <Entity label="QUANTITY" text="65" span=(30, 32)>,
   <Entity label="PERIOD" text="1st" span=(9, 12)>,
   <Entity label="TIME" text="15:00" span=(1, 6)>],
  'r("PERIOD", "TIME")': [<Relation e1="1st" r="is_at" e2="15:00">,
   <Relation e1="1st" r="is_at" e2="15:00">],
  'r("TEAM", "QUANTITY")': [<Relation e1="MIA" r="is_spot_of_ball" e2="35">]},
 {'entities': [<Entity label="QUANTITY" text="9" span=(74, 75)>,
   <Entity label="QUANTITY" text="34" span=(67, 69)>,
   <Entity label="TEAM" text="NE" span=(64, 66)>,
   <Entity label="PERIOD" text="1st" span=(9, 12)>,
   <Entity label="TIME" text="15:00" span=(1, 6)>],
  'r("PERIOD", "TIME")': [<Relation e1="1st" r="is_at" e2="15:00">,
   <Relation e1="1st" r="is_at" e2="15:00">],
  'r("TEAM", "QUANTITY")': [<Relation e1="NE" r="is_spot_of_ball" e2="34">]},
 {'entities': [<Entity label="QUANTITY" text="12" span=(47, 49)>,
 

In [45]:
timeline = [

]

for kb in output:
    if not 'r("PERIOD", "TIME")' in kb:
        continue
    if not 'r("TEAM", "QUANTITY")' in kb:
        continue

    at = kb['r("PERIOD", "TIME")'][0]
    period, time = at.e1.text, at.e2.text

    loc = kb['r("TEAM", "QUANTITY")'][0]
    team, ydline = loc.e1.text, loc.e2.text

    blah = int(ydline)

    if team == 'MIA': 
        blah = 50 + (50 - blah)

    parts = time.split(':')
    p1 = 15 - int(parts[0])
    p2 = int(parts[1])

    timeline.append({
        'period': period,
        'time': time,
        'norm_time': float(f'{p1}.{p2}'),
        'side': team,
        'ydline': blah
    })

timeline

[{'period': '1st',
  'time': '15:00',
  'norm_time': 0.0,
  'side': 'MIA',
  'ydline': 65},
 {'period': '1st',
  'time': '15:00',
  'norm_time': 0.0,
  'side': 'NE',
  'ydline': 34},
 {'period': '1st',
  'time': '15:00',
  'norm_time': 0.0,
  'side': 'NE',
  'ydline': 46},
 {'period': '1st',
  'time': '15:00',
  'norm_time': 0.0,
  'side': 'NE',
  'ydline': 49},
 {'period': '1st',
  'time': '13:01',
  'norm_time': 2.1,
  'side': 'MIA',
  'ydline': 57},
 {'period': '1st',
  'time': '12:20',
  'norm_time': 3.2,
  'side': 'MIA',
  'ydline': 65},
 {'period': '1st',
  'time': '11:33',
  'norm_time': 4.33,
  'side': 'MIA',
  'ydline': 67},
 {'period': '1st',
  'time': '10:48',
  'norm_time': 5.48,
  'side': 'MIA',
  'ydline': 78},
 {'period': '1st',
  'time': '10:05',
  'norm_time': 5.5,
  'side': 'MIA',
  'ydline': 103},
 {'period': '1st',
  'time': '9:47',
  'norm_time': 6.47,
  'side': 'MIA',
  'ydline': 64},
 {'period': '1st',
  'time': '9:05',
  'norm_time': 6.5,
  'side': 'MIA',
  'ydl

In [47]:
import matplotlib.pyplot as plt

X = list(map(lambda a: a['norm_time'], timeline))
y = list(map(lambda a: a['ydline'], timeline))

plt.plot(X, y)

[65,
 34,
 46,
 49,
 57,
 65,
 67,
 78,
 103,
 64,
 59,
 57,
 22,
 17,
 18,
 26,
 65,
 32,
 44,
 41,
 48,
 48,
 60,
 53,
 51,
 60,
 94,
 89,
 85,
 83,
 80,
 69,
 55,
 49,
 56,
 49,
 15,
 6,
 65,
 29,
 56,
 58,
 54,
 56,
 92,
 91,
 77,
 74,
 66,
 66,
 45,
 54,
 42,
 65,
 23,
 35,
 67,
 65,
 66,
 8,
 14,
 23,
 35,
 35,
 40,
 52,
 58,
 58,
 70,
 70,
 72,
 72,
 77,
 92,
 94,
 35,
 63,
 55,
 52,
 36,
 32,
 31,
 65,
 31,
 38,
 42,
 42,
 42,
 90,
 82,
 81,
 31,
 43,
 36,
 37,
 29,
 41,
 41,
 5,
 16,
 57,
 62,
 51,
 35,
 35,
 24,
 16,
 13,
 11,
 11]