In [5]:
import nltk

In [6]:
# Sample play-by-play descriptions used for quick manual checks.
# text1: a kickoff and return, with no leading "(clock - period)" prefix.
text1 = 'R.Ahmed kicks 62 yards from GB 35 to DAL 3. K.Turpin to DAL 39 for 36 yards (J.Abram, R.Ahmed).'
# text2: a punt, prefixed with "(clock - period)" and containing a "Center-<name>" token.
text2 = '(8:41 - 1st) B.Anger punts 37 yards to GB 15, Center-M.Overton. Am.Rodgers to GB 17 for 2 yards (L.Gifford).'
# text3: a scoring summary with full player names and an apostrophe in a surname.
text3 = '(4:54 - 2nd) Christian Watson Pass From Aaron Rodgers for 58 Yds M.Crosby extra point is GOOD, Center-J.Coco, Holder-P.O\'Donnell.'

In [23]:
import nltk
import re

from typing import List, Dict, Optional

from extr.entities import create_entity_extractor
from extr import RegExLabel, RegEx
from extr_ds.rules import Majority, StaticUnit
from extr_ds.rules.units import create_static_inference_result


class RegExStaticUnit(StaticUnit):
    """A ``StaticUnit`` driven by regular expressions and/or a keyword list.

    The unit is wired with ``nltk.tokenize.word_tokenize`` as its tokenizer
    and with an entity extractor built by ``create_entity_extractor`` from
    the supplied regex labels and optional knowledge base.

    Args:
        regex_labels: ``RegExLabel`` definitions handed to the extractor.
        kb: Optional mapping of label name to literal terms, handed to the
            extractor unchanged. Defaults to ``None``.
    """

    def __init__(self, regex_labels: List[RegExLabel], kb: Optional[Dict[str, List[str]]] = None) -> None:
        extractor = create_entity_extractor(regex_labels=regex_labels, kb=kb)
        super().__init__(nltk.tokenize.word_tokenize, extractor)

class Player(RegExStaticUnit):
    """Labeling unit that tags player mentions with the ``PLAYER`` label."""

    def __init__(self):
        # Abbreviated form: one capital (plus an optional lowercase letter),
        # a dot, then a capitalised surname -- e.g. "R.Ahmed", "Am.Rodgers".
        abbreviated = r'\b[A-Z][a-z]?\.[A-Z][a-z]+\b'
        # Same form with an apostrophe in the surname -- e.g. "P.O'Donnell".
        apostrophe_surname = r'\b[A-Z][a-z]?\.[A-Z]\'[A-Z][a-z]+\b'
        # Whatever sits between "Pass From " and the next " for".
        after_pass_from = r'(?<=Pass From ).+?(?= for)'
        # Whatever sits between ") " and the next " Pass From".
        before_pass_from = r'(?<=\) ).+?(?= Pass From)'

        player_patterns = RegEx(
            expressions=[
                abbreviated,
                apostrophe_surname,
                after_pass_from,
                before_pass_from,
            ]
        )

        super().__init__(
            regex_labels=[RegExLabel(label='PLAYER', regexes=[player_patterns])]
        )

class Team(RegExStaticUnit):
    """Labeling unit that tags team mentions with the ``TEAM`` label.

    Uses no regular expressions; matching is driven entirely by the
    knowledge-base list below, which pairs each team abbreviation with the
    corresponding city or team name.
    """

    def __init__(self):
        super().__init__(
            regex_labels=[],
            kb={
                'TEAM': [
                    'ARZ', 'Arizona',
                    'ATL', 'Atlanta',
                    'BLT', 'Baltimore',
                    'BUF', 'Buffalo',
                    'CAR', 'Carolina',
                    'CHI', 'Chicago',
                    # Spelling corrected from 'Cinncinatti', which could never
                    # match the real city name in source text.
                    'CIN', 'Cincinnati',
                    'CLV', 'Cleveland',
                    'DAL', 'Dallas',
                    'DEN', 'Denver',
                    'DET', 'Detroit',
                    'GB', 'Green Bay',
                    'HST', 'Houston',
                    'IND', 'Indianapolis',
                    'JAX', 'Jacksonville',
                    'KC', 'Kansas City',
                    'LA', 'Los Angeles Rams',
                    'LAC', 'Los Angeles Chargers',
                    'LV', 'Las Vegas',
                    'MIA', 'Miami',
                    'MIN', 'Minnesota',
                    'NE', 'New England',
                    'NO', 'New Orleans',
                    'NYG', 'New York Giants',
                    'NYJ', 'New York Jets',
                    'PHI', 'Philadelphia',
                    'PIT', 'Pittsburgh',
                    'SEA', 'Seattle',
                    'SF', 'San Francisco',
                    'TB', 'Tampa Bay',
                    'TEN', 'Tennessee',
                    'WAS', 'Washington',
                ]
            }
        )

class Time(RegExStaticUnit):
    """Labeling unit that tags clock readings such as "8:41" with ``TIME``."""

    def __init__(self):
        # One or two digits, a colon, then exactly two digits.
        clock_pattern = RegEx(expressions=[r'\b[0-9]{1,2}:[0-9]{2}\b'])

        super().__init__(
            regex_labels=[RegExLabel(label='TIME', regexes=[clock_pattern])]
        )

class Period(RegExStaticUnit):
    """Labeling unit that tags game-period markers with ``PERIOD``.

    Matching is knowledge-base only: the literal terms listed below.
    """

    def __init__(self):
        period_terms = ['1st', '2nd', '3rd', '4th', 'OT']

        super().__init__(regex_labels=[], kb={'PERIOD': period_terms})

class Quantity(RegExStaticUnit):
    """Labeling unit that tags short integers with ``QUANTITY``."""

    def __init__(self):
        # An optional minus sign and one to three digits ending at a word
        # boundary. The lookbehind requires whitespace or "(" immediately
        # before the match, so a number at the very start of the text is
        # not picked up.
        number_pattern = RegEx(expressions=[r'(?<=[\s\(])-?\d{1,3}(?=\b)'])

        super().__init__(
            regex_labels=[RegExLabel(label='QUANTITY', regexes=[number_pattern])]
        )

class Units(RegExStaticUnit):
    """Labeling unit that tags "yds" / "yards" (any letter case) with ``UNITS``."""

    def __init__(self):
        # "y", an optional "ar", then "ds" -- matched case-insensitively.
        yards_pattern = RegEx(
            expressions=[r'\b(y(?:ar)?ds)\b'],
            flags=re.IGNORECASE
        )

        super().__init__(
            regex_labels=[RegExLabel(label='UNITS', regexes=[yards_pattern])]
        )

# Document used by the single-example check further down.
text = text1

# One instance of every labeling unit. Order matters to annotate(): it starts
# from units[0] and merges the remaining units in sequence.
units = [
    unit_cls()
    for unit_cls in (Player, Team, Time, Period, Quantity, Units)
]

def clean(document):
    """Normalise spacing in a play description before tokenisation.

    The substitutions are applied in this order:

    1. Put spaces around a hyphen that joins a word character to an
       abbreviated name ("Center-M.Overton" -> "Center - M.Overton").
    2. Put spaces around a hyphen that joins a number to a capital letter
       ("12-Abc" -> "12 - Abc").
    3. Insert a space between a number and a following initial
       ("35J.Smith" -> "35 J.Smith").
    4. Make sure "injured during the play." is followed by a space.
    5. Collapse runs of spaces into a single space.

    Args:
        document: Raw text of one play.

    Returns:
        The cleaned text as a new string.
    """
    # Strings are immutable, so no defensive copy of `document` is needed.
    text = re.sub(
        r'(\w)(-)([A-Z][a-z]*\.[A-Z])',
        r'\1 \2 \3',
        document
    )

    # The replacement must keep group 1 (the digits); using only r'\2 \3'
    # silently deleted the number in front of the hyphen.
    text = re.sub(
        r'\b(\d+)(-)([A-Z])',
        r'\1 \2 \3',
        text
    )

    text = re.sub(
        r'\b(\d+)([A-Z]\.)',
        r'\1 \2',
        text
    )

    # May leave a double space when one was already present; the final
    # substitution below collapses it.
    text = re.sub(
        r'(injured during the play\.)',
        r'\1 ',
        text
    )

    return re.sub(r' +', ' ', text)

def annotate(document):
    """Label one document with every unit in the module-level ``units`` list.

    The document is first passed through ``clean``. The first unit's result
    is the starting point; each remaining unit's result is then merged into
    the running result pairwise with ``Majority().merge``, and the running
    result's weight is carried forward at every step.

    Args:
        document: Raw text of one play.

    Returns:
        A dict with ``'tokens'`` (``nltk.tokenize.word_tokenize`` of the
        cleaned text) and ``'labels'`` (the labels of the final merged
        result).
    """
    cleaned = clean(document)

    merged = units[0](cleaned)
    for labeler in units[1:]:
        voter = Majority()
        merged_labels = voter.merge([merged, labeler(cleaned)])
        merged = create_static_inference_result(
            labels=merged_labels,
            weight=merged.weight
        )

    tokens = nltk.tokenize.word_tokenize(cleaned)
    return {'tokens': tokens, 'labels': merged.labels}

# Annotate the single sample document held in `text`.
annotations = [annotate(text)]

# Last expression of the cell: display the labels produced for that document.
annotations[0]['labels']

['B-PLAYER',
 'O',
 'B-QUANTITY',
 'B-UNITS',
 'O',
 'B-TEAM',
 'B-QUANTITY',
 'O',
 'B-TEAM',
 'B-QUANTITY',
 'O',
 'B-PLAYER',
 'O',
 'B-TEAM',
 'B-QUANTITY',
 'O',
 'B-QUANTITY',
 'B-UNITS',
 'O',
 'B-PLAYER',
 'O',
 'B-PLAYER',
 'O',
 'O']

In [19]:
import json
import random

# Maximum number of rows to annotate and export.
k = 1000

# One play description per line. Blank rows are dropped: splitting on '\n'
# yields an empty trailing entry whenever the file ends with a newline, and
# that entry would otherwise be shuffled into the sample and exported as an
# empty annotation.
with open('./1/source.txt', 'r') as source_input:
    rows = [
        row
        for row in source_input.read().split('\n')
        if row.strip()
    ]

# Fixed seed so the same sample is selected on every run.
random.seed(42)
random.shuffle(rows)

annotations = [annotate(row) for row in rows[:k]]

with open('./5/ents-iob.json', 'w') as output_iob:
    output_iob.write(json.dumps(annotations))