In [1]:
import sys
import re
from bs4 import BeautifulSoup

sys.path.insert(0, '../../src')

from eeyore_nlp.pumps import WebPump, ContextPump
from eeyore_nlp.pipelines import ContextTokenizer, \
                                 TextPipeline, \
                                 TextPipe, \
                                 ContractionsTextPipe

from eeyore_nlp.utils import RelationshipBuilder
from eeyore_nlp.generators import MarkovChain
from eeyore_nlp.models import RelationshipContainer, RelationshipKey

In [2]:
class EspnContentScrapper(TextPipe):
    """Text pipe that reduces an HTML document to the text of its paragraphs.

    Page furniture (social links, article metadata, asides, ad containers,
    figures and the cookie overlay) is removed, hyperlinks are flattened to
    their visible text, and the text of every remaining ``<p>`` element is
    joined with newlines.
    """

    # Positional arguments for ``find_all``: a tag name, optionally followed
    # by the CSS class the tag must carry. Every match is removed.
    _REMOVED_SELECTORS = (
        ('ul', 'article-social'),
        ('div', 'article-meta'),
        ('aside',),
        ('div', 'teads-inread'),
        ('figure',),
        ('div', 'cookie-overlay'),
    )

    def __init__(self, order):
        """Forward ``order`` to ``TextPipe``.

        Args:
            order: Value handed to the base class unchanged; presumably the
                pipe's position inside a ``TextPipeline`` -- confirm against
                ``TextPipe``.
        """
        super().__init__(order)

    def execute(self, text: str) -> str:
        """Extract the paragraph text from an HTML document.

        Args:
            text: HTML markup to parse.

        Returns:
            The text of each ``<p>`` element left after clean-up, joined
            with ``'\\n'`` and stripped of leading/trailing whitespace.
        """
        soup = BeautifulSoup(text, 'html.parser')

        # Search immediately before removing, so a tag nested inside an
        # element that was already destroyed is never visited a second time.
        for selector in self._REMOVED_SELECTORS:
            for tag in soup.find_all(*selector):
                tag.decompose()

        # Flatten links to their visible text. ``replace_with`` is the
        # current name of the deprecated ``replaceWith`` alias.
        for anchor in soup.find_all('a'):
            anchor.replace_with(anchor.text)

        paragraphs = [paragraph.text for paragraph in soup.find_all('p')]

        return '\n'.join(paragraphs).strip()

class EspnTextTransformer(TextPipe):
    """Text pipe that normalises quotes and replaces numbers with placeholders.

    Straight double quotes become the `` / '' pair, the abbreviation "No."
    is spelled out, and numeric expressions are replaced by the placeholder
    words PRICE, PERCENTAGE, FLOAT and INTEGER.
    """

    def __init__(self, order):
        """Forward ``order`` to ``TextPipe``.

        Args:
            order: Value handed to the base class unchanged; presumably the
                pipe's position inside a ``TextPipeline`` -- confirm against
                ``TextPipe``.
        """
        super().__init__(order)

    def execute(self, text: str) -> str:
        """Apply the substitutions to ``text`` and return the result.

        The replacement templates are raw strings: ``\\g`` is not a valid
        Python string escape, so a plain literal triggers an invalid-escape
        warning even though it yields the same characters.

        Args:
            text: Plain text to normalise.

        Returns:
            The text with every substitution applied.
        """
        ## annoying terms
        # "No." surrounded by whitespace is spelled out as "Number".
        document = re.sub(r'(\s)No[.](\s)', r'\g<1> Number \g<2>', text)

        # A quote preceded by whitespace (or at the start of the text) opens
        # a quotation; one followed by whitespace (or at the end) closes it.
        document = re.sub(r'(^|\s)"', r'\g<1>``', document)
        document = re.sub(r'"(\s|$)', r"''\g<1>", document)

        ## numbers
        # Order matters: the more specific forms must be consumed before the
        # bare FLOAT / INTEGER patterns can claim their digits.
        document = re.sub(r'[$]\s*[\d]+[.]?[\d]*', ' PRICE ', document)
        document = re.sub(r'[\d]+[.]?[\d]*\s*[%]', ' PERCENTAGE ', document)
        document = re.sub(r'[\d]+[.][\d]+', ' FLOAT ', document)
        document = re.sub(r'[\d]+', ' INTEGER ', document)

        return document

In [3]:
# Source of the raw pages: three ESPN NFL stories.
web_pump = WebPump([
    'https://www.espn.com/nfl/story/_/id/31433376/',
    'https://www.espn.com/nfl/story/_/id/31431382/',
    'https://www.espn.com/nfl/story/_/id/31431361/',
])

# Pre-processing applied before tokenisation: strip the HTML, run the
# contractions pipe, then normalise quotes and numbers.
text_preprocessor = TextPipeline(pipes=[
    EspnContentScrapper(1),
    ContractionsTextPipe(2),
    EspnTextTransformer(3),
])

context_tokenizer = ContextTokenizer(text_preprocessor=text_preprocessor)

context_pump = ContextPump(web_pump, context_tokenizer)

In [4]:
# Materialise every context so the list can be indexed here and reused below.
contexts = list(context_pump.execute())

# Show the first three sentences, each followed by a blank line.
for position in range(3):
    print(contexts[position].sentence)
    print()

Former Carolina quarterback Teddy Bridgewater on Wednesday questioned how the Panthers practiced in two critical areas that played a part in the organization moving on from him after the first year of a three-year contract.

``I will just say this, for Joe Brady's growth they will have to practice different things in different ways,'' Bridgewater said of Carolina's offensive coordinator on the ``All Things Covered'' CBS Sports podcast with Patrick Peterson and Bryant McFadden.

``One of the things we did not do much of when I was there, we did not practice two-minute drills, we did not practice red zone.



In [5]:
container = RelationshipContainer()
builder = RelationshipBuilder()

# Add the neighbor relationships built from each context's tokens.
for context in contexts:
    tokens = context.get('tokens')
    relationships = builder.create_neighbor_relationships(tokens)
    container.add_many(relationships)

# Both boundary markers must be present as keys in the container.
for boundary_term in ('<start>', '<end>'):
    assert RelationshipKey(boundary_term) in container

In [6]:
chain = MarkovChain()

# Print six generated sequences, each followed by a blank line.
for i in range(6):
    generated_terms = [
        output.primary.term
        for output in chain.generate(container, kill=8)
    ]
    print(generated_terms)
    print()

['<start>', 'Gronkowski', ',', "''", 'Rhule', 'has', 'done', 'that']

['<start>', 'We', 'traded', 'to', 'have', 'been', 'a', 'leader']

['<start>', 'They', 'also', 'mark', 'the', 'quarterback', 'after', 'the']

['<start>', 'The', 'Panthers', 'hosting', 'his', 'fourth', 'rounds', 'in']

['<start>', 'Since', 'the', 'things', 'we', 'believe', 'in', 'INTEGER']

['<start>', 'We', 'did', 'not', 'matter', '.', '<end>']

