In [63]:
import spacy
import json

In [64]:
spacy.__version__

'2.1.4'

In [65]:
nlp = spacy.load("en_core_web_md")

In [100]:
customize_stop_words = [
    "Race", "Races", "race",
    "info", "about", 
    "finish", "line", "swim", "bike", "run", 'athlete', 'ironman'
]
for w in customize_stop_words:
    nlp.vocab[w].is_stop = True

In [119]:
import string

# Create our list of punctuation marks
punctuations = string.punctuation


def extended_is_stop(token):
    stop_words = nlp.Defaults.stop_words
    return token.is_stop or token.lower_ in stop_words or token.lemma_ in stop_words

def filterTokenByStop(doc):
    return [token for token in doc if not extended_is_stop(token)]

# Creating our tokenizer function
def spacy_tokenizer(tokens):
    # creating the filter list for tokens that are identified as person
#     toRemove = [ent for ent in tokens.ents if ent.label_.lower() in ["person"]]

    # Removing stop words
    mytokens = [ 
        word for word in tokens
            if not extended_is_stop(word) 
            and word.lower_ not in punctuations 
    ]
    
    # Removing stop words
    mytokens = [ word for word in tokens if not extended_is_stop(word) and word.lower_ not in punctuations ]

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # return preprocessed list of tokens
    return mytokens

In [68]:
# # add stop words
# nlp.Defaults.stop_words |= {"race","info"}


In [70]:
descriptions = {}

with open("./../data/races/races-description.jl") as f:
    for line in f.readlines():
        data = json.loads(line.strip())
        if (data.get('id', "TBD") != "TBD"):
            descriptions[data['id']] = data
        else:
            descriptions[f"TBD_{data['name']}"] = data

In [71]:
len(descriptions)

223

In [72]:
for key in list(descriptions.keys())[1:2]:
    desc = descriptions[key]['description']
    doc = nlp(desc)

In [174]:
# print(f"--- {key} ---")
# for token in doc:
#     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
#             token.shape_, token.is_alpha, token.is_stop)

# for ent in doc.ents:
#     print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [102]:
race1,race2,race3 = list(descriptions.keys())[1:4]
desc1 = descriptions[race1]['description']
desc2 = descriptions[race2]['description']
desc3 = descriptions['santarosa']['description']
doc1 = nlp(desc1)
doc2 = nlp(desc2)
doc3 = nlp(desc3)

In [172]:
descriptions['mallorca70.3']['description']

'Races Mallorca The balearic island of Mallorca, host of the IRONMAN 70.3 Mallorca, is well known as a training paradise for triathletes from all over the world. The course takes athletes on a tour through many villages that offer incredible views and diversity. The event location, Alcúdia, is one of the most celebrated towns of Mallorca, with more than 30 km of the coastline made up of gorgeous, fine sand beaches, steep cliffs and secluded coves of great beauty. The area is equipped with the most modern infrastructures and a tranquil surrounding, making it ideal for relaxation. The unique destination has activities for all tastes that include water sports, trails and mountains for hiking, biking and golf. The country offers many nature areas, which attract tourists from all over the world. The small peninsula offers uncommonly rich landscapes, marked by the intense blue of the Mediterranean Sea. The temperate, pleasant climate makes it possible to partake in countless outdoor sports a

In [173]:
tags = ['family', 'award', 'kid', 'activity', 'nature', 'fast', 'slot', 'beginner', 'friendly', 'community', 'tourism', 'relax', 'relaxation', 'visit', 'discontinue']
for race in descriptions:
    has_tags = []
    desc = descriptions[race]['description']
    doc = nlp(desc)
    text = [token.lemma_.lower() for token in doc]
    for tag in tags:
        if tag in text:
            has_tags.append(tag)
    if len(has_tags)>0:
        print(race, has_tags)

mallorca70.3 ['activity', 'nature', 'slot', 'relaxation']
france70.3 ['award', 'slot']
australia ['slot', 'community']
portmacquarie70.3 ['family', 'nature', 'fast', 'slot', 'community']
vietnam70.3 ['award', 'fast', 'visit']
busselton70.3 ['award', 'activity', 'nature', 'fast', 'slot', 'visit']
marbella70.3 ['family', 'activity', 'slot']
texas ['slot']
liuzhou70.3 ['slot', 'friendly']
virginia70.3 ['slot']
CostaNavarino70.3 ['slot']
monterrey70.3 ['family', 'fast', 'slot', 'visit']
taiwan70.3 ['award', 'slot', 'visit']
southafrica ['award', 'fast', 'slot', 'visit']
newzealand ['award', 'slot', 'community']
davao70.3 ['nature', 'fast', 'slot', 'friendly', 'community']
peru70.3 ['fast', 'slot', 'tourism']
Florianopolis70.3 ['activity', 'fast', 'slot']
sanjuan70.3 ['slot', 'visit']
oman70.3 ['slot', 'tourism']
campeche70.3 ['fast', 'slot', 'friendly']
newzealand70.3 ['family', 'activity', 'nature', 'fast', 'slot', 'community', 'tourism']
pucon70.3 ['slot']
dubai70.3 ['award', 'fast', 'sl

In [87]:
doc1.similarity(doc2)

0.9711980364821625

In [76]:
doc1.similarity(doc3)

0.9638568698800685

In [77]:
doc3[0].lemma_

'Races'

In [36]:
extended_is_stop(doc3[0])

False

In [55]:
doc3[3].is_stop

False

In [43]:
test = nlp("races and me")

In [46]:
test[0].lemma_

'race'

In [61]:
tokens = [token.text for token in doc3 if not extended_is_stop(token)]
tokens

['Races',
 'Santa',
 'Rosa',
 'new',
 'Santa',
 'Rosa',
 'offers',
 'high',
 '-',
 'quality',
 'event',
 'come',
 'know',
 'love',
 'heart',
 'Sonoma',
 'County',
 ',',
 'winding',
 'roads',
 'dotted',
 'vineyards',
 'world',
 '-',
 'class',
 'accommodation',
 '.',
 'past',
 'results',
 'Vineman',
 ',',
 'visit',
 'event',
 'page',
 '.',
 'Staged',
 'beautiful',
 'Sonoma',
 'County',
 'traversing',
 'quaint',
 'communities',
 'like',
 'Geyersville',
 'Healdsburg',
 ',',
 'Santa',
 'Rosa',
 'draws',
 'triathletes',
 ',',
 'wine',
 'food',
 'loving',
 'family',
 'support',
 'crews',
 'year',
 '.',
 'event',
 'ticket',
 'heart',
 'world',
 '-',
 'famous',
 'wine',
 'regions',
 'Napa',
 'Sonoma',
 'valleys',
 ',',
 ',',
 'want',
 'leave',
 '.',
 'B&Bs',
 'boutique',
 'hotel',
 'options',
 'suit',
 'budget',
 ',',
 'thousands',
 'wineries',
 ',',
 'craft',
 'breweries',
 ',',
 'farm',
 '-',
 '-',
 'table',
 'restaurants',
 'pre',
 '-',
 'fueling',
 'post',
 '-',
 'celebrations',
 '.',
 'off

In [28]:
descriptions['santarosa']['description']

'Races Santa Rosa Reasons to Race IRONMAN Santa Rosa The 2020 IRONMAN Santa Rosa will offer 40 qualifying slots to the 2020 IRONMAN World Championship in Kailua-Kona, Hawaii.'

In [81]:
import nltk

In [83]:
from nltk.corpus import stopwords