In [1]:
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag_sents
from nltk.tokenize import word_tokenize

In [7]:
# Tags a token must carry to be kept, each with a human-readable description.
# Two vocabularies are mixed in one lookup table: Penn Treebank tags and the
# single-letter WordNet part-of-speech codes.
_penn_treebank_tags = {
    # Function words
    "CC": "Coordinating conjunction",
    "DT": "Determiner",
    "FW": "Foreign word",
    "IN": "Preposition or subordinating conjunction",
    # Adjectives
    "JJ": "Adjective",
    "JJR": "Comparative adjective",
    "JJS": "Superlative adjective",
    "MD": "Modal",
    # Nouns
    "NN": "Noun",
    "NNS": "Plural noun",
    "PDT": "Predeterminer",
    # Pronouns
    "PRP": "Personal pronoun",
    "PRP$": "Possessive pronoun",
    # Adverbs
    "RB": "Adverb",
    "RBR": "Comparative adverb",
    "RBS": "Superlative adverb",
    "RP": "Particle",
    "UH": "Interjection",
    # Verbs
    "VB": "Verb (base form)",
    "VBD": "Verb (past tense)",
    "VBG": "Verb (gerund or present participle)",
    "VBN": "Verb (past participle)",
    "VBP": "Verb (non-3rd person singular present)",
    "VBZ": "Verb (3rd person singular present)",
}

_wordnet_pos_tags = {
    "a": "Adjective",
    "n": "Noun",
    "r": "Adverb",
    "v": "verb",
}

# Penn Treebank entries first, WordNet codes last (same order as before).
relevant_tags = dict(_penn_treebank_tags)
relevant_tags.update(_wordnet_pos_tags)


In [8]:
# Penn Treebank POS tags -> WordNet POS codes accepted by WordNetLemmatizer
# ("a" adjective, "n" noun, "r" adverb, "v" verb).
# None marks a tag with no WordNet counterpart; the lemmatising cell handles
# those tokens without passing a WordNet POS.
pos_tags_map = {
    "CC": None,
    "DT": None,
    "FW": None,
    # Prepositions / subordinating conjunctions are not adjectives, and WordNet
    # has no part of speech for them.
    "IN": None,
    # Adjectives
    "JJ": "a",
    "JJR": "a",
    "JJS": "a",
    "MD": None,
    # Nouns
    "NN": "n",
    "NNS": "n",
    "PDT": None,
    "PRP": None,
    "PRP$": None,
    # Adverbs
    "RB": "r",
    "RBR": "r",
    "RBS": "r",
    "RP": None,
    "UH": None,
    # Verbs
    "VB": "v",
    "VBD": "v",
    "VBG": "v",
    "VBN": "v",
    "VBP": "v",
    "VBZ": "v",
}

In [9]:
def is_relevant_token(token_tag_tuple) -> bool:
    """Tell whether a tagged token should be kept.

    Args:
        token_tag_tuple: a two-item (token, tag) pair.

    Returns:
        True when the pair's tag is one of the keys of ``relevant_tags``.
    """
    _token, pos_tag = token_tag_tuple
    return pos_tag in relevant_tags

In [10]:
# Load the sample text as a list of lines (line endings are kept).
# NOTE(review): no encoding is passed, so the platform default is used - confirm
# that matches how the sample file was saved.
with open('assets/dev-samples/harry-potter.content.txt', 'r') as sample_file:
    contents = list(sample_file)

In [11]:
# Tokenise every line of the text; lines that yield no tokens (e.g. blank
# lines) are dropped.
tokenised_sents = [
    line_tokens
    for line_tokens in map(word_tokenize, contents)
    if line_tokens
]

In [12]:
# Tag: run the POS tagger over all tokenised lines in one batch call.
# The cells below consume the result as an iterable of sequences of
# (token, tag) pairs, with tags looked up in the Penn Treebank keyed tables.
tagged_sentences = pos_tag_sents(tokenised_sents)

In [13]:
# Keep only the (token, tag) pairs accepted by is_relevant_token, flattening
# the per-line lists into one list.
relevant_tokens = [
    tagged_token
    for tagged_sentence in tagged_sentences
    for tagged_token in tagged_sentence
    if is_relevant_token(tagged_token)
]

In [14]:
# Normalise

# Lower-case the token text of every pair; the tag is carried over unchanged.
relevant_tokens = [(pair[0].lower(), pair[1]) for pair in relevant_tokens]


In [15]:
# Lemmatise
# Builds the set of distinct (lemma, tag) pairs from the lower-cased tokens.
lemmatiser = WordNetLemmatizer()
lemmatised_tokens = set()
for token, tag in relevant_tokens:
    # .get() instead of [...]: relevant_tags also accepts "a"/"n"/"r"/"v",
    # which have no entry in pos_tags_map and would raise KeyError.
    lemmatag = pos_tags_map.get(tag)
    if lemmatag:
        # The tag has a WordNet part of speech: lemmatise with it and record
        # the WordNet code rather than the Penn Treebank tag.
        lemma = lemmatiser.lemmatize(token, pos=lemmatag)
        lemmatised_tokens.add((lemma, lemmatag))
    else:
        # No WordNet part of speech (determiners, pronouns, modals, ...).
        # lemmatize() would fall back to its default noun POS and can mangle
        # such words (e.g. "us" -> "u"), so the token is kept as-is together
        # with its original tag.
        lemmatised_tokens.add((token, tag))

In [18]:
# Show the collected (lemma, tag) pairs. This is a set, so the order of the
# listing is arbitrary.
lemmatised_tokens

{('grin', 'n'),
 ('tried', 'a'),
 ('dot', 'v'),
 ('treacle', 'n'),
 ('meter', 'n'),
 ('my', 'PRP$'),
 ('beady', 'a'),
 ('anxiety', 'n'),
 ('shriek', 'a'),
 ('fix', 'v'),
 ('mom', 'n'),
 ('effort', 'n'),
 ('dribble', 'v'),
 ('shine', 'v'),
 ('teabags', 'n'),
 ('weak', 'a'),
 ('spider', 'a'),
 ('clear', 'a'),
 ('fire', 'n'),
 ('moving', 'n'),
 ('bloke', 'v'),
 ('lecture', 'n'),
 ('archway', 'a'),
 ('swap', 'v'),
 ('sec', 'n'),
 ('de', 'a'),
 ('watery', 'a'),
 ('nightfall', 'DT'),
 ('hid', 'n'),
 ('fair', 'n'),
 ('explode', 'v'),
 ('knowin', 'n'),
 ('choke', 'v'),
 ('measure', 'v'),
 ('zigzag', 'v'),
 ('tomato', 'n'),
 ('bolt', 'v'),
 ('flock', 'n'),
 ('ill', 'n'),
 ('coffee', 'n'),
 ('bring', 'v'),
 ('roar', 'v'),
 ('prefer', 'v'),
 ('drawl', 'v'),
 ('rowboat', 'n'),
 ('truly', 'r'),
 ('take', 'n'),
 ('black-haired', 'a'),
 ('stump', 'n'),
 ('deliver', 'v'),
 ('rot', 'n'),
 ('parking', 'n'),
 ('warn', 'v'),
 ('-where', 'r'),
 ('caught', 'n'),
 ('nonmagic', 'a'),
 ('stair', 'n'),
 ('think