In [67]:
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag_sents, map_tag
from nltk.tokenize import word_tokenize

In [68]:
map_tag(source="en-ptb", target="universal", source_tag="IN")

'ADP'

In [69]:
# Maybe use https://universaldependencies.org/u/pos/index.html
# or map to them if they're not supported by nltk
# (it's most used multilingual tagset)
relevant_tags = {
    "CC": "Coordinating conjunction",
    "DT": "Determiner",
    "FW": "Foreign word",
    "IN": "Preposition or subordinating conjunction",
    "JJ": "Adjective",
    "JJR": "Comparative adjective",
    "JJS": "Superlative adjective",
    "MD": "Modal",
    "NN": "Noun",
    "NNS": "Plural noun",
    "PDT": "Predeterminer",
    "PRP": "Personal pronoun",
    "PRP$": "Possessive pronoun",
    "RB": "Adverb",
    "RBR": "Comparative adverb",
    "RBS": "Superlative adverb",
    "RP": "Particle",
    "UH": "Interjection",
    "VB": "Verb (base form)",
    "VBD": "Verb (past tense)",
    "VBG": "Verb (gerund or present participle)",
    "VBN": "Verb (past participle)",
    "VBP": "Verb (non-3rd person singular present)",
    "VBZ": "Verb (3rd person singular present)",
    "a": "Adjective",
    "n": "Noun",
    "r": "Adverb",
    "v": "verb",
}


In [70]:
# POS Pennstate tags to WordnetLemmatiser tags
pos_tags_map = {
    "CC": None,
    "DT": None,
    "FW": None,
    "IN": "a",
    "JJ": "a",
    "JJR": "a",
    "JJS": "a",
    "MD": None,
    "NN": "n",
    "NNS": "n",
    "PDT": None,
    "PRP": None,
    "PRP$": None,
    "RB": "r",
    "RBR": "r",
    "RBS": "r",
    "RP": None,
    "UH": None,
    "VB": "v",
    "VBD": "v",
    "VBG": "v",
    "VBN": "v",
    "VBP": "v",
    "VBZ": "v"
}

In [89]:
_UNIVERSAL_TAGS = (
    "VERB",
    "NOUN",
    "PRON",
    "ADJ",
    "ADV",
    "ADP",
    "CONJ",
    "DET",
    "NUM",
    "PRT",
    "X",
    ".",
)
tagmap_universal_lemmatizer = {
    "VERB": "v",
    "NOUN": "n",
    "ADJ": "a",
    "ADV": "r",
}

relevant_universal = (
    "VERB",
    "NOUN",
    "ADJ",
    "ADV",
    "ADP",
    "CONJ",
    "DET",
)

In [72]:
def is_relevant_tag(tag) -> bool:
    return tag in relevant_universal

In [73]:
with open('assets/dev-samples/harry-potter.content.txt', 'r') as f:
    contents = f.readlines()

In [74]:
with open('assets/stopwords/en.txt', 'r') as f:
    stop_words = [w.strip() for w in f.readlines()]

In [75]:
stop_words

['a',
 "a's",
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren't",
 'around',
 'as',
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'b',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 'c',
 "c'mon",
 "c's",
 'came',
 'can',
 "can't",
 'cannot',
 'cant',
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 'co',
 'com',
 'come',
 'c

In [76]:
contents

['\n',
 "Harry Potter and the Sorcerer's Stone\n",
 "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. \n",
 'Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. The Dursleys had a small son called Dudley and in their opinion there was no finer boy anywhere. \n',
 "The Dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. They didn't think they could bear it if anyone found out about the Potters. Mrs. Pott

In [77]:
# Tokenise
tokenised_sents = []
for c in contents:
    tokens = word_tokenize(c)
    if tokens:
        tokenised_sents.append(tokens)

In [78]:
tokenised_sents = [
    [t for t in sent if t not in stop_words] for sent in tokenised_sents
]

In [79]:
tokenised_sents

[['Harry', 'Potter', 'Sorcerer', "'s", 'Stone'],
 ['Mr.',
  'Mrs.',
  'Dursley',
  ',',
  'number',
  ',',
  'Privet',
  'Drive',
  ',',
  'proud',
  'perfectly',
  'normal',
  ',',
  '.',
  'They',
  'people',
  "'d",
  'expect',
  'involved',
  'strange',
  'mysterious',
  ',',
  "n't",
  'hold',
  'nonsense',
  '.'],
 ['Mr.',
  'Dursley',
  'director',
  'firm',
  'called',
  'Grunnings',
  ',',
  'made',
  'drills',
  '.',
  'He',
  'big',
  ',',
  'beefy',
  'man',
  'neck',
  ',',
  'large',
  'mustache',
  '.',
  'Mrs.',
  'Dursley',
  'thin',
  'blonde',
  'usual',
  'amount',
  'neck',
  ',',
  'spent',
  'time',
  'craning',
  'garden',
  'fences',
  ',',
  'spying',
  'neighbors',
  '.',
  'The',
  'Dursleys',
  'small',
  'son',
  'called',
  'Dudley',
  'opinion',
  'finer',
  'boy',
  '.'],
 ['The',
  'Dursleys',
  'wanted',
  ',',
  'secret',
  ',',
  'greatest',
  'fear',
  'discover',
  '.',
  'They',
  "n't",
  'bear',
  'found',
  'Potters',
  '.',
  'Mrs.',
  'Potte

In [80]:
# Tag
tagged_sentences = pos_tag_sents(tokenised_sents)
# We use the default tagset (en-ptb). It's more finegrained than the universal tagset,
# which is not needed as we only want to filter out numbers and proper nouns which are
# also included in the universal tagset. However, for some reason the nltk pos tagger
# performs worse with proper nouns when using the unievrsal tagset (Harry tagged as NNP turns out as
# NOUN rather than NOUNP)

In [81]:
tagged_sentences

[[('Harry', 'NNP'),
  ('Potter', 'NNP'),
  ('Sorcerer', 'NNP'),
  ("'s", 'POS'),
  ('Stone', 'NN')],
 [('Mr.', 'NNP'),
  ('Mrs.', 'NNP'),
  ('Dursley', 'NNP'),
  (',', ','),
  ('number', 'NN'),
  (',', ','),
  ('Privet', 'NNP'),
  ('Drive', 'NNP'),
  (',', ','),
  ('proud', 'VBZ'),
  ('perfectly', 'RB'),
  ('normal', 'JJ'),
  (',', ','),
  ('.', '.'),
  ('They', 'PRP'),
  ('people', 'NNS'),
  ("'d", 'MD'),
  ('expect', 'VB'),
  ('involved', 'JJ'),
  ('strange', 'JJ'),
  ('mysterious', 'NNS'),
  (',', ','),
  ("n't", 'RB'),
  ('hold', 'VB'),
  ('nonsense', 'NN'),
  ('.', '.')],
 [('Mr.', 'NNP'),
  ('Dursley', 'NNP'),
  ('director', 'NN'),
  ('firm', 'NN'),
  ('called', 'VBD'),
  ('Grunnings', 'NNP'),
  (',', ','),
  ('made', 'VBD'),
  ('drills', 'NNS'),
  ('.', '.'),
  ('He', 'PRP'),
  ('big', 'JJ'),
  (',', ','),
  ('beefy', 'JJ'),
  ('man', 'NN'),
  ('neck', 'NN'),
  (',', ','),
  ('large', 'JJ'),
  ('mustache', 'NN'),
  ('.', '.'),
  ('Mrs.', 'NNP'),
  ('Dursley', 'NNP'),
  ('thin', 

In [82]:
# Map en-ptb to universal tagset
tagged_sentences_universal = []
for sentence in tagged_sentences:
    sentence_new = [
        (token, map_tag(source='en-ptb', target='universal', source_tag=tag))
        if tag != 'NNP'
        else (token, 'PROPN')
        for token, tag in sentence
    ]
    tagged_sentences_universal.append(sentence_new)

In [83]:
tagged_sentences_universal

[[('Harry', 'NOUNP'),
  ('Potter', 'NOUNP'),
  ('Sorcerer', 'NOUNP'),
  ("'s", 'PRT'),
  ('Stone', 'NOUN')],
 [('Mr.', 'NOUNP'),
  ('Mrs.', 'NOUNP'),
  ('Dursley', 'NOUNP'),
  (',', '.'),
  ('number', 'NOUN'),
  (',', '.'),
  ('Privet', 'NOUNP'),
  ('Drive', 'NOUNP'),
  (',', '.'),
  ('proud', 'VERB'),
  ('perfectly', 'ADV'),
  ('normal', 'ADJ'),
  (',', '.'),
  ('.', '.'),
  ('They', 'PRON'),
  ('people', 'NOUN'),
  ("'d", 'VERB'),
  ('expect', 'VERB'),
  ('involved', 'ADJ'),
  ('strange', 'ADJ'),
  ('mysterious', 'NOUN'),
  (',', '.'),
  ("n't", 'ADV'),
  ('hold', 'VERB'),
  ('nonsense', 'NOUN'),
  ('.', '.')],
 [('Mr.', 'NOUNP'),
  ('Dursley', 'NOUNP'),
  ('director', 'NOUN'),
  ('firm', 'NOUN'),
  ('called', 'VERB'),
  ('Grunnings', 'NOUNP'),
  (',', '.'),
  ('made', 'VERB'),
  ('drills', 'NOUN'),
  ('.', '.'),
  ('He', 'PRON'),
  ('big', 'ADJ'),
  (',', '.'),
  ('beefy', 'ADJ'),
  ('man', 'NOUN'),
  ('neck', 'NOUN'),
  (',', '.'),
  ('large', 'ADJ'),
  ('mustache', 'NOUN'),
  ('.'

In [90]:
# Only keep wanted tokens
tagged_sents_relevant = []
for ts in tagged_sentences_universal:
    tagged_sents_relevant += filter(lambda x: is_relevant_tag(x[1]), ts)

In [91]:
tagged_sents_relevant

[('Stone', 'NOUN'),
 ('number', 'NOUN'),
 ('proud', 'VERB'),
 ('perfectly', 'ADV'),
 ('normal', 'ADJ'),
 ('people', 'NOUN'),
 ("'d", 'VERB'),
 ('expect', 'VERB'),
 ('involved', 'ADJ'),
 ('strange', 'ADJ'),
 ('mysterious', 'NOUN'),
 ("n't", 'ADV'),
 ('hold', 'VERB'),
 ('nonsense', 'NOUN'),
 ('director', 'NOUN'),
 ('firm', 'NOUN'),
 ('called', 'VERB'),
 ('made', 'VERB'),
 ('drills', 'NOUN'),
 ('big', 'ADJ'),
 ('beefy', 'ADJ'),
 ('man', 'NOUN'),
 ('neck', 'NOUN'),
 ('large', 'ADJ'),
 ('mustache', 'NOUN'),
 ('thin', 'ADJ'),
 ('blonde', 'ADV'),
 ('usual', 'ADJ'),
 ('amount', 'NOUN'),
 ('neck', 'NOUN'),
 ('spent', 'ADJ'),
 ('time', 'NOUN'),
 ('craning', 'VERB'),
 ('garden', 'NOUN'),
 ('fences', 'NOUN'),
 ('spying', 'VERB'),
 ('neighbors', 'NOUN'),
 ('The', 'DET'),
 ('small', 'ADJ'),
 ('son', 'NOUN'),
 ('called', 'VERB'),
 ('opinion', 'NOUN'),
 ('finer', 'NOUN'),
 ('boy', 'NOUN'),
 ('The', 'DET'),
 ('wanted', 'VERB'),
 ('secret', 'ADJ'),
 ('greatest', 'ADJ'),
 ('fear', 'NOUN'),
 ('discover', 

In [92]:
# Normalise

# Lower
tagged_sents_relevant = list(map(lambda x: (x[0].lower(), x[1]), tagged_sents_relevant))


In [87]:
# Universal -> lemmatise tag mapper

In [95]:
# Lemmatise
lemmatiser = WordNetLemmatizer()
lemmatised_tokens = set()
for rt in tagged_sents_relevant:
    lemmatag = 'n'
    token, tag = rt
    if tag in tagmap_universal_lemmatizer:
        lemmatag = tagmap_universal_lemmatizer[tag]
    lt = lemmatiser.lemmatize(token, pos=lemmatag)
    lemmatised_tokens.add((lt, lemmatag, tag))

In [96]:
lemmatised_tokens

{('engrave', 'v', 'VERB'),
 ('sneeze', 'n', 'NOUN'),
 ('that', 'n', 'ADP'),
 ('aaaaaaaaaargh', 'n', 'NOUN'),
 ('hush', 'v', 'VERB'),
 ('damp', 'n', 'NOUN'),
 ('casually', 'r', 'ADV'),
 ('extra', 'a', 'ADJ'),
 ('whoop', 'n', 'NOUN'),
 ('unfold', 'v', 'VERB'),
 ('baggy', 'n', 'NOUN'),
 ('capture', 'n', 'NOUN'),
 ('ensnare', 'v', 'VERB'),
 ('crate', 'n', 'NOUN'),
 ('sign', 'a', 'ADJ'),
 ('awake', 'v', 'VERB'),
 ('friendly', 'a', 'ADJ'),
 ('noise', 'r', 'ADV'),
 ('toadless', 'a', 'ADJ'),
 ('or', 'n', 'CONJ'),
 ('powder', 'n', 'NOUN'),
 ('direct', 'a', 'ADJ'),
 ('wash', 'n', 'NOUN'),
 ('silly', 'r', 'ADV'),
 ('-it', 'v', 'VERB'),
 ('hung', 'a', 'ADJ'),
 ('plain', 'n', 'NOUN'),
 ('boa', 'n', 'NOUN'),
 ('brave', 'a', 'ADJ'),
 ('boil', 'n', 'NOUN'),
 ('ten', 'r', 'ADV'),
 ('cracker', 'n', 'NOUN'),
 ('famous', 'a', 'ADJ'),
 ('thinking', 'n', 'NOUN'),
 ('uncomfortable', 'a', 'ADJ'),
 ('offered', 'a', 'ADJ'),
 ('phoenix', 'v', 'VERB'),
 ('shred', 'v', 'VERB'),
 ('want', 'v', 'VERB'),
 ('ruddy', '