In [19]:
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag_sents, map_tag
from nltk.tokenize import word_tokenize

In [20]:
# Penn Treebank part-of-speech tags that are kept, with human-readable labels.
_PENN_TAG_LABELS = {
    # Function words
    "CC": "Coordinating conjunction",
    "DT": "Determiner",
    "FW": "Foreign word",
    "IN": "Preposition or subordinating conjunction",
    # Adjectives
    "JJ": "Adjective",
    "JJR": "Comparative adjective",
    "JJS": "Superlative adjective",
    "MD": "Modal",
    # Nouns
    "NN": "Noun",
    "NNS": "Plural noun",
    # Determiner-like words and pronouns
    "PDT": "Predeterminer",
    "PRP": "Personal pronoun",
    "PRP$": "Possessive pronoun",
    # Adverbs
    "RB": "Adverb",
    "RBR": "Comparative adverb",
    "RBS": "Superlative adverb",
    "RP": "Particle",
    "UH": "Interjection",
    # Verbs
    "VB": "Verb (base form)",
    "VBD": "Verb (past tense)",
    "VBG": "Verb (gerund or present participle)",
    "VBN": "Verb (past participle)",
    "VBP": "Verb (non-3rd person singular present)",
    "VBZ": "Verb (3rd person singular present)",
}

# Single-letter part-of-speech tags, with human-readable labels.
_SHORT_TAG_LABELS = {
    "a": "Adjective",
    "n": "Noun",
    "r": "Adverb",
    "v": "verb",
}

# Tag -> label lookup. A token is treated as relevant when its tag is a key of
# this mapping. Penn tags come first, then the single-letter tags.
relevant_tags = {**_PENN_TAG_LABELS, **_SHORT_TAG_LABELS}


In [21]:
# Penn Treebank POS tags -> WordNet lemmatiser POS tags.
#
# The values are the single-letter codes passed as the `pos` argument of the
# lemmatiser: "a" (adjective), "n" (noun), "r" (adverb), "v" (verb).
# None marks a tag that has no WordNet part of speech.
pos_tags_map = {
    "CC": None,
    "DT": None,
    "FW": None,
    # Prepositions / subordinating conjunctions are not adjectives and WordNet
    # has no part of speech for them, so they must not be mapped to "a".
    "IN": None,
    "JJ": "a",
    "JJR": "a",
    "JJS": "a",
    "MD": None,
    "NN": "n",
    "NNS": "n",
    "PDT": None,
    "PRP": None,
    "PRP$": None,
    "RB": "r",
    "RBR": "r",
    "RBS": "r",
    "RP": None,
    "UH": None,
    "VB": "v",
    "VBD": "v",
    "VBG": "v",
    "VBN": "v",
    "VBP": "v",
    "VBZ": "v"
}

In [22]:
def is_relevant_token(token_tag_tuple) -> bool:
    """Tell whether a tagged token should be kept.

    Args:
        token_tag_tuple: A two-item ``(token, tag)`` pair; only the tag is
            inspected.

    Returns:
        True when the tag is one of the keys of ``relevant_tags``.
    """
    _token, tag_name = token_tag_tuple
    return tag_name in relevant_tags

In [23]:
# Read the sample text as a list of lines (line endings are kept).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the sample file is UTF-8 encoded — confirm.
with open('assets/dev-samples/harry-potter.content.txt', 'r', encoding='utf-8') as f:
    contents = f.readlines()

In [24]:
# Tokenise: build one token list per line of `contents`, dropping lines that
# produce no tokens at all.
tokenised_sents = [
    line_tokens
    for line_tokens in map(word_tokenize, contents)
    if line_tokens
]

In [31]:
# Tag
# Part-of-speech tag all token lists in a single batch call. The result holds,
# for each token list, a list of (token, tag) pairs.
tagged_sentences = pos_tag_sents(tokenised_sents)

In [32]:
# Preview only the first two tagged sentences; displaying the whole tagged
# text floods the notebook with output.
tagged_sentences[:2]

[[('Harry', 'NNP'),
  ('Potter', 'NNP'),
  ('and', 'CC'),
  ('the', 'DT'),
  ('Sorcerer', 'NNP'),
  ("'s", 'POS'),
  ('Stone', 'NN')],
 [('Mr.', 'NNP'),
  ('and', 'CC'),
  ('Mrs.', 'NNP'),
  ('Dursley', 'NNP'),
  (',', ','),
  ('of', 'IN'),
  ('number', 'NN'),
  ('four', 'CD'),
  (',', ','),
  ('Privet', 'NNP'),
  ('Drive', 'NNP'),
  (',', ','),
  ('were', 'VBD'),
  ('proud', 'JJ'),
  ('to', 'TO'),
  ('say', 'VB'),
  ('that', 'IN'),
  ('they', 'PRP'),
  ('were', 'VBD'),
  ('perfectly', 'RB'),
  ('normal', 'JJ'),
  (',', ','),
  ('thank', 'NN'),
  ('you', 'PRP'),
  ('very', 'RB'),
  ('much', 'RB'),
  ('.', '.'),
  ('They', 'PRP'),
  ('were', 'VBD'),
  ('the', 'DT'),
  ('last', 'JJ'),
  ('people', 'NNS'),
  ('you', 'PRP'),
  ("'d", 'MD'),
  ('expect', 'VB'),
  ('to', 'TO'),
  ('be', 'VB'),
  ('involved', 'VBN'),
  ('in', 'IN'),
  ('anything', 'NN'),
  ('strange', 'JJ'),
  ('or', 'CC'),
  ('mysterious', 'JJ'),
  (',', ','),
  ('because', 'IN'),
  ('they', 'PRP'),
  ('just', 'RB'),
  ('did

In [27]:
# Only keep wanted tokens: flatten the tagged sentences into one list of
# (token, tag) pairs, keeping the pairs accepted by `is_relevant_token`.
relevant_tokens = [
    tagged_token
    for tagged_sentence in tagged_sentences
    for tagged_token in tagged_sentence
    if is_relevant_token(tagged_token)
]

In [28]:
# Normalise

# Lower-case the token text of every pair; the tag is carried over untouched.
relevant_tokens = [(pair[0].lower(), pair[1]) for pair in relevant_tokens]


In [29]:
# Lemmatise
# Build the set of unique (lemma, tag) pairs from the relevant tokens.
lemmatiser = WordNetLemmatizer()
lemmatised_tokens = set()
for token, tag in relevant_tokens:
    # .get() instead of [] so a tag that passed the relevance filter but has
    # no entry in the map is handled like a tag mapped to None rather than
    # raising KeyError.
    lemmatag = pos_tags_map.get(tag)
    if lemmatag:
        # The tag has a WordNet part of speech: lemmatise with it and record
        # the pair under the WordNet tag.
        lemmatised_tokens.add((lemmatiser.lemmatize(token, pos=lemmatag), lemmatag))
    else:
        # No WordNet part of speech for this tag. Keep the token unchanged:
        # calling lemmatize() without `pos` treats the word as a noun, which
        # mangles function words (e.g. "us" -> "u", "as" -> "a").
        lemmatised_tokens.add((token, tag))

In [30]:
# Show the set of unique (lemma, tag) pairs.
# NOTE(review): the saved output of this cell is an empty set, and this cell
# (execution count 30) was run before the tagging cell (execution count 31),
# so the saved output looks stale — restart the kernel and run all cells in
# order to confirm.
lemmatised_tokens

set()