## Importing libraries and dataset

In [75]:
import os
import nltk
import nltk.corpus
from nltk.corpus import webtext

nltk.download('webtext')
plain_text = webtext.raw()

print(plain_text[:200])

Cookie Manager: "Don't allow sites that set removed cookies to set future cookies" should stay checked
When in full screen mode
Pressing Ctrl-N should open a new browser when only download dialog is


[nltk_data] Downloading package webtext to
[nltk_data]     /Users/partnadem/nltk_data...
[nltk_data]   Package webtext is already up-to-date!


## Tokenizing plain text

In [76]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

raw_token_list = word_tokenize(plain_text, 'en', True)
print(raw_token_list[:10], len(raw_token_list))

['Cookie', 'Manager', ':', '``', 'Do', "n't", 'allow', 'sites', 'that', 'set'] 366313


## Deleting stop words

In [77]:
from nltk.corpus import stopwords

nltk.download('stopwords')

forbidden_words = stopwords.words('english')

cleared_from_stopwords_tokens = list()

for token in raw_token_list:
    if token not in forbidden_words:
        cleared_from_stopwords_tokens.append(token)

print(cleared_from_stopwords_tokens[:10])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/partnadem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['Cookie', 'Manager', ':', '``', 'Do', "n't", 'allow', 'sites', 'set', 'removed']


## Normalizing tokens

In [78]:
import re

special_character_regex = '[^a-zA-Z0-9]+'

normalized_token_list = list()

for token in cleared_from_stopwords_tokens:
    is_forbidden_word = bool(re.search(special_character_regex, token))

    if is_forbidden_word:
        continue

    normalized_token_list.append(token.lower())

print(normalized_token_list[:30])

['cookie', 'manager', 'do', 'allow', 'sites', 'set', 'removed', 'cookies', 'set', 'future', 'cookies', 'stay', 'checked', 'when', 'full', 'screen', 'mode', 'pressing', 'open', 'new', 'browser', 'download', 'dialog', 'left', 'open', 'add', 'icons', 'context', 'menu', 'so']


## Checking frequency of tokens

In [79]:
frequency_map = FreqDist()

for token in normalized_token_list:
    frequency_map[token.lower()]+=1

frequency_map

FreqDist({'i': 7803, 'girl': 2938, 'guy': 2725, '1': 2091, 'like': 1654, '2': 1647, 'you': 1303, 'man': 984, 'woman': 979, 'know': 906, ...})

## Stemming

In [80]:
from nltk.stem import PorterStemmer

porter_stemmer = PorterStemmer()

stemmed_tokens = list()

for token in normalized_token_list:
    stemmed_tokens.append(porter_stemmer.stem(token))

print('normalized', normalized_token_list[:10])
print("stemmed", stemmed_tokens[:10])

normalized ['cookie', 'manager', 'do', 'allow', 'sites', 'set', 'removed', 'cookies', 'set', 'future']
stemmed ['cooki', 'manag', 'do', 'allow', 'site', 'set', 'remov', 'cooki', 'set', 'futur']


## Lemmatization

In [81]:
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()

lemmatized_tokens = list()

for token in normalized_token_list:
    lemmatized_tokens.append(lemmatizer.lemmatize(token))

print("normalized", normalized_token_list[:10])
print("lemmatized", lemmatized_tokens[:10])

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/partnadem/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


normalized ['cookie', 'manager', 'do', 'allow', 'sites', 'set', 'removed', 'cookies', 'set', 'future']
lemmatized ['cookie', 'manager', 'do', 'allow', 'site', 'set', 'removed', 'cooky', 'set', 'future']
