# Section 3: Vector Models and Text Preprocessing

In [24]:
# Some key imports
import nltk
import scipy
import sklearn

# Download used nltk stuffs
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')


[nltk_data] Downloading package stopwords to /home/david/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/david/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/david/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/david/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/david/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/david/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

## Count Vectorizer

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'This is the first document.',
    'This is the second document.',
    'What if this is the second document?',
    'I know I am not the first OR second document.',
]

vectorizer = CountVectorizer(lowercase=True, analyzer='word', stop_words='english')
# from nltk.corpus import stopwords
# vectorizer = CountVectorizer(lowercase=True, analyzer='word', stop_words=stopwords.words('english'))
vectorizer.fit(corpus)
display(sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.__getitem__))
display(vectorizer.transform(corpus).toarray())


['document', 'know', 'second']

array([[1, 0, 0],
       [1, 0, 1],
       [1, 0, 1],
       [1, 1, 1]])

## Stemming and Lemmatization

In [21]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
assert (
    stemmer.stem('walking') == stemmer.stem('walks') == stemmer.stem('walked')
), 'the stemmer is buggy'


In [22]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
assert lemmatizer.lemmatize('mice') == 'mouse', 'the lemmatizer is buggy'
assert lemmatizer.lemmatize('going') == 'going', 'no part of speech specified'
assert lemmatizer.lemmatize('going', pos='v') == 'go', 'did not use verb rules'

In [26]:
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

pos_mapping = {'ADJ': 'a', 'ADV': 'r', 'NOUN': 'n', 'VERB': 'v'}
# from https://www.nltk.org/book/ch05.html
sentence = 'They are refusing to permit us to obtain the refuse permit'
tagged_tokens = pos_tag(word_tokenize(sentence), tagset='universal')
lemmatized = []
for token, pos in tagged_tokens:
    lemmatized.append(lemmatizer.lemmatize(token, pos=pos_mapping.get(pos, 'n')))
lemmatized


['They',
 'be',
 'refuse',
 'to',
 'permit',
 'u',
 'to',
 'obtain',
 'the',
 'refuse',
 'permit']

## Count Vectorizer
See [count-vectorizer.ipynb](count-vectorizer.ipynb).

## Recommender Exercise
See [recommender.ipynb](recommender.ipynb).

## Neural Word Embeddings
See [word-embeddings.ipynb](word-embeddings.ipynb).