# Class content

In [3]:
pip install nltk

Collecting nltk
  Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
Collecting tqdm
  Downloading tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
Collecting regex>=2021.8.3
  Downloading regex-2021.11.10-cp38-cp38-win_amd64.whl (273 kB)
Installing collected packages: tqdm, regex, nltk
Successfully installed nltk-3.6.5 regex-2021.11.10 tqdm-4.62.3
Note: you may need to restart the kernel to use updated packages.


## Work tokeninzing

In [1]:
msg = 'Hey everyone! The party starts in 10mins. Be there ASAP!'
msg.split()

['Hey',
 'everyone!',
 'The',
 'party',
 'starts',
 'in',
 '10mins.',
 'Be',
 'there',
 'ASAP!']

In [2]:
from nltk.tokenize import word_tokenize
tokenized_word = word_tokenize(msg)
tokenized_word

['Hey',
 'everyone',
 '!',
 'The',
 'party',
 'starts',
 'in',
 '10mins',
 '.',
 'Be',
 'there',
 'ASAP',
 '!']

## Stemming

In [4]:
from nltk.stem import PorterStemmer
porter = PorterStemmer()
porter.stem('running')


'run'

## Lemmatization

In [5]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
print(lem.lemmatize("running", pos = 'n'))
print(lem.lemmatize("running", pos = 'v'))


running
run


In [6]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()
print(lem.lemmatize("better", pos = 'a' ) )
print(lem.lemmatize("ate", pos = 'v' ) )


good
eat


## Part of speech

In [8]:
import nltk
text = "Can you please buy me an Arizona Ice Tea? It is $9.99."

In [10]:
from nltk.tokenize import word_tokenize
tokens = word_tokenize(msg)
print("parts of the speech: " , nltk.pos_tag(tokens))

parts of the speech:  [('Hey', 'NNP'), ('everyone', 'NN'), ('!', '.'), ('The', 'DT'), ('party', 'NN'), ('starts', 'VBZ'), ('in', 'IN'), ('10mins', 'CD'), ('.', '.'), ('Be', 'VB'), ('there', 'EX'), ('ASAP', 'NNP'), ('!', '.')]


## Bag Of Words

In [11]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
text_data = np.array(['I love Brazil. Brazil!',
                     'Sweden is best',
                     'Germany beats both'])

In [13]:
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

In [16]:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [19]:
count.get_feature_names_out()

array(['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love',
       'sweden'], dtype=object)

## TF-IDF

In [22]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
text_data = np.array(['l love Brazil. Brazil! '
'Sweden is best' ,
'Germany beats both' ])


In [25]:
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

In [27]:
feature_matrix.toarray()

array([[0.        , 0.35355339, 0.        , 0.70710678, 0.        ,
        0.35355339, 0.35355339, 0.35355339],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [28]:
tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}