# [NLP Intro](https://github.com/kokchun/Maskininlarning-AI21/blob/main/Lectures/Lec11-NLP_intro.ipynb)


In [1]:
import numpy as np


In [4]:
review1 = "I LOVE this book about love"
review2 = "No this book was okay"

all_words = [text.lower().split() for text in [review1, review2]]
all_words


[['i', 'love', 'this', 'book', 'about', 'love'],
 ['no', 'this', 'book', 'was', 'okay']]

In [7]:
all_words_flatten = [word for text in all_words for word in text]
all_words_flatten


['i',
 'love',
 'this',
 'book',
 'about',
 'love',
 'no',
 'this',
 'book',
 'was',
 'okay']

In [11]:
unique_words = set(all_words_flatten)
unique_words


{'about', 'book', 'i', 'love', 'no', 'okay', 'this', 'was'}

In [13]:
vocabulary = {word: index for index, word in enumerate(unique_words)}
vocabulary


{'was': 0,
 'this': 1,
 'no': 2,
 'okay': 3,
 'about': 4,
 'i': 5,
 'book': 6,
 'love': 7}

In [14]:
term_frequency = np.zeros(len(vocabulary))
term_frequency


array([0., 0., 0., 0., 0., 0., 0., 0.])

In [17]:
def term_frequency_vecorizer(document, vocabulary):
    term_frequency = np.zeros(len(vocabulary))
    for word in document.lower().split():
        index = vocabulary[word]
        term_frequency[index] += 1
    return term_frequency


review1_term_freq = term_frequency_vecorizer(review1, vocabulary)
review2_term_freq = term_frequency_vecorizer(review2, vocabulary)
print(review1_term_freq)
print(review2_term_freq)


[0. 1. 0. 0. 1. 1. 1. 2.]
[1. 1. 1. 1. 0. 0. 1. 0.]


In [18]:
import pandas as pd


In [20]:
bag_of_words = pd.DataFrame(
    [review1_term_freq, review2_term_freq], columns=vocabulary.keys(), dtype="int16"
)
bag_of_words


Unnamed: 0,was,this,no,okay,about,i,book,love
0,0,1,0,0,1,1,1,2
1,1,1,1,1,0,0,1,0


## Feature extraction with sklearn

In [29]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

count_vectorizer = CountVectorizer()
bag_of_words_sparse = count_vectorizer.fit_transform([review1, review2])
bag_of_words_sparse.todense(), count_vectorizer.get_feature_names_out()

(matrix([[1, 1, 2, 0, 0, 1, 0],
         [0, 1, 0, 1, 1, 1, 1]]),
 array(['about', 'book', 'love', 'no', 'okay', 'this', 'was'], dtype=object))

In [27]:
bag_of_words_df = pd.DataFrame(bag_of_words_sparse.todense(), columns=count_vectorizer.get_feature_names_out())
bag_of_words_df

## TF-IDF

In [31]:
tfidf_transformer = TfidfTransformer()
tfidf = tfidf_transformer.fit_transform(bag_of_words_sparse)
tfidf

<2x7 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [32]:
tfidf.todense()

matrix([[0.4078241 , 0.29017021, 0.81564821, 0.        , 0.        ,
         0.29017021, 0.        ],
        [0.        , 0.35520009, 0.        , 0.49922133, 0.49922133,
         0.35520009, 0.49922133]])

In [33]:
tfidf_vecorizer = TfidfVectorizer()
tfidf_vecorizer.fit_transform([review1, review2]).todense()

matrix([[0.4078241 , 0.29017021, 0.81564821, 0.        , 0.        ,
         0.29017021, 0.        ],
        [0.        , 0.35520009, 0.        , 0.49922133, 0.49922133,
         0.35520009, 0.49922133]])