## По материалам курса "Вводный курс ML" (Дмитрий Макаров). Тема "Обработка естественного языка"

In [1]:
import nltk
import pandas as pd
import numpy as np

In [2]:
corpus = 'When we were in Paris we visited a lot of museums. We first went to the Louvre, the largest art museum in the world. I have always been interested in art so I spent many hours there. The museum is enourmous, so a week there would not be enough.'

### Разделение на предложения

In [3]:
from nltk.tokenize import sent_tokenize

In [4]:
nltk.download('punkt')

sentences = sent_tokenize(corpus)
print(sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['When we were in Paris we visited a lot of museums.', 'We first went to the Louvre, the largest art museum in the world.', 'I have always been interested in art so I spent many hours there.', 'The museum is enourmous, so a week there would not be enough.']


### Разделение на слова

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
print(word_tokenize(sentences[0]))

['When', 'we', 'were', 'in', 'Paris', 'we', 'visited', 'a', 'lot', 'of', 'museums', '.']


In [7]:
tokens = []
 
for sentence in sentences:
    t = word_tokenize(sentence)
    tokens.extend(t)
 
print(tokens)

['When', 'we', 'were', 'in', 'Paris', 'we', 'visited', 'a', 'lot', 'of', 'museums', '.', 'We', 'first', 'went', 'to', 'the', 'Louvre', ',', 'the', 'largest', 'art', 'museum', 'in', 'the', 'world', '.', 'I', 'have', 'always', 'been', 'interested', 'in', 'art', 'so', 'I', 'spent', 'many', 'hours', 'there', '.', 'The', 'museum', 'is', 'enourmous', ',', 'so', 'a', 'week', 'there', 'would', 'not', 'be', 'enough', '.']


### Перевод в нижний регистр, удаление стоп-слов и знаков пунктуации

In [8]:
from nltk.corpus import stopwords

In [9]:
nltk.download('stopwords')

unique_stops = set(stopwords.words('english'))
 
no_stops = []
 
for token in tokens:
    token = token.lower()
    if token not in unique_stops and token.isalpha():
        no_stops.append(token)
 
print(no_stops)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['paris', 'visited', 'lot', 'museums', 'first', 'went', 'louvre', 'largest', 'art', 'museum', 'world', 'always', 'interested', 'art', 'spent', 'many', 'hours', 'museum', 'enourmous', 'week', 'would', 'enough']


### Лемматизация

In [10]:
from nltk.stem import WordNetLemmatizer

In [11]:
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
 
lemmatized = []
 
for token in no_stops:
    token = lemmatizer.lemmatize(token)
    lemmatized.append(token)
    
print(lemmatized)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
['paris', 'visited', 'lot', 'museum', 'first', 'went', 'louvre', 'largest', 'art', 'museum', 'world', 'always', 'interested', 'art', 'spent', 'many', 'hour', 'museum', 'enourmous', 'week', 'would', 'enough']


### Стемминг

In [12]:
from nltk.stem import PorterStemmer

In [13]:
porter = PorterStemmer()
stemmed_p = [porter.stem(s) for s in lemmatized]
print(stemmed_p)

['pari', 'visit', 'lot', 'museum', 'first', 'went', 'louvr', 'largest', 'art', 'museum', 'world', 'alway', 'interest', 'art', 'spent', 'mani', 'hour', 'museum', 'enourm', 'week', 'would', 'enough']


In [14]:
from nltk.stem import LancasterStemmer

In [15]:
lancaster = LancasterStemmer()
stemmed_l = [lancaster.stem(s) for s in lemmatized]
print(stemmed_l)

['par', 'visit', 'lot', 'muse', 'first', 'went', 'louvr', 'largest', 'art', 'muse', 'world', 'alway', 'interest', 'art', 'spent', 'many', 'hour', 'muse', 'enourm', 'week', 'would', 'enough']


### Мешок слов (bag of words, bow)

In [16]:
from collections import Counter

In [17]:
bow_counter = Counter(lemmatized)
print(bow_counter.most_common(10))

[('museum', 3), ('art', 2), ('paris', 1), ('visited', 1), ('lot', 1), ('first', 1), ('went', 1), ('louvre', 1), ('largest', 1), ('world', 1)]


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
vectorizer = CountVectorizer(analyzer = "word", 
                             lowercase = True, 
                             tokenizer = None, 
                             preprocessor = None, 
                             stop_words = {'english'}, 
                             max_features = 5000)

In [20]:
bow_cv = vectorizer.fit_transform(sentences)

In [21]:
print(type(bow_cv))

<class 'scipy.sparse.csr.csr_matrix'>


In [22]:
print(bow_cv.toarray())

[[0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 2 0 0 1 1 0 0]
 [0 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 3 0 1 0 1 0 1 0 0 1 0]
 [1 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1]]


In [23]:
vocab = vectorizer.vocabulary_
print(vocab)
 
tokens = vectorizer.get_feature_names_out()
print(tokens)

{'when': 31, 'we': 27, 'were': 30, 'in': 9, 'paris': 20, 'visited': 26, 'lot': 13, 'of': 19, 'museums': 17, 'first': 6, 'went': 29, 'to': 25, 'the': 23, 'louvre': 14, 'largest': 12, 'art': 1, 'museum': 16, 'world': 32, 'have': 7, 'always': 0, 'been': 3, 'interested': 10, 'so': 21, 'spent': 22, 'many': 15, 'hours': 8, 'there': 24, 'is': 11, 'enourmous': 5, 'week': 28, 'would': 33, 'not': 18, 'be': 2, 'enough': 4}
['always' 'art' 'be' 'been' 'enough' 'enourmous' 'first' 'have' 'hours'
 'in' 'interested' 'is' 'largest' 'lot' 'louvre' 'many' 'museum' 'museums'
 'not' 'of' 'paris' 'so' 'spent' 'the' 'there' 'to' 'visited' 'we' 'week'
 'went' 'were' 'when' 'world' 'would']


In [24]:
index_list = []
 
for i, _ in enumerate(bow_cv):
 
    index_list.append(f'Sentence_{i}')
 
bow_cv_df = pd.DataFrame(data = bow_cv.toarray(), 
                         index = index_list, 
                         columns = tokens)

In [25]:
print(bow_cv_df)

            always  art  be  been  enough  ...  went  were  when  world  would
Sentence_0       0    0   0     0       0  ...     0     1     1      0      0
Sentence_1       0    1   0     0       0  ...     1     0     0      1      0
Sentence_2       1    1   0     1       0  ...     0     0     0      0      0
Sentence_3       0    0   1     0       1  ...     0     0     0      0      1

[4 rows x 34 columns]


### TF-IDF

Из Википедии "Если документ содержит 100 слов, и слово[3] «заяц» встречается в нём 3 раза, то частота слова (TF) для слова «заяц» в документе будет 0,03 (3/100). Вычислим IDF как десятичный логарифм отношения количества всех документов к количеству документов, содержащих слово «заяц». Таким образом, если «заяц» содержится в 1000 документах из 10 000 000 документов, то IDF будет равной: log(10 000 000/1000) = 4. Для расчета окончательного значения веса слова необходимо TF умножить на IDF. В данном примере, TF-IDF вес для слова «заяц» в выбранном документе будет равен: 0,03 × 4 = 0,12"

Логика формулы следующая, чем выше частота слова в документе (tf) и чем реже оно встречается в целом в документах (idf), тем выше общий показатель (tf-idf).

 ### IDF

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
tfidf_trans = TfidfTransformer(smooth_idf = True, use_idf = True)

In [28]:
tfidf_trans.fit(bow_cv)

TfidfTransformer()

In [29]:
df_idf = pd.DataFrame(tfidf_trans.idf_, index = tokens, columns = ["idf_weights"])

In [33]:
print(df_idf)

            idf_weights
always         1.916291
art            1.510826
be             1.916291
been           1.916291
enough         1.916291
enourmous      1.916291
first          1.916291
have           1.916291
hours          1.916291
in             1.223144
interested     1.916291
is             1.916291
largest        1.916291
lot            1.916291
louvre         1.916291
many           1.916291
museum         1.510826
museums        1.916291
not            1.916291
of             1.916291
paris          1.916291
so             1.510826
spent          1.916291
the            1.510826
there          1.510826
to             1.916291
visited        1.916291
we             1.510826
week           1.916291
went           1.916291
were           1.916291
when           1.916291
world          1.916291
would          1.916291


### TF x IDF

In [30]:
tf_idf_vector = tfidf_trans.transform(bow_cv)
tf_idf_vector

<4x34 sparse matrix of type '<class 'numpy.float64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [31]:
df_tfidf = pd.DataFrame(tf_idf_vector.toarray(), columns = vectorizer.get_feature_names_out())

In [32]:
print(df_tfidf.T)

                   0         1         2         3
always      0.000000  0.000000  0.328404  0.000000
art         0.000000  0.211724  0.258918  0.000000
be          0.000000  0.000000  0.000000  0.324676
been        0.000000  0.000000  0.328404  0.000000
enough      0.000000  0.000000  0.000000  0.324676
enourmous   0.000000  0.000000  0.000000  0.324676
first       0.000000  0.268544  0.000000  0.000000
have        0.000000  0.000000  0.328404  0.000000
hours       0.000000  0.000000  0.328404  0.000000
in          0.202925  0.171408  0.209616  0.000000
interested  0.000000  0.000000  0.328404  0.000000
is          0.000000  0.000000  0.000000  0.324676
largest     0.000000  0.268544  0.000000  0.000000
lot         0.317921  0.000000  0.000000  0.000000
louvre      0.000000  0.268544  0.000000  0.000000
many        0.000000  0.000000  0.328404  0.000000
museum      0.000000  0.211724  0.000000  0.255978
museums     0.317921  0.000000  0.000000  0.000000
not         0.000000  0.000000 

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
tfIdfVectorizer = TfidfVectorizer(use_idf = True, stop_words= 'english')

In [36]:
tfIdf = tfIdfVectorizer.fit_transform(sentences)
tfIdf

<4x15 sparse matrix of type '<class 'numpy.float64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [37]:
print(tfIdfVectorizer.get_feature_names_out())

['art' 'enourmous' 'hours' 'interested' 'largest' 'lot' 'louvre' 'museum'
 'museums' 'paris' 'spent' 'visited' 'week' 'went' 'world']


In [38]:
tfIdfVectorizer.idf_

array([1.51082562, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.51082562, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073])

In [39]:
df_idf = pd.DataFrame(tfIdfVectorizer.idf_, index = tfIdfVectorizer.get_feature_names_out(), columns = ['idf_weights'])

In [40]:
print(df_idf)

            idf_weights
art            1.510826
enourmous      1.916291
hours          1.916291
interested     1.916291
largest        1.916291
lot            1.916291
louvre         1.916291
museum         1.510826
museums        1.916291
paris          1.916291
spent          1.916291
visited        1.916291
week           1.916291
went           1.916291
world          1.916291
