<a href="https://colab.research.google.com/github/germanjke/Deep_Learning_School_MIPT/blob/master/NLP_seminars/%5Bseminar%5Dtext_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Предобработка текста

In [1]:
import nltk
import spacy
import re

### Токенизация

In [3]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
data = "All work and no play makes jack a dull boy, all work and no play"
tokens = word_tokenize(data.lower())
print(tokens)

['all', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


In [5]:
print(sent_tokenize("I was going home when she rung. It was a surprise."))

['I was going home when she rung.', 'It was a surprise.']


### Удаление неинформативных слов

#### N-граммы

<img src="https://res.cloudinary.com/practicaldev/image/fetch/s--466CQV1q--/c_limit%2Cf_auto%2Cfl_progressive%2Cq_66%2Cw_880/https://thepracticaldev.s3.amazonaws.com/i/78nf1vryed8h1tz05fim.gif" height=400>

In [11]:
unigram = list(nltk.ngrams(tokens, 1))
bigram = list(nltk.ngrams(tokens, 2))
print(unigram[:5])
print(bigram[:5])

[('all',), ('work',), ('and',), ('no',), ('play',)]
[('all', 'work'), ('work', 'and'), ('and', 'no'), ('no', 'play'), ('play', 'makes')]


In [12]:
from nltk import FreqDist
print('Популярные униграммы: ', FreqDist(unigram).most_common(5))
print('Популярные биграммы: ', FreqDist(bigram).most_common(5))

Популярные униграммы:  [(('all',), 2), (('work',), 2), (('and',), 2), (('no',), 2), (('play',), 2)]
Популярные биграммы:  [(('all', 'work'), 2), (('work', 'and'), 2), (('and', 'no'), 2), (('no', 'play'), 2), (('play', 'makes'), 1)]


#### Стоп-слова

In [13]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
stopWords = set(stopwords.words('english'))
print(stopWords)

{'shan', 'hadn', 'she', 'very', 'mightn', 'than', 'hasn', "you'll", 'above', "aren't", 'most', 'a', 'if', 'into', 'while', 'out', 'our', 'will', 'under', 'haven', "mightn't", 'd', 'doing', 'which', "that'll", 'until', 'too', 'off', 'to', 'o', 'me', 'during', 'whom', 'between', 'himself', "she's", 'then', 'weren', 'themselves', 'he', 'yours', 'doesn', 'be', 'being', 'over', 'but', "you'd", 'with', 'any', "wasn't", 'its', 'll', 'only', 'aren', 'i', 'been', 'below', 'theirs', 'what', 'ma', 'on', 't', 'against', 'don', 'won', 'by', 'no', "doesn't", 'should', 'of', 'the', "didn't", 'about', 'having', 'through', 'do', 'an', 'ain', 'my', 'his', "haven't", 'didn', "shan't", 'in', 'does', 'who', 'each', 'her', 'was', 'more', "needn't", "weren't", 'him', 'how', 'their', "mustn't", "you're", 'are', 'or', 'other', "don't", 'you', 'once', 'is', 'have', 'same', 'shouldn', 'nor', "shouldn't", 'where', 'were', 'wasn', 'isn', 'y', 'here', 'for', 'these', 'at', 'them', 'not', "hasn't", 'herself', 'those

In [15]:
print([word for word in tokens if word not in stopWords])

['work', 'play', 'makes', 'jack', 'dull', 'boy', ',', 'work', 'play']


### Стемминг
* процесс нахождения основы слова для заданного исходного слова

In [16]:
from nltk.stem import PorterStemmer, SnowballStemmer
words = ["game", "gaming", "gamed", "games", "compacted"]
words_ru = ['корова', 'мальчики', 'мужчины', 'столом', 'убежала']

In [17]:
ps = PorterStemmer()
list(map(ps.stem, words))

['game', 'game', 'game', 'game', 'compact']

In [18]:
ss = SnowballStemmer(language='russian')
list(map(ss.stem, words_ru))

['коров', 'мальчик', 'мужчин', 'стол', 'убежа']

### Лемматизация
* процесс приведения словоформы к лемме — её нормальной (словарной) форме

In [19]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government.  Supreme executive power derives from

a mandate from the masses, not from some farcical aquatic ceremony."""

In [20]:
# Load the small English pipeline by its full package name: the 'en' shortcut
# link is not available in spaCy 3+, while the full name works on older
# releases too (presumably 'en' pointed at this same package — TODO confirm).
nlp = spacy.load('en_core_web_sm')
doc = nlp(raw)
# Join the lemma of every token back into one string to eyeball the result.
print(' '.join([token.lemma_ for token in doc]))

denni : listen , strange woman lie in pond distribute sword 
 be no basis for a system of government .   Supreme executive power derive from 

 a mandate from the masse , not from some farcical aquatic ceremony .


### Part-of-Speech

In [21]:
[(token.lemma_, token.pos_) for token in doc[:7]]

[('denni', 'NOUN'),
 (':', 'PUNCT'),
 ('listen', 'VERB'),
 (',', 'PUNCT'),
 ('strange', 'ADJ'),
 ('woman', 'NOUN'),
 ('lie', 'VERB')]

In [22]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [23]:
sentences = nltk.sent_tokenize(data)   
for sent in sentences:
    print(nltk.pos_tag(sent.split()))

[('All', 'DT'), ('work', 'NN'), ('and', 'CC'), ('no', 'DT'), ('play', 'NN'), ('makes', 'VBZ'), ('jack', 'RP'), ('a', 'DT'), ('dull', 'JJ'), ('boy,', 'NN'), ('all', 'DT'), ('work', 'NN'), ('and', 'CC'), ('no', 'DT'), ('play', 'NN')]


In [24]:
nltk.download('universal_tagset')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


True

In [25]:
# Tag the last sentence explicitly instead of relying on `sent`, a loop
# variable that only exists if the previous cell's for-loop has been run.
word_tag = nltk.pos_tag(sentences[-1].split())
# Map Penn Treebank tags to the coarse universal tagset.
new_word_tag = [(word, nltk.map_tag('en-ptb', 'universal', tag)) for word, tag in word_tag]
print(new_word_tag)

[('All', 'DET'), ('work', 'NOUN'), ('and', 'CONJ'), ('no', 'DET'), ('play', 'NOUN'), ('makes', 'VERB'), ('jack', 'PRT'), ('a', 'DET'), ('dull', 'ADJ'), ('boy,', 'NOUN'), ('all', 'DET'), ('work', 'NOUN'), ('and', 'CONJ'), ('no', 'DET'), ('play', 'NOUN')]


### Named entity recognition

In [26]:
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


### Поиск шаблонов

#### Регулярные выражения

Исчерпывающий пост https://habr.com/ru/post/349860/

In [27]:
word = 'supercalifragilisticexpialidocious'
re.findall('[aeiou]|super', word)

['super', 'a', 'i', 'a', 'i', 'i', 'i', 'e', 'i', 'a', 'i', 'o', 'i', 'o', 'u']

In [28]:
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
re.findall(r'\d{1,2}', 'There is some numbers: 49 and 432')

['49', '43', '2']

In [29]:
# Inside a character class '.' is already literal, so no backslash is needed
# (and '\.' in a non-raw string is an invalid escape sequence).
re.sub(r'[,.?!]', '', 'How, to? split. text!')

'How to split text'

In [30]:
# 'A-z' also covers the ASCII symbols between 'Z' and 'a' ([ \ ] ^ _ `),
# so spell out both letter ranges to keep letters only.
re.sub(r'[^A-Za-z]', ' ', 'I 123 can 45 play 67 football').split()

['I', 'can', 'play', 'football']

### Задача классификации

#### 20 newsgroups
Датасет с 18000 новостей, сгруппированных по 20 темам.

In [31]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [32]:
list(newsgroups_train.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [33]:
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((11314,), (11314,))

#### Рассмотрим подвыборку

In [34]:
cats = ['alt.atheism', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

In [35]:
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((1073,), (1073,))

In [36]:
print(newsgroups_train.data[0])

From: bil@okcforum.osrhe.edu (Bill Conner)
Subject: Re: Not the Omni!
Nntp-Posting-Host: okcforum.osrhe.edu
Organization: Okcforum Unix Users Group
X-Newsreader: TIN [version 1.1 PL6]
Lines: 18

Charley Wingate (mangoe@cs.umd.edu) wrote:
: 
: >> Please enlighten me.  How is omnipotence contradictory?
: 
: >By definition, all that can occur in the universe is governed by the rules
: >of nature. Thus god cannot break them. Anything that god does must be allowed
: >in the rules somewhere. Therefore, omnipotence CANNOT exist! It contradicts
: >the rules of nature.
: 
: Obviously, an omnipotent god can change the rules.

When you say, "By definition", what exactly is being defined;
certainly not omnipotence. You seem to be saying that the "rules of
nature" are pre-existant somehow, that they not only define nature but
actually cause it. If that's what you mean I'd like to hear your
further thoughts on the question.

Bill



In [37]:
newsgroups_train.target[:10]

array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0])

#### TF-IDF (напоминание)

$n_{\mathbb{d}\mathbb{w}}$ - число вхождений слова $\mathbb{w}$ в документ $\mathbb{d}$;<br>
$N_{\mathbb{w}}$ - число документов, содержащих $\mathbb{w}$;<br>
$N$ - число документов; <br><br>

$p(\mathbb{w}, \mathbb{d}) = N_{\mathbb{w}} / N$ - вероятность наличия слова $\mathbb{w}$ в любом документе $\mathbb{d}$
<br>
$P(\mathbb{w}, \mathbb{d}, n_{\mathbb{d}\mathbb{w}}) = (N_{\mathbb{w}} / N)^{n_{\mathbb{d}\mathbb{w}}}$ - вероятность встретить $n_{\mathbb{d}\mathbb{w}}$ раз слово $\mathbb{w}$ в документе $\mathbb{d}$<br><br>

$-\log{P(\mathbb{w}, \mathbb{d}, n_{\mathbb{d}\mathbb{w}})} = n_{\mathbb{d}\mathbb{w}} \cdot \log{(N / N_{\mathbb{w}})} = TF(\mathbb{w}, \mathbb{d}) \cdot IDF(\mathbb{w})$<br><br>

$TF(\mathbb{w}, \mathbb{d}) = n_{\mathbb{d}\mathbb{w}}$ - term frequency;<br>
$IDF(\mathbb{w}) = \log{(N /N_{\mathbb{w}})}$ - inverse document frequency;

#### Давайте векторизуем эти тексты с помощью TF-IDF

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
categories = ['alt.atheism', 'talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories)

#### Некоторые параметры TfidfVectorizer: 
##### input : string {‘filename’, ‘file’, ‘content’}
##### lowercase : boolean, default True
##### preprocessor : callable or None (default)
##### tokenizer : callable or None (default)
##### stop_words : string {‘english’}, list, or None (default)
##### ngram_range : tuple (min_n, max_n)
##### max_df : float in range [0.0, 1.0] or int, default=1.0
##### min_df : float in range [0.0, 1.0] or int, default=1
##### max_features : int or None, default=None

#### Перебор параметров

In [39]:
# lowercase
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 34118)

In [40]:
vectorizer = TfidfVectorizer(lowercase=False)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 42307)

In [41]:
vectorizer.get_feature_names()[:10]

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '000005102000',
 '000021',
 '000062David42',
 '0000VEC',
 '0001']

In [42]:
# min_df, max_df
vectorizer = TfidfVectorizer(min_df=0.8)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 9)

In [43]:
vectorizer.get_feature_names()

['and', 'from', 'in', 'lines', 'of', 'organization', 'subject', 'the', 'to']

In [44]:
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.8)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 2391)

In [45]:
# ngram_range
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=0.03, max_df=0.9)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 1236)

In [46]:
# Stop words and a lemmatizer for the custom preprocessor (preproc1) defined below.
from nltk.corpus import stopwords
# English stop words from the NLTK corpus, kept as a set for fast membership tests.
stopWords = set(stopwords.words('english'))
# Download the WordNet corpus (presumably the data WordNetLemmatizer relies on).
nltk.download('wordnet')
wnl = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [47]:
def preproc1(text):
    """Normalize a text with NLTK.

    Lowercases `text`, splits it with `word_tokenize`, skips every token found
    in `stopWords`, lemmatizes the remaining tokens with `wnl`, and returns
    them joined by single spaces.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        if token in stopWords:
            continue
        lemmas.append(wnl.lemmatize(token))
    return ' '.join(lemmas)

In [48]:
st = "Oh, I think I ve landed Where there are miracles at work,  For the thirst and for the hunger Come the conference of birds"
preproc1(st)

'oh , think landed miracle work , thirst hunger come conference bird'

In [49]:
%%time
vectorizer = TfidfVectorizer(preprocessor=preproc1)
vectors = vectorizer.fit_transform(newsgroups_train.data)

CPU times: user 8.47 s, sys: 5.07 ms, total: 8.47 s
Wall time: 8.57 s


In [50]:
vectors.shape

(2034, 31719)

In [51]:
def preproc2(text):
    """Normalize a text with spaCy.

    Lowercases `text`, runs it through the `nlp` pipeline, and returns the
    token lemmas joined by single spaces, skipping lemmas found in `stopWords`.
    """
    kept = []
    for token in nlp(text.lower()):
        lemma = token.lemma_
        if lemma not in stopWords:
            kept.append(lemma)
    return ' '.join(kept)

In [52]:
preproc2(st)

'oh , think land miracle work ,   thirst hunger come conference bird'

In [53]:
%%time
vectorizer = TfidfVectorizer(preprocessor=preproc2)
vectors = vectorizer.fit_transform(newsgroups_train.data)

CPU times: user 2min 19s, sys: 4.89 s, total: 2min 24s
Wall time: 2min 26s


In [54]:
vectors.shape

(2034, 29192)

#### Итоговая модель

In [55]:
vectorizer = TfidfVectorizer(preprocessor=preproc1, ngram_range=(1, 3), max_df=0.5, max_features=1000)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 1000)

In [56]:
vectorizer.get_feature_names()[::100]

['00',
 'archive',
 'color',
 'edu keith allan',
 'hand',
 'later',
 'near',
 'principle',
 'shall',
 'tony']

#### Можем посмотреть на косинусную меру между векторами

In [57]:
vector = vectors.todense()[1]

In [58]:
(vector != 0).sum()

37

In [59]:
import numpy as np
from numpy.linalg import norm

In [60]:
type(vectors)

scipy.sparse.csr.csr_matrix

In [61]:
newsgroups_train.target[:10]

array([1, 3, 2, 0, 2, 0, 2, 1, 2, 1])

In [62]:
np.unique(newsgroups_train.target)

array([0, 1, 2, 3])

In [63]:
dense_vectors = vectors.todense()

In [64]:
dense_vectors.shape

(2034, 1000)

In [65]:
def cosine_sim(v1, v2):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors with the same number of elements. Both `(1, dim)` rows (as
        produced by indexing a dense matrix) and flat `(dim,)` sequences are
        accepted; inputs are flattened before the computation.

    Returns
    -------
    numpy.float64
        `v1 . v2 / (|v1| * |v2|)`, or 0.0 when either vector has zero norm
        (the ratio is undefined there and would otherwise evaluate to NaN).
    """
    a = np.asarray(v1, dtype=float).ravel()
    b = np.asarray(v2, dtype=float).ravel()
    norm_a = norm(a)
    norm_b = norm(b)
    # Guard against division by zero for all-zero vectors.
    if norm_a == 0 or norm_b == 0:
        return np.float64(0.0)
    # Divide by each norm in turn to keep the original order of operations.
    return a @ b / norm_a / norm_b

In [66]:
cosine_sim(dense_vectors[0], dense_vectors[0])

1.0000000000000002

In [67]:
cosines = []
for i in range(10):
    cosines.append(cosine_sim(dense_vectors[0], dense_vectors[i]))

In [68]:
# [1, 3, 2, 0, 2, 0, 2, 1, 2, 1]
cosines

[1.0000000000000002,
 0.012831116618462843,
 0.0,
 0.02901411919074219,
 0.05098356842157866,
 0.013365054451950215,
 0.029853253009777425,
 0.2247345611573304,
 0.024458521266886704,
 0.020883275103595455]

#### Обучим любую известную модель на полученных признаках

In [69]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import SGDClassifier

In [70]:
# `todense()` yields an np.matrix; np.matrix inputs are deprecated in
# scikit-learn and rejected by newer releases, so hand over a plain ndarray.
X_train, X_test, y_train, y_test = train_test_split(
    np.asarray(dense_vectors), newsgroups_train.target, test_size=0.2, random_state=0
)

In [71]:
y_train.shape, y_test.shape

((1627,), (407,))

In [72]:
%%time
svc = svm.SVC()
svc.fit(X_train, y_train)

CPU times: user 5.44 s, sys: 5.09 ms, total: 5.45 s
Wall time: 5.49 s


In [73]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, svc.predict(X_test))

0.9361179361179361

In [74]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
accuracy_score(y_test, sgd.predict(X_test))

0.9041769041769042

### Byte Pair Encoding

<img src="https://alexanderdyakonov.files.wordpress.com/2019/11/bpe.jpg">

#### Реализация

In [75]:
!pip install youtokentome

Collecting youtokentome
[?25l  Downloading https://files.pythonhosted.org/packages/a3/65/4a86cf99da3f680497ae132329025b291e2fda22327e8da6a9476e51acb1/youtokentome-1.0.6-cp36-cp36m-manylinux2010_x86_64.whl (1.7MB)
[K     |████████████████████████████████| 1.7MB 4.4MB/s 
Installing collected packages: youtokentome
Successfully installed youtokentome-1.0.6


In [76]:
import youtokentome as yttm

def train_bpe(records, preproc, model_path, model_type="bpe", vocab_size=10000, lower=True):
    """Train a YouTokenToMe BPE model on preprocessed texts.

    Each record is passed through `preproc` and written as one line of a
    temporary corpus file, which is then handed to `yttm.BPE.train`.

    Parameters
    ----------
    records : iterable of str
        Raw texts to train on.
    preproc : callable
        Maps one raw text to the string written to the corpus file.
    model_path : str
        Where the trained model is saved.
    model_type : str
        Accepted for backward compatibility; currently unused.
    vocab_size : int
        Vocabulary size passed to the trainer.
    lower : bool
        Accepted for backward compatibility; currently unused.
    """
    import os
    import tempfile

    # Use a private temp file rather than a fixed "temp.txt" in the working
    # directory, so concurrent runs don't clobber each other and nothing is
    # left behind afterwards.
    fd, temp_file_name = tempfile.mkstemp(suffix=".txt", text=True)
    try:
        # Explicit UTF-8 so the corpus doesn't depend on the platform's
        # default locale encoding.
        with open(fd, "w", encoding="utf-8") as temp:
            for text in records:
                temp.write(preproc(text) + "\n")

        yttm.BPE.train(data=temp_file_name, vocab_size=vocab_size, model=model_path)
    finally:
        os.remove(temp_file_name)

train_bpe(records=newsgroups_train.data, preproc=preproc1, model_path="BPE_model.bin")

In [77]:
bpe_processor = yttm.BPE('BPE_model.bin')
bpe_processor.vocab()[::1000]

['<PAD>',
 '▁dep',
 '.g',
 '▁thread',
 '82',
 'abit',
 'ori',
 '▁ppm',
 '.ins.cwru.edu',
 '▁well.sf.ca.us']

In [78]:
print(newsgroups_train.data[0])

From: rych@festival.ed.ac.uk (R Hawkes)
Subject: 3DS: Where did all the texture rules go?
Lines: 21

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych

Rycharde Hawkes				email: rych@festival.ed.ac.uk
Virtual Environment Laboratory
Dept. of Psychology			Tel  : +44 31 650 3426
Univ. of Edinburgh			Fax  : +44 31 667 0150



In [79]:
''.join(bpe_processor.encode(preproc1(newsgroups_train.data[0]), output_type=yttm.OutputType.SUBWORD))



In [80]:
def our_tokenize(text):
    """Tokenize a text into BPE subwords.

    The text is first normalized with `preproc1`, then encoded by
    `bpe_processor` with subword (string) output.
    """
    return bpe_processor.encode(preproc1(text), output_type=yttm.OutputType.SUBWORD)

In [81]:
vectorizer = TfidfVectorizer(tokenizer=our_tokenize, ngram_range=(1, 3), min_df=0.3, max_features=3000)
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [82]:
vectorizer.get_feature_names()[::5]

['ed',
 "▁'s",
 '▁) ▁writes',
 '▁.',
 '▁: ▁>',
 '▁@',
 '▁like',
 '▁nntp-posting-host ▁:',
 '▁subject ▁:',
 '▁writes ▁:']

In [83]:
vectorizer = TfidfVectorizer(tokenizer=our_tokenize, ngram_range=(1, 3), max_df=0.3, max_features=3000)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 3000)

In [84]:
vectorizer.get_feature_names()[::100]

["'",
 '2 ▁@',
 'ak',
 'er ▁,',
 'ized',
 'pr',
 'ved',
 '▁) ▁,',
 '▁, ▁still',
 '▁. ▁| ▁>',
 '▁: ▁20',
 '▁> ▁}',
 '▁aerospace',
 '▁attempt',
 '▁capability',
 '▁contradiction',
 '▁disk',
 '▁fairly',
 '▁god ▁.',
 '▁in-reply-to ▁:',
 '▁kmr4',
 '▁lunar',
 '▁must',
 '▁output',
 '▁present',
 '▁report',
 '▁sens',
 '▁state',
 '▁titan',
 '▁w']

In [85]:
vector = vectors.todense()[1]

In [86]:
(vector != 0).sum()

113

In [87]:
# `todense()` yields an np.matrix; np.matrix inputs are deprecated in
# scikit-learn and rejected by newer releases, so convert to a plain ndarray.
dense_vectors = np.asarray(vectors.todense())
X_train, X_test, y_train, y_test = train_test_split(
    dense_vectors, newsgroups_train.target, test_size=0.2, random_state=0
)

In [88]:
y_train.shape, y_test.shape

((1627,), (407,))

In [89]:
%%time
svc = svm.SVC()
svc.fit(X_train, y_train)

CPU times: user 19.3 s, sys: 27.3 ms, total: 19.4 s
Wall time: 19.5 s


In [90]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, svc.predict(X_test))

0.8845208845208845

In [91]:
sgd = SGDClassifier()
sgd.fit(X_train, y_train)
accuracy_score(y_test, sgd.predict(X_test))

0.914004914004914