# Предобработка текста

In [1]:
!python --version

Python 3.10.12


In [2]:
import gensim
gensim.__version__

'4.3.2'

## Часть 1

### Токенизация

In [3]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
data = "All work and no play makes jack a dull boy, all work and no play"
tokens = word_tokenize(data.lower())
print(tokens)

['all', 'work', 'and', 'no', 'play', 'makes', 'jack', 'a', 'dull', 'boy', ',', 'all', 'work', 'and', 'no', 'play']


In [5]:
print(sent_tokenize("I was going home when she rung. It was a surprise."))

['I was going home when she rung.', 'It was a surprise.']


[<img src="https://raw.githubusercontent.com/natasha/natasha-logos/master/natasha.svg">](https://github.com/natasha/natasha)

[Razdel](https://natasha.github.io/razdel/)

In [6]:
!pip install -q razdel

In [7]:
from razdel import tokenize, sentenize
text = 'Кружка-термос на 0.5л (50/64 см³, 516;...)'
list(tokenize(text))

[Substring(0, 13, 'Кружка-термос'),
 Substring(14, 16, 'на'),
 Substring(17, 20, '0.5'),
 Substring(20, 21, 'л'),
 Substring(22, 23, '('),
 Substring(23, 28, '50/64'),
 Substring(29, 32, 'см³'),
 Substring(32, 33, ','),
 Substring(34, 37, '516'),
 Substring(37, 38, ';'),
 Substring(38, 41, '...'),
 Substring(41, 42, ')')]

#### Регулярные выражения

Исчерпывающий пост https://habr.com/ru/post/349860/

In [8]:
import re
word = 'supercalifragilisticexpialidocious'
re.findall('[abc]|up|super', word)

## findall - поиск
### При нахождении любого из символов "abc", или слово up, или слово super. Порядок важен!!!

['super', 'c', 'a', 'a', 'c', 'a', 'c']

In [9]:
re.findall('\d{1,3}', 'These are some numbers: 49 and 432312')

### \ означает, что мы ищем только d{от 1го до 3х значных} чисел

['49', '432', '312']

In [10]:
re.sub('[,\.?!]','','How, to? split. text!')

## sub - удаление

'How to split text'

In [11]:
re.sub('[^A-z]',' ','I 123 can 45 play 67 football').split()

### ^ - все кроме

['I', 'can', 'play', 'football']

### Удаление неинформативных слов

#### N-граммы

<img src="https://res.cloudinary.com/practicaldev/image/fetch/s--466CQV1q--/c_limit%2Cf_auto%2Cfl_progressive%2Cq_66%2Cw_880/https://thepracticaldev.s3.amazonaws.com/i/78nf1vryed8h1tz05fim.gif" height=400>

In [12]:
unigram = list(nltk.ngrams(tokens, 1))
bigram = list(nltk.ngrams(tokens, 2))
print(unigram[:5])
print(bigram[:5])

[('all',), ('work',), ('and',), ('no',), ('play',)]
[('all', 'work'), ('work', 'and'), ('and', 'no'), ('no', 'play'), ('play', 'makes')]


In [13]:
from nltk import FreqDist

### FreqDist -  можем посмотреть как часто встречаются в тексте

print('Популярные униграммы: ', FreqDist(unigram).most_common(5))
print('Популярные биграммы: ', FreqDist(bigram).most_common(5))

Популярные униграммы:  [(('all',), 2), (('work',), 2), (('and',), 2), (('no',), 2), (('play',), 2)]
Популярные биграммы:  [(('all', 'work'), 2), (('work', 'and'), 2), (('and', 'no'), 2), (('no', 'play'), 2), (('play', 'makes'), 1)]


#### Стоп-слова

In [14]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
stopWords = set(stopwords.words('english'))
print(stopWords)

{'if', 'and', 'few', 'can', 'hasn', "doesn't", "she's", 'only', "should've", 'doesn', 'd', 'not', "couldn't", 'be', 'how', 'ain', 'are', "hasn't", 'for', 'then', 't', 'nor', 'shan', "aren't", 'my', 'aren', 'to', 'they', 'having', 'more', 'his', 's', 've', 'them', 'y', 'from', 'did', 'under', 'all', 'of', 'needn', 'was', 'whom', 'as', 'in', 'don', 'there', 'over', "didn't", "needn't", 'hers', 'after', 'haven', "you've", 'off', 'is', 'ma', 'our', 'the', 'just', 'm', 'wasn', 'when', 'into', 're', 'doing', 'than', 'being', 'once', 'their', 'has', "wouldn't", 'each', 'by', 'because', 'shouldn', 'no', "mightn't", 'her', 'why', 'most', 'between', 'here', 'me', 'again', 'it', 'your', 'theirs', 'but', 'other', "hadn't", 'am', 'isn', 'down', 'himself', 'such', 'while', "don't", 'where', 'wouldn', 'through', 'ours', 'weren', 'about', 'with', 'its', 'do', 'been', 'i', 'a', 'yourself', 'herself', 'on', 'too', 'we', 'own', "you'd", 'further', "shan't", 'before', "it's", 'or', 'very', 'myself', 'whic

In [16]:
print([word for word in tokens if word not in stopWords])

### Убрались or и no - однако надо быть аккуратнее! Исходя из задачи нужно понимать какие слова мы можем выкинуть

['work', 'play', 'makes', 'jack', 'dull', 'boy', ',', 'work', 'play']


In [17]:
import string

# Можем также удалить и пунктуацию

print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


#### Стемминг vs Лемматизация
* ‘Caring’ -> Лемматизация -> ‘Care’ - приведение токена к начальной форме
* ‘Caring’ -> Стемминг -> ‘Car’ - сокращение токена до какого то количества символов

### Стемминг
* процесс нахождения основы слова для заданного исходного слова

In [18]:
from nltk.stem import PorterStemmer, SnowballStemmer
words = ["game", "gaming", "gamed", "games", "compacted"]
words_ru = ['корова', 'мальчики', 'мужчины', 'столом', 'убежала']

In [19]:
ps = PorterStemmer()
list(map(ps.stem, words))

['game', 'game', 'game', 'game', 'compact']

In [20]:
ss = SnowballStemmer(language='russian')
list(map(ss.stem, words_ru))

['коров', 'мальчик', 'мужчин', 'стол', 'убежа']

### Лематизация
* процесс приведения словоформы к лемме — её нормальной (словарной) форме

In [21]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
is no basis for a system of government.  Supreme executive power derives from
a mandate from the masses, not from some farcical aquatic ceremony."""

raw_ru = """Не существует научных доказательств в пользу эффективности НЛП, оно
признано псевдонаукой. Систематические обзоры указывают, что НЛП основано на
устаревших представлениях об устройстве мозга, несовместимо с современной
неврологией и содержит ряд фактических ошибок."""

In [22]:
!pip install -q pymorphy2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for docopt (setup.py) ... [?25l[?25hdone


In [23]:
# 1
import pymorphy2
morph = pymorphy2.MorphAnalyzer()
pymorphy_results = list(map(lambda x: morph.parse(x), raw_ru.split(' ')))
print(' '.join([res[0].normal_form for res in pymorphy_results]))

 ### Сначала бьем предложение на токены, потом приводим к начальной форме (лемматизация)

не существовать научный доказательство в польза эффективность нлп, оно
признать псевдонаукой. систематический обзор указывают, что нлп основать на
устаревший представление о устройство мозга, несовместимый с современной
неврология и содержать ряд фактический ошибок.


In [24]:
pymorphy_results[2]

[Parse(word='научных', tag=OpencorporaTag('ADJF,Qual plur,gent'), normal_form='научный', score=0.774193, methods_stack=((DictionaryAnalyzer(), 'научных', 12, 21),)),
 Parse(word='научных', tag=OpencorporaTag('ADJF,Qual plur,loct'), normal_form='научный', score=0.209677, methods_stack=((DictionaryAnalyzer(), 'научных', 12, 26),)),
 Parse(word='научных', tag=OpencorporaTag('ADJF,Qual anim,plur,accs'), normal_form='научный', score=0.016129, methods_stack=((DictionaryAnalyzer(), 'научных', 12, 23),))]

In [25]:
# 2 - Нейросетевой подход, однако может работать только с английским языком
import spacy
nlp = spacy.load('en_core_web_sm')
spacy_results = nlp(raw)
print(' '.join([token.lemma_ for token in spacy_results]))

DENNIS : listen , strange woman lie in pond distribute sword 
 be no basis for a system of government .   Supreme executive power derive from 
 a mandate from the masse , not from some farcical aquatic ceremony .


[Сравнение PyMorphy2 и PyMystem3](https://habr.com/ru/post/503420/)

### Part-of-Speech - опреледение части речи

In [26]:
# 1

[(res[0].normal_form, res[0].tag) for res in pymorphy_results[:9]]

### Есть части речи и другие параметры

[('не', OpencorporaTag('PRCL')),
 ('существовать', OpencorporaTag('VERB,impf,intr sing,3per,pres,indc')),
 ('научный', OpencorporaTag('ADJF,Qual plur,gent')),
 ('доказательство', OpencorporaTag('NOUN,inan,neut plur,gent')),
 ('в', OpencorporaTag('PREP')),
 ('польза', OpencorporaTag('NOUN,inan,femn sing,accs')),
 ('эффективность', OpencorporaTag('NOUN,inan,femn sing,gent')),
 ('нлп,', OpencorporaTag('UNKN')),
 ('оно\nпризнать', OpencorporaTag('PRTS,perf,past,pssv neut,sing'))]

In [27]:
# 2

[(token.lemma_, token.pos_) for token in spacy_results[:7]]

### Есть части речи

[('DENNIS', 'PROPN'),
 (':', 'PUNCT'),
 ('listen', 'VERB'),
 (',', 'PUNCT'),
 ('strange', 'ADJ'),
 ('woman', 'NOUN'),
 ('lie', 'VERB')]

In [28]:
!pip install -q rnnmorph

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.7/19.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rnnmorph (setup.py) ... [?25l[?25hdone
  Building wheel for russian-tagsets (setup.py) ... [?25l[?25hdone


In [29]:
#!pip install numpy==1.20.0

In [30]:
import numpy as np
print(np.__version__)

1.25.2


In [31]:
# 3 - Замечательно работает с русским языком (нейросетевой подход)
### Очень классный современный подход на RNN

from rnnmorph.predictor import RNNMorphPredictor
predictor = RNNMorphPredictor(language="ru")
#rnnmorph_result = predictor.predict(raw_ru.split(' '))
#[(token.normal_form, token.pos, token.tag) for token in rnnmorph_result[:7]] - ломается, так как нужно numpy==1.20.0



In [32]:
### Нужно откатиться до 1.20.0

### Named entities recognition

In [33]:
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

### Нужно понимать, что для NER приведение к нижниму регистру может сыграть плохую шутку

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


## Часть 2

### Задача классификации

#### 20 newsgroups
Датасет с 18000 новостей, сгруппированных по 20 темам.

In [34]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

In [35]:
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [36]:
newsgroups_train.filenames.shape

(11314,)

#### Рассмотрим подвыборку

In [37]:
categories = ['alt.atheism', 'talk.religion.misc',
              'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',
                                      categories=categories)
newsgroups_train.filenames.shape

(2034,)

In [38]:
print(newsgroups_train.data[0])

From: rych@festival.ed.ac.uk (R Hawkes)
Subject: 3DS: Where did all the texture rules go?
Lines: 21

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych

Rycharde Hawkes				email: rych@festival.ed.ac.uk
Virtual Environment Laboratory
Dept. of Psychology			Tel  : +44 31 650 3426
Univ. of Edinburgh			Fax  : +44 31 667 0150



In [39]:
newsgroups_train.target[:10]

array([1, 3, 2, 0, 2, 0, 2, 1, 2, 1])

#### TF-IDF(напоминание)

$n_{\mathbb{d}\mathbb{w}}$ - число вхождений слова $\mathbb{w}$ в документ $\mathbb{d}$;<br>
$N_{\mathbb{w}}$ - число документов, содержащих $\mathbb{w}$;<br>
$N$ - число документов; <br><br>

$p(\mathbb{w}, \mathbb{d}) = N_{\mathbb{w}} / N$ - вероятность наличия слова $\mathbb{w}$ в любом документе $\mathbb{d}$
<br>
$P(\mathbb{w}, \mathbb{d}, n_{\mathbb{d}\mathbb{w}}) = (N_{\mathbb{w}} / N)^{n_{\mathbb{d}\mathbb{w}}}$ - вероятность встретить $n_{\mathbb{d}\mathbb{w}}$ раз слово $\mathbb{w}$ в документе $\mathbb{d}$<br><br>

$-\log{P(\mathbb{w}, \mathbb{d}, n_{\mathbb{d}\mathbb{w}})} = n_{\mathbb{d}\mathbb{w}} \cdot \log{(N / N_{\mathbb{w}})} = TF(\mathbb{w}, \mathbb{d}) \cdot IDF(\mathbb{w})$<br><br>

$TF(\mathbb{w}, \mathbb{d}) = n_{\mathbb{d}\mathbb{w}}$ - term frequency;<br>
$IDF(\mathbb{w}) = \log{(N /N_{\mathbb{w}})}$ - inverted document frequency;

#### Давайте векторизуем эти тексты с помощью TF-IDF

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

#### Некоторые параметры:
* input : string {‘filename’, ‘file’, ‘content’}
*  lowercase : boolean, default True
*  preprocessor : callable or None (default)
*  tokenizer : callable or None (default)
*  stop_words : string {‘english’}, list, or None (default)
*  ngram_range : tuple (min_n, max_n)
*  max_df : float in range [0.0, 1.0] or int, default=1.0
*  min_df : float in range [0.0, 1.0] or int, default=1
*  max_features : int or None, default=None

#### Перебор параметров

In [41]:
# lowercase - нижний регистр

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 34118)

In [42]:
vectorizer = TfidfVectorizer(lowercase=False)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 42307)

In [43]:
vectorizer.get_feature_names_out()[:10]

array(['00', '000', '0000', '00000', '000000', '000005102000', '000021',
       '000062David42', '0000VEC', '0001'], dtype=object)

In [44]:
# min_df, max_df - возьмем только те, которые встречаются мин/макс % случаев
vectorizer = TfidfVectorizer(min_df=0.8)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 9)

In [45]:
vectorizer.get_feature_names_out()

array(['and', 'from', 'in', 'lines', 'of', 'organization', 'subject',
       'the', 'to'], dtype=object)

In [46]:
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.8)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 2391)

In [47]:
# ngram_range -  будем вылавливать еще и взаимодействия n - граммы токенов от униграммы до триграмм
vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=0.03, max_df=0.9)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 1236)

In [48]:
# стоп-слова, preproc - можем использовать лематезатор и удаление стоп слов

from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
nltk.download('wordnet')
wnl = nltk.WordNetLemmatizer()

def preproc_nltk(text):
    #text = re.sub(f'[{string.punctuation}]', ' ', text)
    return ' '.join([wnl.lemmatize(word) for word in word_tokenize(text.lower()) if word not in stopWords])

st = "Oh, I think I ve landed Where there are miracles at work,  For the thirst and for the hunger Come the conference of birds"
preproc_nltk(st)

[nltk_data] Downloading package wordnet to /root/nltk_data...


'oh , think landed miracle work , thirst hunger come conference bird'

In [49]:
%%time

vectorizer = TfidfVectorizer(preprocessor=preproc_nltk)
vectors = vectorizer.fit_transform(newsgroups_train.data)

CPU times: user 11.6 s, sys: 68.3 ms, total: 11.6 s
Wall time: 19.4 s


In [50]:
# preproc_spacy

nlp = spacy.load("en_core_web_sm")
texts = newsgroups_train.data.copy()

def preproc_spacy(text):
    spacy_results = nlp(text)
    return ' '.join([token.lemma_ for token in spacy_results if token.lemma_ not in stopWords])
preproc_spacy(st)



'oh , I think I land miracle work ,   thirst hunger come conference bird'

In [51]:
%%time

new_texts = []
for doc in nlp.pipe(texts, batch_size=32, n_process=3, disable=["parser", "ner"]):
    new_texts.append(' '.join([tok.lemma_ for tok in doc if tok.lemma not in stopWords]))
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(new_texts)

CPU times: user 13.7 s, sys: 453 ms, total: 14.1 s
Wall time: 1min 16s


In [52]:
print(newsgroups_train.data[0])

From: rych@festival.ed.ac.uk (R Hawkes)
Subject: 3DS: Where did all the texture rules go?
Lines: 21

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych

Rycharde Hawkes				email: rych@festival.ed.ac.uk
Virtual Environment Laboratory
Dept. of Psychology			Tel  : +44 31 650 3426
Univ. of Edinburgh			Fax  : +44 31 667 0150



In [53]:
print(new_texts[0])

from : rych@festival.ed.ac.uk ( R Hawkes ) 
 subject : 3ds : where do all the texture rule go ? 
 line : 21 

 Hi , 

 I have notice that if you only save a model ( with all your mapping plane 
 position carefully ) to a .3ds file that when you reload it after restart 
 3ds , they be give a default position and orientation .   but if you save 
 to a .prj file their position / orientation be preserve .   do anyone 
 know why this information be not store in the .3ds file ?   nothing be 
 explicitly say in the manual about save texture rule in the .prj file . 
 I would like to be able to read the texture rule information , do anyone have 
 the format for the .PRJ file ? 

 be the .cel file format available from somewhere ? 

 rych 

 = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = 
 Rycharde Hawkes 				 email : rych@festival.ed.ac.uk 
 Virtual Environment Laboratory 
 Dept . of psychology 			 Tel 

#### Итоговая модель

In [54]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.5, max_features=1000)
vectors = vectorizer.fit_transform(new_texts)
vectorizer.get_feature_names_out()[::100]

array(['000', 'au', 'christ', 'engineering', 'human', 'look', 'of this',
       'report', 'technology', 'understand'], dtype=object)

#### Можем посмотреть на косинусную меру между векторами

In [55]:
vectors.shape

(2034, 1000)

In [56]:
vector = vectors.todense()[0]
(vector != 0).sum()

52

In [57]:
np.mean(list(map(lambda x: (x != 0).sum(), vectors.todense())))

89.79695181907572

In [58]:
import numpy as np
from numpy.linalg import norm

type(vectors)

In [59]:
dense_vectors = vectors.todense()
dense_vectors.shape

(2034, 1000)

In [60]:
def cosine_sim(v1, v2):
    # v1, v2 (1 x dim)
    return np.array(v1 @ v2.T / norm(v1) / norm(v2))[0][0]

In [61]:
cosine_sim(dense_vectors[0], dense_vectors[0])

1.0000000000000002

In [62]:
cosines = []
for i in range(10):
    cosines.append(cosine_sim(dense_vectors[0], dense_vectors[i]))

In [63]:
# [1, 3, 2, 0, 2, 0, 2, 1, 2, 1]
cosines

[1.0000000000000002,
 0.04191279776414236,
 0.005868383611019931,
 0.09771238093526102,
 0.07060916453270281,
 0.06745764842966309,
 0.0267141823627476,
 0.22853760897260958,
 0.031636420124663965,
 0.06928662593161493]

#### Обучим любую известную модель на полученных признаках

In [64]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.linear_model import SGDClassifier

X_train, X_test, y_train, y_test= train_test_split(dense_vectors, newsgroups_train.target, test_size=0.2, random_state=0)
y_train.shape, y_test.shape

((1627,), (407,))

In [65]:
%%time
svc = svm.SVC()
svc.fit(np.array(X_train), y_train)

CPU times: user 1.57 s, sys: 9.03 ms, total: 1.58 s
Wall time: 1.57 s


In [66]:
accuracy_score(y_test, svc.predict(np.array(X_test)))

0.9238329238329238

In [67]:
sgd = SGDClassifier()
sgd.fit(np.array(X_train), y_train)
accuracy_score(y_test, sgd.predict(np.array(X_test)))

0.9115479115479116

### Embeddings

In [68]:
import gensim.downloader as api
embeddings_pretrained = api.load('glove-twitter-25')

In [69]:
from gensim.models import Word2Vec

proc_words = [preproc_nltk(text).split() for text in newsgroups_train.data]

embeddings_trained = Word2Vec(proc_words, # data for model to train on
                 vector_size=100,                 # embedding vector size
                 min_count=3,             # consider words that occured at least 5 times
                 window=3).wv

In [70]:
def vectorize_sum(comment, embeddings):
    """
    implement a function that converts preprocessed comment to a sum of token vectors
    """
    embedding_dim = embeddings.vectors.shape[1]
    features = np.zeros([embedding_dim], dtype='float32')

    for word in preproc_nltk(comment).split():
        if word in embeddings:
            features += embeddings[f'{word}']

    return features

In [72]:
len(embeddings_trained.index_to_key)

13566

In [73]:
X_wv = np.stack([vectorize_sum(text, embeddings_pretrained) for text in newsgroups_train.data])
X_train_wv, X_test_wv, y_train, y_test = train_test_split(X_wv, newsgroups_train.target, test_size=0.2, random_state=0)
X_train_wv.shape, X_test_wv.shape

((1627, 25), (407, 25))

In [74]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression(max_iter=5000)
wv_model = clf.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

0.7051597051597052

In [75]:
X_wv = np.stack([vectorize_sum(text, embeddings_trained) for text in newsgroups_train.data])
X_train_wv, X_test_wv, y_train, y_test = train_test_split(X_wv, newsgroups_train.target, test_size=0.2, random_state=0)
X_train_wv.shape, X_test_wv.shape

((1627, 100), (407, 100))

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression(max_iter=10000)
wv_model = clf.fit(X_train_wv, y_train)
accuracy_score(y_test, wv_model.predict(X_test_wv))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8353808353808354