In [33]:
import wikipediaapi as wiki
from tqdm import tqdm_notebook, tqdm
import pymorphy2
import numpy as np
import nltk
import string

In [34]:
wiki_wiki = wiki.Wikipedia('ru')
page_py_1 = wiki_wiki.page('Категория:Кухни народов мира')
page_py_2 = wiki_wiki.page('Категория:Кухни по странам')

In [35]:
def get_categorymembers(categorymembers, level=0, max_level=0):
        return categorymembers.values()

cuisines_1 = get_categorymembers(page_py_1.categorymembers)
cuisines_2 = get_categorymembers(page_py_2.categorymembers)

cuisines_list = []
for page in cuisines_1:
    if 'Категория:' in page.title:
        cuisines_list.append(page.title)

for page in cuisines_2:
    if 'Категория:' in page.title:
        cuisines_list.append(page.title)

In [36]:
cuisines_list = list(set(cuisines_list))

In [37]:
cuisines_list[:5]

['Категория:Татарская кухня',
 'Категория:Тунисская кухня',
 'Категория:Английская кухня',
 'Категория:Аргентинская кухня',
 'Категория:Хорватская кухня']

In [38]:
f.close()

In [39]:
import re

f = open('cuisines.txt', 'w')

names_of_cuis = []

for cuisine in tqdm(cuisines_list):
    dishes = get_categorymembers(wiki_wiki.page(cuisine).categorymembers)
    for dish in dishes:
        names_of_cuis.append(cuisine)
        summary = re.sub('^.*? — ', '', dish.summary)
        punct = '[!"#$%&()*+,./:;<=>?@[\]^_`{|}~„“«»†*/\—–‘’]'
        nums = '[0-9]'
        summary = re.sub('\n', ' ', summary)
        summary = re.sub(punct, '', summary)
        summary = re.sub(nums, '', summary)
        summary = re.sub(r'(?![А-яё ]).', '', summary)
        try:
            f.write(summary + " ")
        except Exception:
            pass  
        f.write('\n\n\n')

100%|██████████| 149/149 [17:38<00:00,  7.10s/it]


In [58]:
with open(r'labels.txt', 'w') as fp:
    for label in names_of_cuis:
        fp.write(label + "\n")

In [40]:
with open('cuisines.txt', 'r', encoding='cp1251') as f:
    text = f.read()
    
texts = text.split('\n\n\n')
texts[0][:100]

'Кулинарное искусство татарского народа богато своими национальными и культурными традициями уходящим'

In [41]:
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yaroslav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
word_tokenizer = nltk.WordPunctTokenizer()

In [43]:
dates = [str(x) for x in np.arange(1900, 2022)]
def process_data(data):
    texts = []
    targets = []
    
    # поочередно проходим по всем новостям в списке
    for item in data:   
        tokens = word_tokenizer.tokenize(item)
        tokens = [word for word in tokens if (word not in string.punctuation and word not in stop_words and word not in dates)]
        texts.append(tokens) # добавляем в предобработанный список
    
    return texts

In [44]:
texts = process_data(texts)

In [45]:
morph = pymorphy2.MorphAnalyzer()
tokenized_texts = []

In [46]:
for i in tqdm_notebook(range(len(texts))):
    text_lemmatized = [morph.parse(x)[0].normal_form for x in texts[i]]
    tokenized_texts.append(text_lemmatized)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(len(texts))):


  0%|          | 0/3633 [00:00<?, ?it/s]

In [47]:
tokenized_texts[0][:10]

['кулинарный',
 'искусство',
 'татарский',
 'народ',
 'богато',
 'свой',
 'национальный',
 'культурный',
 'традиция',
 'уходить']

In [48]:
with open('rus_stopwords.txt', 'r', encoding='utf-8') as f:
    sw = f.read().split('\n')

In [49]:
for ind, text in enumerate(tokenized_texts):
    text = [w for w in text if w not in sw]
    tokenized_texts[ind] = text

In [50]:
with open(r'text_lemmatized.txt', 'w') as fp:
    for item in tokenized_texts:
        fp.write("%s\n\n\n" % item)

In [51]:
!pip install gensim



In [52]:
from gensim.models import *
from gensim import corpora
from gensim import similarities

In [53]:
print('Making dictionary...')
dictionary = corpora.Dictionary(tokenized_texts)
print('Original: {}'.format(dictionary))
dictionary.filter_extremes(no_below = 5, no_above = 0.9, keep_n=None)
dictionary.save('polkrug.dict')
print('Filtered: {}'.format(dictionary))

print('Vectorizing corpus...')
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
corpora.MmCorpus.serialize('polkrug.model', corpus) 

Making dictionary...
Original: Dictionary<15918 unique tokens: ['богато', 'век', 'глубь', 'день', 'жизнь']...>
Filtered: Dictionary<4540 unique tokens: ['век', 'день', 'жизнь', 'искусство', 'история']...>
Vectorizing corpus...


In [54]:
len(tokenized_texts), len(corpus)

(3633, 3633)

In [56]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 2),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 2),
 (10, 2),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1)]