In [376]:
import wikipediaapi as wiki
from tqdm import tqdm
import pymorphy2
import numpy as np
import nltk
import string
from gensim.models import *
from gensim import corpora
from gensim import similarities
import re
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [377]:
wiki_wiki = wiki.Wikipedia('ru')
page_py_1 = wiki_wiki.page('Категория:Кухни народов мира')
page_py_2 = wiki_wiki.page('Категория:Кухни по странам')

In [378]:
def get_categorymembers(categorymembers, level=0, max_level=0):
        return categorymembers.values()

cuisines_1 = get_categorymembers(page_py_1.categorymembers)
cuisines_2 = get_categorymembers(page_py_2.categorymembers)

cuisines_list = []
for page in cuisines_1:
    if 'Категория:' in page.title:
        cuisines_list.append(page.title)

for page in cuisines_2:
    if 'Категория:' in page.title:
        cuisines_list.append(page.title)

In [379]:
cuisines_list = list(set(cuisines_list))

In [380]:
cuisines_list[:5]

['Категория:Гагаузская кухня',
 'Категория:Южноафриканская кухня',
 'Категория:Белорусская кухня',
 'Категория:Польская кухня',
 'Категория:Хорватская кухня']

In [381]:

f = open('cuisines.txt', 'w')

names_of_cuis = []

for cuisine in tqdm(cuisines_list):
    dishes = get_categorymembers(wiki_wiki.page(cuisine).categorymembers)
    for dish in dishes:
        names_of_cuis.append(cuisine)
        summary = re.sub('^.*? — ', '', dish.summary)
        punct = '[!"#$%&()*+,./:;<=>?@[\]^_`{|}~„“«»†*/\—–‘’]'
        nums = '[0-9]'
        summary = re.sub('\n', ' ', summary)
        summary = re.sub(punct, '', summary)
        summary = re.sub(nums, '', summary)
        summary = re.sub(r'(?![А-яё ]).', '', summary)
        try:
            f.write(summary + " ")
        except Exception:
            pass  
        f.write('\n\n\n')

  2%|█▋                                                                                | 3/149 [00:17<14:28,  5.95s/it]


KeyboardInterrupt: 

In [None]:
with open(r'labels.txt', 'w') as fp:
    for label in names_of_cuis:
        fp.write(label + "\n")

In [None]:
with open('cuisines.txt', 'r', encoding='cp1251') as f:
    text = f.read()
    
texts = text.split('\n\n\n')
texts[0][:100]

In [None]:
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')

In [None]:
word_tokenizer = nltk.WordPunctTokenizer()

In [None]:
dates = [str(x) for x in np.arange(1900, 2022)]
def process_data(data):
    texts = []
    targets = []
    
    # поочередно проходим по всем новостям в списке
    for item in data:   
        tokens = word_tokenizer.tokenize(item)
        tokens = [word for word in tokens if (word not in string.punctuation and word not in stop_words and word not in dates)]
        texts.append(tokens) # добавляем в предобработанный список
    
    return texts

In [None]:
texts = process_data(texts)

In [None]:
morph = pymorphy2.MorphAnalyzer()
tokenized_texts = []

In [None]:
for i in tqdm(range(len(texts))):
    text_lemmatized = [morph.parse(x)[0].normal_form for x in texts[i]]
    tokenized_texts.append(text_lemmatized)

In [None]:
tokenized_texts[0][:10]

In [None]:
with open('rus_stopwords.txt', 'r', encoding='utf-8') as f:
    sw = f.read().split('\n')

In [None]:
for ind, text in enumerate(tokenized_texts):
    text = [w for w in text if w not in sw]
    tokenized_texts[ind] = text

In [None]:
with open(r'text_lemmatized.txt', 'w') as fp:
    for item in tokenized_texts:
        fp.write(item + "\n")

In [None]:
print('Making dictionary...')
dictionary = corpora.Dictionary(tokenized_texts)
print('Original: {}'.format(dictionary))
dictionary.filter_extremes(no_below = 5, no_above = 0.9, keep_n=None)
dictionary.save('polkrug.dict')
print('Filtered: {}'.format(dictionary))

print('Vectorizing corpus...')
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
corpora.MmCorpus.serialize('polkrug.model', corpus) 

In [None]:
len(tokenized_texts), len(corpus)

In [None]:
texts = open('text_lemmatized.txt', encoding = 'cp1251').readlines()
labels = open('labels.txt', encoding = 'cp1251').readlines()

In [None]:
labels[0]

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le = preprocessing.LabelEncoder()
encoded_labels = le.fit_transform(labels)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(texts[:-1], encoded_labels, test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features = 200, norm = None) # возмем топ 200 слов
vectorizer.fit(train_texts)
vectorizer.get_feature_names()[:10]

In [None]:
X_train = vectorizer.fit_transform(X_train)
X_test  = vectorizer.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 500, max_depth = 10)
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, pred))

In [None]:
confusion_matrix(y_test, pred)

### Построим матрицу неточностей

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
fig, ax = plt.subplots(figsize=(30, 30))
plot_confusion_matrix(clf, X_test, y_test, ax=ax, include_values=False)
plt.show()

In [None]:
list(le.inverse_transform([147]))

Среди всех выделяется Американская кухня на кол-во ложных классификаций. Возможно потому, что Америка многонациональна.