In [1]:
import gzip
import re
import numpy as np

from gensim.models import Word2Vec

from dataclasses import dataclass
from typing import Iterator, List

from nltk.corpus import stopwords
from yargy.tokenizer import MorphTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


# Загрузка и чтение файлов

@dataclass
class Text:
    label: str
    title: str
    text: str

# Чтение файла
def read_doc(fn: str="C:/Users/MSI-PC/Desktop/news.txt.gz") -> Iterator[Text]:
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            yield Text(*line.strip().split("\t"))

# Токенизация текста
def tok_doc(doc):
    return re.split(r'\s+', doc, re.U)
            
# Токенизация текста без нормализации             
def tokenize_text(text: str) -> List[str]:
    text = text.lower()

    words = re.findall(r'\b\w+\b', text) 

    return words

# Текст без знаков припенания
def read_proc_text(text: str) -> str:
    return ' '.join(tokenize_text(text))

In [2]:
# Токенизация c нормализацей    
 
stop_words = stopwords.words('russian')
stop_words.extend(['это','свои'])
ru_stopwords = set(stop_words)

tokenizer = MorphTokenizer()  
    
def tokenize_norm_text(text: str) -> List[str]:
    text2=' '.join([w.normalized for w in tokenizer(text) if w.normalized not in ru_stopwords])     
    words = re.findall(r'\b\w+\b', text2) 
    return words

# Чтение нормализованного текста
def read_norm_text(text: str) -> str:
    return ' '.join(tokenize_norm_text(text))

In [3]:
# Обучение word2vec без нормализации 

sentences = [tokenize_text(text.text) for text in read_doc()]

# Обучаем модель
w2v = Word2Vec(sentences)

# Сохраняем модель
w2v.wv.save_word2vec_format('word2vec_bez_norm.bin')


In [4]:
# Обучение word2vec с нормализацией
sentences2 = [tokenize_norm_text(text.text) for text in read_doc()]

# Обучаем w2v
w2v2 = Word2Vec(sentences2)

# Cохраняем модель
w2v2.wv.save_word2vec_format('word2vec_s_norm.bin')


In [6]:
# Перевести каждый документ в вектор
def doc_to_vec(doc, model):

    vecs = [model.wv[word] for word in tok_doc(doc) if word in model.wv]

    # Усредняем
    if vecs:
        return np.mean(vecs, axis=0)
    else:
        print("Vector 0")
        return np.zeros(model.vector_size)


# Загружаем текст 
docs=[read_proc_text(text.text) for text in read_doc()]
labels = [text.label for text in read_doc()]


# Преобразование в единный массив для обучения
def get_array(X_avg):
    return np.array([np.array(avg_vec) for avg_vec in X_avg])

X = get_array([doc_to_vec(doc, w2v) for doc in docs])


X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)


svc = SVC()
svc.fit(X_train, y_train)

# Классификация
y_pred = svc.predict(X_test)


print("\nРезультаты:\n", classification_report(y_test, y_pred))
print("\nAccuracy:", accuracy_score(y_test, y_pred))


Результаты:
               precision    recall  f1-score   support

    business       0.75      0.04      0.07        79
     culture       0.60      0.57      0.59       279
   economics       0.55      0.82      0.66       266
      forces       0.54      0.49      0.52       149
        life       0.56      0.69      0.62       288
       media       0.57      0.59      0.58       299
     science       0.67      0.59      0.62       288
       sport       0.84      0.84      0.84       276
       style       0.50      0.16      0.24        38
      travel       0.00      0.00      0.00        38

    accuracy                           0.62      2000
   macro avg       0.56      0.48      0.47      2000
weighted avg       0.61      0.62      0.60      2000


Accuracy: 0.616


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
X.shape

In [8]:
texts = list(read_doc("C:/Users/MSI-PC/Desktop/news.txt.gz"))

# Преобразование в вектор
vectorizer = TfidfVectorizer(max_df=0.2, min_df=10)
X2 = vectorizer.fit_transform([read_norm_text(text.text) for text in texts]).toarray()
X2.shape

(10000, 10738)

In [9]:
# Уменьшение размерности
from sklearn.decomposition import PCA

pca = PCA(n_components=500)
X_pca = pca.fit_transform(X2)
X_pca.shape

(10000, 500)

In [10]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_pca, labels, test_size=0.2, random_state=42)

In [11]:
from sklearn.ensemble import RandomForestClassifier

# Обучение
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train2, y_train2)

y_pred2 = rf_classifier.predict(X_test2)

print("\nРезультат:\n", classification_report(y_test2, y_pred2))
print("\nAccuracy:\n", accuracy_score(y_test2, y_pred2))


Результат:
               precision    recall  f1-score   support

    business       0.86      0.24      0.38        79
     culture       0.88      0.89      0.88       279
   economics       0.74      0.95      0.83       266
      forces       0.76      0.85      0.80       149
        life       0.79      0.81      0.80       288
       media       0.84      0.81      0.82       299
     science       0.88      0.82      0.85       288
       sport       0.94      0.99      0.97       276
       style       1.00      0.63      0.77        38
      travel       0.84      0.42      0.56        38

    accuracy                           0.84      2000
   macro avg       0.85      0.74      0.77      2000
weighted avg       0.84      0.84      0.83      2000


Accuracy:
 0.8365


In [12]:

svc2 = SVC()
svc2.fit(X_train2, y_train2)


y_pred3 = svc2.predict(X_test2)


print("\nРезультаты:\n", classification_report(y_test2, y_pred3))
print("\nAccuracy:\n", accuracy_score(y_test2, y_pred3))


Результаты:
               precision    recall  f1-score   support

    business       0.86      0.38      0.53        79
     culture       0.90      0.92      0.91       279
   economics       0.81      0.95      0.87       266
      forces       0.79      0.85      0.82       149
        life       0.83      0.84      0.83       288
       media       0.89      0.80      0.84       299
     science       0.88      0.87      0.88       288
       sport       0.95      0.99      0.97       276
       style       0.94      0.84      0.89        38
      travel       0.74      0.74      0.74        38

    accuracy                           0.87      2000
   macro avg       0.86      0.82      0.83      2000
weighted avg       0.87      0.87      0.86      2000


Accuracy:
 0.8665


In [27]:
# Преобразование новых текстов
def transform_text(text: str) -> np.ndarray:
    normalized_text = read_norm_text(text)
    vect = vectorizer.transform([normalized_text]).toarray()
    return pca.transform(vect)

transform_text("собака")

array([[-2.14930029e-02, -3.33440327e-02, -2.54877234e-02,
        -5.96236025e-03,  1.90019790e-03, -4.49439045e-02,
         1.70601793e-02, -2.10121223e-03,  1.29641664e-03,
        -5.59656356e-03,  2.56318198e-02, -3.69692525e-03,
         8.44701757e-03, -5.16915854e-03, -2.86081828e-02,
         4.78661221e-03, -1.53442474e-02,  6.11820415e-03,
         2.71343761e-02,  3.15846464e-03,  2.90577111e-02,
        -5.00040757e-03,  1.47472006e-02, -3.34472032e-04,
        -4.95930401e-03,  1.88331740e-02,  1.89620442e-02,
         3.96031949e-03, -1.12607836e-02, -3.30873354e-03,
        -1.52860389e-02, -1.67796527e-02, -6.11132308e-03,
        -2.78674255e-03, -4.95226429e-05, -1.91568297e-02,
        -3.64226467e-02,  3.64859750e-02,  6.75511789e-03,
         4.53389436e-02, -1.32193676e-02,  2.45558788e-02,
         2.36681689e-02, -1.40611714e-02,  3.24405146e-02,
         5.39078768e-02,  3.67467681e-02,  2.60071547e-02,
         2.11100315e-02,  2.36323025e-03, -3.00558768e-0

In [26]:
svc2.predict(transform_text("собака"))

array(['life'], dtype='<U9')

In [23]:
svc2.predict(transform_text("рубль"))

array(['economics'], dtype='<U9')