<a href="https://colab.research.google.com/github/daradanci/MMO_2025/blob/main/notes/LR6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the data (training split only), with headers, footers and quotes removed
data = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
# texts: list of raw documents; labels: category targets aligned with texts
texts, labels = data.data, data.target

# Show the beginning of the first document together with its category
print(f"Пример текста:\n{texts[0][:200]}...")
print(f"\nМетка категории: {labels[0]} ({data.target_names[labels[0]]})")

Пример текста:
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were ...

Метка категории: 7 (rec.autos)


In [None]:
!pip install --upgrade numpy gensim

Collecting numpy
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m


In [None]:
import numpy
import gensim

# Report the library versions that the kernel actually loaded
print(f"NumPy: {numpy.__version__}")
print(f"Gensim: {gensim.__version__}")

NumPy: 1.26.4
Gensim: 4.3.3


word2vec

In [None]:
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK stop-word corpus (reported as up-to-date when already present)
nltk.download('stopwords')

# Keep the English stop words in a set for fast membership tests during filtering
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def preprocess(text):
    """Tokenize ``text`` and drop stop words.

    The text is split into tokens by gensim's ``simple_preprocess``; every
    token that appears in the module-level ``stop_words`` set is discarded.

    Args:
        text: Raw document string.

    Returns:
        List of the remaining tokens, in their original order.
    """
    candidate_tokens = simple_preprocess(text)
    return [token for token in candidate_tokens if token not in stop_words]

# One token list per document, in the same order as `texts`
tokenized_texts = list(map(preprocess, texts))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Make sure preprocessing has been done: reuse the stop-word-filtered tokens
# produced by `preprocess` in the preprocessing cell. Re-tokenizing here with
# bare `simple_preprocess` would silently overwrite them and put the stop words
# back into the corpus used by every later cell.
from gensim.utils import simple_preprocess
assert len(tokenized_texts) == len(texts), (
    "tokenized_texts is out of sync with texts - run the preprocessing cell first"
)

# Train the model
from gensim.models import Word2Vec

model_w2v = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    epochs=10,
    sg=1,  # use skip-gram
    # Fixed seed for repeatable initialisation.
    # NOTE(review): per the gensim docs, fully deterministic training also needs
    # workers=1 (and a fixed PYTHONHASHSEED); kept at 4 workers here for speed.
    seed=42,
)

# Quick check: nearest neighbours of a common word
print(model_w2v.wv.most_similar("computer", topn=3))

[('aided', 0.62911456823349), ('shopper', 0.6050333380699158), ('isdn', 0.5961162447929382)]


In [None]:
import numpy as np

def text_to_vector(tokens, model):
    """Represent a tokenized document as the mean of its word vectors.

    Args:
        tokens: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``; a zero vector of length ``model.vector_size`` when none
        of the tokens is known (or ``tokens`` is empty).
    """
    known_vectors = []
    for token in tokens:
        # Tokens missing from the vocabulary are skipped
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged Word2Vec vector per document, stacked into a single array
doc_vectors = [text_to_vector(doc_tokens, model_w2v) for doc_tokens in tokenized_texts]
X_w2v = np.array(doc_vectors)

# Targets are the category labels loaded with the dataset
y = labels

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Hold out 30% of the documents for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X_w2v, y, test_size=0.3, random_state=42)

# Seed the forest too: without random_state its randomised training gives
# different scores on every run, so the report below could not be reproduced.
clf_w2v = RandomForestClassifier(n_estimators=100, random_state=42)
clf_w2v.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out part
y_pred = clf_w2v.predict(X_test)
print("Word2Vec + RandomForest:")
print(classification_report(y_test, y_pred, target_names=data.target_names))

Word2Vec + RandomForest:
                          precision    recall  f1-score   support

             alt.atheism       0.44      0.55      0.49       135
           comp.graphics       0.46      0.52      0.49       166
 comp.os.ms-windows.misc       0.55      0.49      0.52       170
comp.sys.ibm.pc.hardware       0.51      0.50      0.50       182
   comp.sys.mac.hardware       0.50      0.39      0.44       183
          comp.windows.x       0.67      0.75      0.71       169
            misc.forsale       0.63      0.70      0.67       172
               rec.autos       0.36      0.50      0.42       191
         rec.motorcycles       0.51      0.51      0.51       198
      rec.sport.baseball       0.60      0.73      0.66       168
        rec.sport.hockey       0.76      0.64      0.70       163
               sci.crypt       0.82      0.67      0.74       195
         sci.electronics       0.56      0.50      0.53       177
                 sci.med       0.71      0.73     

TfidfVectorizer


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over at most 5000 features, with English stop words removed by the vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# The vectorizer expects strings, so each token list is joined back with spaces
joined_docs = [" ".join(doc_tokens) for doc_tokens in tokenized_texts]
X_tfidf = tfidf.fit_transform(joined_docs)

In [None]:
# Same 70/30 split and seed as for the Word2Vec features, so both models see the same documents
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

# Seed the forest too: without random_state its randomised training gives
# different scores on every run, so the report below could not be reproduced.
clf_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
clf_tfidf.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out part
y_pred = clf_tfidf.predict(X_test)
print("\nTF-IDF + RandomForest:")
print(classification_report(y_test, y_pred, target_names=data.target_names))


TF-IDF + RandomForest:
                          precision    recall  f1-score   support

             alt.atheism       0.51      0.47      0.49       135
           comp.graphics       0.55      0.51      0.53       166
 comp.os.ms-windows.misc       0.62      0.65      0.63       170
comp.sys.ibm.pc.hardware       0.59      0.60      0.60       182
   comp.sys.mac.hardware       0.68      0.62      0.65       183
          comp.windows.x       0.64      0.79      0.71       169
            misc.forsale       0.69      0.63      0.66       172
               rec.autos       0.45      0.72      0.56       191
         rec.motorcycles       0.74      0.63      0.68       198
      rec.sport.baseball       0.64      0.67      0.66       168
        rec.sport.hockey       0.73      0.73      0.73       163
               sci.crypt       0.83      0.70      0.76       195
         sci.electronics       0.55      0.54      0.54       177
                 sci.med       0.67      0.72      

In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━