Способ 1. На основе CountVectorizer или TfidfVectorizer.

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# 加载数据集
newsgroups_data = fetch_20newsgroups(subset='all')
X, y = newsgroups_data.data, newsgroups_data.target

# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 使用 CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# 训练模型
clf_count = MultinomialNB()
clf_count.fit(X_train_counts, y_train)

# 预测和评估
y_pred_count = clf_count.predict(X_test_counts)
accuracy_count = accuracy_score(y_test, y_pred_count)
print(f"CountVectorizer 准确率: {accuracy_count}")

# 使用 TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

# 训练模型
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, y_train)

# 预测和评估
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f"TfidfVectorizer 准确率: {accuracy_tfidf}")




CountVectorizer 准确率: 0.8511936339522547
TfidfVectorizer 准确率: 0.8474801061007957


Способ 2. На основе моделей Glove.

In [22]:
import os
import requests
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 下载并解压 GloVe 模型
def download_glove_model():
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    zip_filename = "glove.6B.zip"
    glove_filename = "glove.6B.300d.txt"
    if not os.path.exists(glove_filename):
        if not os.path.exists(zip_filename):
            print(f"Downloading GloVe model...")
            response = requests.get(url, stream=True)
            with open(zip_filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            print(f"Downloaded {zip_filename}")
        
        print(f"Extracting {zip_filename}...")
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall()
        print(f"Extracted GloVe model")

# 加载预训练的 GloVe 词向量
def load_glove_model(glove_file):
    glove_model = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    return glove_model

# 检查并下载 GloVe 模型
download_glove_model()

# 请确保您将路径更改为 GloVe 模型文件的实际路径
glove_model = load_glove_model('glove.6B.300d.txt')

# 其余代码与之前相同...


Downloading GloVe model...
Downloaded glove.6B.zip
Extracting glove.6B.zip...
Extracted GloVe model


In [23]:
import os
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 加载预训练的 GloVe 词向量
def load_glove_model(glove_file):
    glove_model = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    return glove_model

# 请确保你已经下载并解压了 GloVe 模型
glove_model = load_glove_model('glove.6B.300d.txt')

# 加载数据集
newsgroups_data = fetch_20newsgroups(subset='all')
X, y = newsgroups_data.data, newsgroups_data.target

# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 将文本转换为 GloVe 向量
def text_to_glove_vector(text, model):
    words = text.split()
    word_vecs = [model[word] for word in words if word in model]
    if len(word_vecs) == 0:
        return np.zeros(300)
    else:
        return np.mean(word_vecs, axis=0)

# 转换训练和测试数据
X_train_vec = np.array([text_to_glove_vector(text, glove_model) for text in X_train])
X_test_vec = np.array([text_to_glove_vector(text, glove_model) for text in X_test])

# 训练模型
clf_glove = RandomForestClassifier(n_estimators=100, random_state=42)
clf_glove.fit(X_train_vec, y_train)

# 预测和评估
y_pred_glove = clf_glove.predict(X_test_vec)
accuracy_glove = accuracy_score(y_test, y_pred_glove)
print(f"GloVe 准确率: {accuracy_glove}")



GloVe 准确率: 0.5381962864721486


Сравните качество полученных моделей.

In [24]:
print(f"CountVectorizer 准确率: {accuracy_count}")
print(f"TfidfVectorizer 准确率: {accuracy_tfidf}")
print(f"GloVe 准确率: {accuracy_glove}")


CountVectorizer 准确率: 0.8511936339522547
TfidfVectorizer 准确率: 0.8474801061007957
GloVe 准确率: 0.5381962864721486
