# РК2 по ММО_Энькаэр Уэркэнь_ ИУ5И-21М

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


In [None]:
# Load the complete 20 Newsgroups corpus (20 news categories). The `remove`
# argument asks the loader to strip headers, footers and quoted replies.
data = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
# Documents and their category labels.
X, y = data.data, data.target

In [None]:
# Split into training and test sets: 20% is held out for testing and the
# fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [None]:
# CountVectorizer features: English stop words are dropped and the
# vocabulary is limited with max_features=10000.
vectorizer_count = CountVectorizer(
    stop_words='english',
    max_features=10000,
)
# Fit on the training documents only; the test documents are transformed
# with the already-fitted vectorizer.
X_train_count = vectorizer_count.fit_transform(X_train)
X_test_count = vectorizer_count.transform(X_test)


In [None]:
# TfidfVectorizer features, configured like the count vectorizer: English
# stop words are dropped and the vocabulary is limited with max_features=10000.
vectorizer_tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
)
# Fit on the training documents only; the test documents are transformed
# with the already-fitted vectorizer.
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)


In [None]:
# Classifiers to compare: an SVC and a LogisticRegression for each of the two
# feature sets (CountVectorizer and TfidfVectorizer). The training loop picks
# the feature set by checking whether the key contains "Count", so the key
# text matters.
# NOTE(review): the recorded run reports accuracy 0.0894 for
# "SVC + CountVectorizer" — the default SVC settings may be a poor fit for raw
# counts; consider confirming the kernel/scaling choice.
models = {}
models["SVC + CountVectorizer"] = SVC()
models["LogisticRegression + CountVectorizer"] = LogisticRegression(max_iter=1000)
models["SVC + TfidfVectorizer"] = SVC()
models["LogisticRegression + TfidfVectorizer"] = LogisticRegression(max_iter=1000)

# Test accuracy per model name, filled in by the training loop.
results = {}

In [None]:
# Train every model on its feature set and evaluate it on the held-out split.
for name, model in models.items():
    print(f"\n=== {name} ===")
    # Keys containing "Count" use the CountVectorizer matrices; every other
    # key uses the TF-IDF matrices.
    train_features, test_features = (
        (X_train_count, X_test_count)
        if "Count" in name
        else (X_train_tfidf, X_test_tfidf)
    )
    model.fit(train_features, y_train)
    preds = model.predict(test_features)
    acc = accuracy_score(y_test, preds)
    print(f"Accuracy: {acc:.4f}")
    # zero_division=0 reports 0 for classes that were never predicted.
    print(classification_report(y_test, preds, zero_division=0))
    # Keep the accuracy so the best combination can be selected afterwards.
    results[name] = acc



=== SVC + CountVectorizer ===
Accuracy: 0.0894
              precision    recall  f1-score   support

           0       1.00      0.01      0.01       151
           1       0.67      0.03      0.06       202
           2       0.79      0.08      0.14       195
           3       0.42      0.03      0.05       183
           4       0.00      0.00      0.00       205
           5       0.80      0.04      0.07       215
           6       0.80      0.02      0.04       193
           7       1.00      0.01      0.02       196
           8       0.05      1.00      0.09       168
           9       1.00      0.04      0.07       211
          10       0.94      0.08      0.14       198
          11       0.82      0.04      0.08       201
          12       0.00      0.00      0.00       202
          13       1.00      0.02      0.04       194
          14       0.89      0.04      0.08       189
          15       0.63      0.18      0.28       202
          16       0.82      0.05

In [None]:

# Conclusion: report which combination achieved the highest test accuracy.
best_model = max(results, key=lambda label: results[label])
print(f"\n🏆 Лучшая модель: {best_model} с точностью {results[best_model]:.4f}")


🏆 Лучшая модель: LogisticRegression + TfidfVectorizer с точностью 0.7172


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import requests
import zipfile
import io


In [None]:
# Download the SMS Spam Collection archive from the UCI repository and read
# the tab-separated member "SMSSpamCollection" into a DataFrame with the
# columns "label" and "text".
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
# The timeout keeps the cell from hanging indefinitely on a stalled
# connection; raise_for_status() reports HTTP errors here instead of letting
# an error page be parsed as a zip archive.
r = requests.get(url, timeout=30)
r.raise_for_status()
# Context managers close both the archive and the member stream once the
# data has been read.
with zipfile.ZipFile(io.BytesIO(r.content)) as z, z.open("SMSSpamCollection") as sms_file:
    sms_data = pd.read_csv(sms_file, sep='\t', names=["label", "text"])

# Target variable: 1 = spam, 0 = ham.
sms_data['target'] = sms_data['label'].map({'ham': 0, 'spam': 1})

# The message text is the model input and the encoded label is the target.
X, y = sms_data['text'], sms_data['target']

# Split into training and test sets: 20% is held out for testing and the
# fixed seed keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two vectorizers over the same messages, both dropping English stop words:
# CountVectorizer and TfidfVectorizer.
count_vectorizer = CountVectorizer(stop_words='english')
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit each vectorizer on the training messages only.
X_train_count = count_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the test messages with the already-fitted vectorizers.
X_test_count = count_vectorizer.transform(X_test)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Classifiers to compare (Count / TF-IDF x SVC / LogisticRegression). The
# training loop picks the feature set by checking whether the key contains
# "Count", so the key text matters.
models = {}
models["SVC + CountVectorizer"] = SVC()
models["LogReg + CountVectorizer"] = LogisticRegression(max_iter=1000)
models["SVC + TFIDFVectorizer"] = SVC()
models["LogReg + TFIDFVectorizer"] = LogisticRegression(max_iter=1000)

# Test accuracy per model name, filled in by the training loop.
results = {}

# Train every model on its feature set and evaluate it on the held-out split.
for name, model in models.items():
    print(f"\n=== {name} ===")
    # Keys containing "Count" use the CountVectorizer matrices; every other
    # key uses the TF-IDF matrices.
    train_features, test_features = (
        (X_train_count, X_test_count)
        if "Count" in name
        else (X_train_tfidf, X_test_tfidf)
    )
    model.fit(train_features, y_train)
    preds = model.predict(test_features)

    acc = accuracy_score(y_test, preds)
    print(f"Accuracy: {acc:.4f}")
    # zero_division=0 reports 0 for classes that were never predicted.
    print(classification_report(y_test, preds, zero_division=0))
    # Keep the accuracy so the best combination can be selected afterwards.
    results[name] = acc

# Best combination: the model name with the highest test accuracy.
best_model = max(results, key=lambda label: results[label])
print(f"\n🏆 Лучшая модель: {best_model} с точностью {results[best_model]:.4f}")



=== SVC + CountVectorizer ===
Accuracy: 0.9857
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.89      0.94       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115


=== LogReg + CountVectorizer ===
Accuracy: 0.9857
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.89      0.94       149

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115


=== SVC + TFIDFVectorizer ===
Accuracy: 0.9848
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.89      0.94       149

    accuracy                       