Prepare Data

In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [2]:
from sklearn.datasets import fetch_20newsgroups
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Extract data and labels
data = newsgroups.data
labels = newsgroups.target
target_names = newsgroups.target_names
df = pd.DataFrame({
    'text': data,
    'label': labels
})

print(df.head())


                                                text  label
0  \n\nI am sure some bashers of Pens fans are pr...     10
1  My brother is in the market for a high-perform...      3
2  \n\n\n\n\tFinally you said what you dream abou...     17
3  \nThink!\n\nIt's the SCSI card doing the DMA t...      3
4  1)    I have an old Jasmine drive which I cann...      4


Text Vectorization

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=420, stratify=df['label']
)
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


SVM

In [14]:
from sklearn.svm import LinearSVC

svm = LinearSVC()
svm.fit(X_train_vec, y_train)

# 6. Predict and evaluate
y_pred = svm.predict(X_test_vec)

print("SVM Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))

SVM Accuracy: 0.7636604774535809
                          precision    recall  f1-score   support

             alt.atheism       0.65      0.63      0.64       160
           comp.graphics       0.73      0.74      0.73       195
 comp.os.ms-windows.misc       0.73      0.73      0.73       197
comp.sys.ibm.pc.hardware       0.76      0.74      0.75       196
   comp.sys.mac.hardware       0.77      0.76      0.76       193
          comp.windows.x       0.85      0.83      0.84       198
            misc.forsale       0.79      0.87      0.83       195
               rec.autos       0.55      0.84      0.66       198
         rec.motorcycles       0.84      0.76      0.80       199
      rec.sport.baseball       0.90      0.76      0.83       199
        rec.sport.hockey       0.89      0.86      0.88       200
               sci.crypt       0.81      0.77      0.79       198
         sci.electronics       0.76      0.74      0.75       197
                 sci.med       0.85      0

KNN

In [15]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_vec, y_train)

y_pred = knn.predict(X_test_vec)

print("KNN Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=target_names))

KNN Accuracy: 0.1156498673740053
                          precision    recall  f1-score   support

             alt.atheism       0.08      0.24      0.12       160
           comp.graphics       0.09      0.21      0.13       195
 comp.os.ms-windows.misc       0.08      0.25      0.12       197
comp.sys.ibm.pc.hardware       0.11      0.13      0.12       196
   comp.sys.mac.hardware       0.07      0.13      0.09       193
          comp.windows.x       0.25      0.08      0.12       198
            misc.forsale       0.19      0.13      0.16       195
               rec.autos       0.03      0.05      0.04       198
         rec.motorcycles       0.13      0.08      0.10       199
      rec.sport.baseball       0.09      0.12      0.10       199
        rec.sport.hockey       0.23      0.12      0.16       200
               sci.crypt       0.17      0.10      0.12       198
         sci.electronics       0.15      0.06      0.08       197
                 sci.med       0.24      0

Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Train the model
rf = RandomForestClassifier(n_estimators=100, random_state=420)
rf.fit(X_train_vec, y_train)

# 6. Predict and evaluate
y_pred_rf = rf.predict(X_test_vec)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=target_names))

Random Forest Accuracy: 0.6424403183023872
                          precision    recall  f1-score   support

             alt.atheism       0.50      0.35      0.41       160
           comp.graphics       0.57      0.61      0.59       195
 comp.os.ms-windows.misc       0.59      0.70      0.64       197
comp.sys.ibm.pc.hardware       0.61      0.64      0.62       196
   comp.sys.mac.hardware       0.68      0.64      0.66       193
          comp.windows.x       0.78      0.72      0.75       198
            misc.forsale       0.71      0.82      0.76       195
               rec.autos       0.44      0.73      0.55       198
         rec.motorcycles       0.57      0.66      0.61       199
      rec.sport.baseball       0.64      0.72      0.68       199
        rec.sport.hockey       0.80      0.81      0.80       200
               sci.crypt       0.71      0.68      0.70       198
         sci.electronics       0.63      0.42      0.50       197
                 sci.med       0

Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

# 5. Decision Tree
dt = DecisionTreeClassifier(max_depth=20, random_state=420)
dt.fit(X_train_vec, y_train)
y_pred_dt = dt.predict(X_test_vec)

print("\n=== Decision Tree ===")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt, target_names=target_names))



=== Decision Tree ===
Accuracy: 0.29310344827586204
                          precision    recall  f1-score   support

             alt.atheism       0.23      0.07      0.11       160
           comp.graphics       0.69      0.17      0.27       195
 comp.os.ms-windows.misc       0.68      0.42      0.52       197
comp.sys.ibm.pc.hardware       0.52      0.15      0.24       196
   comp.sys.mac.hardware       0.70      0.40      0.51       193
          comp.windows.x       0.75      0.18      0.29       198
            misc.forsale       0.74      0.38      0.50       195
               rec.autos       0.70      0.35      0.47       198
         rec.motorcycles       0.90      0.23      0.36       199
      rec.sport.baseball       0.81      0.11      0.19       199
        rec.sport.hockey       0.61      0.53      0.57       200
               sci.crypt       0.93      0.43      0.59       198
         sci.electronics       0.06      0.01      0.01       197
                 sci.m