In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the 20 Newsgroups dataset (subset='all' = train and test portions together)
# NOTE(review): posts are loaded with headers, footers and quotes intact. The
# scikit-learn docs warn that classifiers can key on that metadata; pass
# remove=('headers', 'footers', 'quotes') for a stricter benchmark -- confirm intent.
df = fetch_20newsgroups(subset="all")  # a scikit-learn Bunch, not a DataFrame
X, y, labels = df.data, df.target, df.target_names

# Step 2: Hold out 20% of the posts for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 3A: Create pipeline for LinearSVC (TF-IDF features -> linear SVM)
# The vectorizer sits inside the pipeline, so it is fitted on X_train only and
# merely applied to X_test when predicting.
# random_state is pinned to the same seed as the split: LinearSVC's dual
# coordinate-descent solver shuffles the data with a pseudo-random generator,
# so an unseeded model can give slightly different results on every run.
svm = make_pipeline(
    TfidfVectorizer(stop_words='english', max_features=20000),
    LinearSVC(random_state=42),
)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Step 3B: Same TF-IDF front end, with Logistic Regression as the classifier
# (all estimator settings left at scikit-learn's defaults).
model = make_pipeline(
    TfidfVectorizer(max_features=20000, stop_words="english"),
    LogisticRegression(),
).fit(X_train, y_train)  # Pipeline.fit returns the fitted pipeline itself
y_pred_lr = model.predict(X_test)

# Step 4: Evaluation - per-class report and overall accuracy for BOTH models
# Each model gets the same two outputs so they can be compared class by class,
# not only on the single accuracy figure.
print("SVM Classification Report:\n", classification_report(y_test, y_pred_svm, target_names=labels))
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(
    "Logistic Regression Classification Report:\n",
    classification_report(y_test, y_pred_lr, target_names=labels),
)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))




SVM Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.93      0.93      0.93       151
           comp.graphics       0.82      0.88      0.85       202
 comp.os.ms-windows.misc       0.88      0.86      0.87       195
comp.sys.ibm.pc.hardware       0.74      0.79      0.76       183
   comp.sys.mac.hardware       0.88      0.89      0.89       205
          comp.windows.x       0.90      0.88      0.89       215
            misc.forsale       0.89      0.84      0.86       193
               rec.autos       0.92      0.94      0.93       196
         rec.motorcycles       0.97      0.95      0.96       168
      rec.sport.baseball       0.97      0.99      0.98       211
        rec.sport.hockey       0.97      0.97      0.97       198
               sci.crypt       0.98      0.98      0.98       201
         sci.electronics       0.93      0.85      0.89       202
                 sci.med       0.94      0.96  