In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the dataset
# `categories = None` requests every newsgroup; replace it with a list such as
# ['rec.autos', 'sci.med'] to work on a subset instead.
categories = None
# Fetch the train split first, then the test split, with identical settings.
# The `remove` tuple asks the loader to drop headers, footers and quoted text.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(
        subset=split,
        categories=categories,
        remove=('headers', 'footers', 'quotes'),
    )
    for split in ('train', 'test')
)

# Step 2: Turn raw text into numerical features with TF-IDF
# English stop words are filtered out and max_features is set to 5000
# (a knob worth tuning).
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Step 3: Configure the RandomForestClassifier
# Nothing is trained here; fitting happens when the pipeline is fitted below.
clf = RandomForestClassifier(
    n_estimators=200,  # 200 trees in the forest
    n_jobs=-1,         # parallelise across all available cores
    random_state=42,   # fixed seed so reruns give the same forest
)

# Step 4: Chain vectoriser and classifier into one estimator, then fit it
# on the training documents and their labels.
pipeline = Pipeline(steps=[('tfidf', vectorizer), ('clf', clf)])
pipeline.fit(newsgroups_train.data, newsgroups_train.target)

# Step 5: Evaluate on the held-out test split
y_pred = pipeline.predict(newsgroups_test.data)
accuracy = accuracy_score(y_true=newsgroups_test.target, y_pred=y_pred)

# Print overall accuracy followed by the per-class report
print(f"Accuracy: {accuracy:.4f}")
print(
    classification_report(
        y_true=newsgroups_test.target,
        y_pred=y_pred,
        target_names=newsgroups_test.target_names,
    )
)


Accuracy: 0.5905
                          precision    recall  f1-score   support

             alt.atheism       0.41      0.35      0.38       319
           comp.graphics       0.57      0.57      0.57       389
 comp.os.ms-windows.misc       0.56      0.61      0.58       394
comp.sys.ibm.pc.hardware       0.58      0.54      0.56       392
   comp.sys.mac.hardware       0.62      0.61      0.62       385
          comp.windows.x       0.66      0.62      0.64       395
            misc.forsale       0.68      0.70      0.69       390
               rec.autos       0.41      0.66      0.50       396
         rec.motorcycles       0.68      0.63      0.66       398
      rec.sport.baseball       0.68      0.72      0.70       397
        rec.sport.hockey       0.79      0.80      0.79       399
               sci.crypt       0.78      0.61      0.68       396
         sci.electronics       0.42      0.43      0.42       393
                 sci.med       0.62      0.62      0.62   