<a href="https://colab.research.google.com/github/farzan2404/Text_classification/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Train and evaluate a TF-IDF + Multinomial Naive Bayes text classifier on the
# 20 Newsgroups dataset.
#
# All imports are gathered at the top of the cell so the cell's dependencies
# are visible at a glance and a fresh-kernel run fails early if one is missing.
import numpy as np
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Newsgroups to load; the same list is passed for both the train and the test
# subset so that the two share one label space.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.space',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.os.ms-windows.misc',
]

# Load the training data (shuffled with a fixed seed so runs are repeatable)
news_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

# Load the test data with the same categories and seed
news_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

# Raw text -> TF-IDF features -> Multinomial Naive Bayes, wrapped in a single
# estimator so that fit/predict can be called directly on lists of strings.
text_clf = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# Train the model
text_clf.fit(news_train.data, news_train.target)

# Predict the test cases
predicted = text_clf.predict(news_test.data)

# Fraction of test documents whose predicted label equals the true label.
# accuracy_score was already imported here; use it instead of a hand-rolled mean.
accuracy = accuracy_score(news_test.target, predicted)
print('Accuracy achieved is ' + str(accuracy))

# Per-class precision / recall / F1. (The original line ended with a stray
# trailing comma, which turned the statement into a discarded 1-tuple.)
print(metrics.classification_report(news_test.target, predicted, target_names=news_test.target_names))

# Last expression of the cell: the confusion matrix is shown as the cell output.
metrics.confusion_matrix(news_test.target, predicted)

Accuracy achieved is 0.7738980350504514
                          precision    recall  f1-score   support

             alt.atheism       0.80      0.52      0.63       319
           comp.graphics       0.81      0.65      0.72       389
 comp.os.ms-windows.misc       0.82      0.65      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.78      0.72       392
   comp.sys.mac.hardware       0.86      0.77      0.81       385
          comp.windows.x       0.89      0.75      0.82       395
            misc.forsale       0.93      0.69      0.80       390
               rec.autos       0.85      0.92      0.88       396
         rec.motorcycles       0.94      0.93      0.93       398
      rec.sport.baseball       0.92      0.90      0.91       397
        rec.sport.hockey       0.89      0.97      0.93       399
               sci.crypt       0.59      0.97      0.74       396
         sci.electronics       0.84      0.60      0.70       393
                 sci.med       0.92

array([[166,   0,   0,   1,   0,   1,   0,   0,   1,   1,   1,   3,   0,
          6,   3, 123,   4,   8,   0,   1],
       [  1, 252,  15,  12,   9,  18,   1,   2,   1,   5,   2,  41,   4,
          0,   6,  15,   4,   1,   0,   0],
       [  0,  14, 258,  45,   3,   9,   0,   2,   1,   3,   2,  25,   1,
          0,   6,  23,   2,   0,   0,   0],
       [  0,   5,  11, 305,  17,   1,   3,   6,   1,   0,   2,  19,  13,
          0,   5,   3,   1,   0,   0,   0],
       [  0,   3,   8,  23, 298,   0,   3,   8,   1,   3,   1,  16,   8,
          0,   2,   8,   3,   0,   0,   0],
       [  1,  21,  17,  13,   2, 298,   1,   0,   1,   1,   0,  23,   0,
          1,   4,  10,   2,   0,   0,   0],
       [  0,   1,   3,  31,  12,   1, 271,  19,   4,   4,   6,   5,  12,
          6,   3,   9,   3,   0,   0,   0],
       [  0,   1,   0,   3,   0,   0,   4, 364,   3,   2,   2,   4,   1,
          1,   3,   3,   4,   0,   1,   0],
       [  0,   0,   0,   1,   0,   0,   2,  10, 371,   0,   0,  

In [6]:
# Sanity-check the trained pipeline on a handful of hand-written sentences
# that are not part of the 20 Newsgroups corpus.
additional_test_data = [
    "The new Apple Macbook Pro is a powerful machine for graphics design.",
    "Soccer is a popular sport in many countries.",
    "I'm selling my old computer monitor online.",
    "I'm considering buying a new motorcycle.",
    "The recent space exploration mission was a significant achievement.",
    "I'm interested in learning more about computer security.",
    "Electric cars are becoming increasingly popular.",
    "The political situation in the Middle East is complex and sensitive.",
    "I want to sell my collection of vintage vinyl records.",
    "I enjoy watching baseball games in my free time."
]

# Use the trained model to predict the categories for the additional test data
additional_predicted = text_clf.predict(additional_test_data)

# Map the predicted category IDs to category names.
# The model was fitted on news_train.target, so the IDs it emits index into
# news_train.target_names. Decoding through the training set keeps the mapping
# correct even if the test subset is ever loaded with a different category list.
additional_predicted_category_names = [news_train.target_names[i] for i in additional_predicted]

# Print the predictions for the additional test data
for data, category in zip(additional_test_data, additional_predicted_category_names):
    print(f"Text: {data}")
    print(f"Predicted Category: {category}\n")


Text: The new Apple Macbook Pro is a powerful machine for graphics design.
Predicted Category: comp.sys.mac.hardware

Text: Soccer is a popular sport in many countries.
Predicted Category: rec.sport.hockey

Text: I'm selling my old computer monitor online.
Predicted Category: misc.forsale

Text: I'm considering buying a new motorcycle.
Predicted Category: rec.motorcycles

Text: The recent space exploration mission was a significant achievement.
Predicted Category: sci.space

Text: I'm interested in learning more about computer security.
Predicted Category: sci.crypt

Text: Electric cars are becoming increasingly popular.
Predicted Category: rec.autos

Text: The political situation in the Middle East is complex and sensitive.
Predicted Category: talk.politics.mideast

Text: I want to sell my collection of vintage vinyl records.
Predicted Category: misc.forsale

Text: I enjoy watching baseball games in my free time.
Predicted Category: rec.sport.baseball

