In [1]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [18]:
from nltk.stem import WordNetLemmatizer

# Step 1: Load the dataset
dataset_path = '20_newsgroups'
folders = [folder for folder in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, folder))]
classes = []
texts = []

# Define the classes we are interested in
target_classes = ["comp.graphics", "misc.forsale", "rec.sport.baseball", 
                  "soc.religion.christian", "talk.politics.guns"]

for i, folder in enumerate(folders):
    # If the folder is not one of the target classes, continue to the next iteration
    if folder not in target_classes:
        continue
    
    files = os.listdir(os.path.join(dataset_path, folder))
    for file in files:
        with open(os.path.join(dataset_path, folder, file), 'r') as f:
            text = f.read()
            texts.append(text)
            classes.append(folder)


In [19]:
# Step 2: Text Preprocessing
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize text into words
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text


In [4]:
for i in range(len(texts)):
    texts[i]= preprocess_text(texts[i])

In [20]:
# Step 3: Feature Extraction based on TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, list(classes), test_size=0.2, random_state=42)



In [22]:
from sklearn.svm import SVC
svm = SVC(kernel='rbf', C=1.0)
svm.fit(X_train, y_train)


In [23]:
from sklearn.metrics import precision_score

y_pred = svm.predict(X_test)
precision = precision_score(y_test, y_pred, average='weighted')
print("Precision:", precision)

Precision: 0.9960238956564538


In [24]:
from sklearn.metrics import classification_report


y_pred = svm.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)

Classification Report:
                        precision    recall  f1-score   support

         comp.graphics       0.99      1.00      1.00       213
          misc.forsale       1.00      0.99      1.00       205
    rec.sport.baseball       0.99      1.00      1.00       188
soc.religion.christian       1.00      0.99      1.00       197
    talk.politics.guns       0.99      0.99      0.99       197

              accuracy                           1.00      1000
             macro avg       1.00      1.00      1.00      1000
          weighted avg       1.00      1.00      1.00      1000

