In [None]:
# Apply support vector machine for text classification
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# 1. Download NLTK resources (if not already downloaded).
# `word_tokenize` relies on the Punkt sentence tokenizer. NLTK releases up to
# 3.8.1 look that model up under the name "punkt"; later releases look it up
# under "punkt_tab" and raise LookupError when only "punkt" is installed.
# Both are requested so the script works on either side of that change.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list read by `stopwords.words("english")` further down.
nltk.download('stopwords')

# 2. Sample data (replace with your own dataset)
# The toy corpus is written down grouped by class label; `documents` is then
# flattened into the list of (text, label) tuples the rest of the script reads.
# Dict insertion order keeps the "technology" samples ahead of "business".
_SAMPLE_TEXTS_BY_LABEL = {
    "technology": (
        "This is a document about machine learning.",
        "Another document discussing artificial intelligence.",
        "This document focuses on natural language processing.",
        "A document related to deep learning and neural networks.",
        "This document explores the field of data science.",
    ),
    "business": (
        "This is a news article about the economy.",
        "The stock market is experiencing volatility.",
        "Financial reports indicate a recession.",
        "The company announced a merger.",
        "Consumer spending is on the rise.",
    ),
}
documents = [
    (text, label)
    for label, texts in _SAMPLE_TEXTS_BY_LABEL.items()
    for text in texts
]

# 3. Preprocessing
stop_words = set(stopwords.words("english"))


def _normalize(text):
    """Return *text* reduced to its lowercase, non-stop-word alphabetic tokens.

    The text is tokenized with `word_tokenize`; tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) are dropped, the rest are
    lowercased, and any that appear in `stop_words` are removed. The surviving
    tokens are joined with single spaces.
    """
    alphabetic = (token for token in word_tokenize(text) if token.isalpha())
    lowered = (token.lower() for token in alphabetic)
    return " ".join(token for token in lowered if token not in stop_words)


# One cleaned string and one label per entry of `documents`, in the same order.
processed_docs = [_normalize(text) for text, _ in documents]
labels = [pair[1] for pair in documents]

# 4. Split data into training and testing sets
# The split is done on the cleaned text *before* TF-IDF is fitted, so the
# vocabulary and IDF weights are learned from training documents only; fitting
# on the full corpus first would leak information from the test documents into
# the features. `stratify=labels` keeps the class proportions the same in both
# partitions, which matters with this few samples (it requires every class to
# have at least two documents).
train_docs, test_docs, y_train, y_test = train_test_split(
    processed_docs, labels, test_size=0.2, random_state=42, stratify=labels
)

# 5. Feature extraction using TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_docs)  # learn vocabulary + IDF here
X_test = vectorizer.transform(test_docs)  # reuse the training vocabulary/IDF

# Whole-corpus matrix in the same (training-fitted) feature space, kept under
# its original name for any later code that still refers to `features`.
features = vectorizer.transform(processed_docs)

# 6. Train the SVM model
# `fit` returns the estimator itself, so construction and training are chained.
# The kernel is linear here; other kernels can be tried via the `kernel` argument.
svm_model = SVC(kernel="linear").fit(X_train, y_train)

# 7. Make predictions on the test set
predictions = svm_model.predict(X_test)

# 8. Evaluate the model
# Both metrics compare the held-out labels with the model's predictions; the
# two calls are independent of each other.
report = classification_report(y_true=y_test, y_pred=predictions)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")