In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load JSON data into a DataFrame
data = pd.read_json('news.json')

# Display unique categories for debugging
print("Unique Categories in Dataset:", data['category'].unique())

# Extract features (text) and labels (categories)
X = data['title'].str.lower()  # Convert text to lowercase
y = data['category']

# Text preprocessing: Convert text to TF-IDF features with n-grams
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_df=0.8, min_df=2)
X_transformed = vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.3, random_state=42)

# Use Grid Search to optimize SVM hyperparameters
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best SVM model
best_model = grid_search.best_estimator_
print(f"Best Parameters: {grid_search.best_params_}")

# Predict on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# Function to predict category of new instances
def predict_new_instance(new_titles):
    new_X_transformed = vectorizer.transform(new_titles)
    predictions = best_model.predict(new_X_transformed)
    return predictions

# Example new instances to classify
new_instances = [
    "New legislation passed for climate change",  # Expected: politics
    "Local team wins championship game",          # Expected: sports
    "Economic growth forecast for the next quarter",  # Expected: economics
    "Social media campaign raises awareness for mental health",  # Expected: social
]

# True labels for new instances
true_labels = ['politics', 'sports', 'economics', 'social']

# Make predictions for the new instances
predicted_categories = predict_new_instance(new_instances)

# Display predictions and calculate accuracy
for title, predicted, true in zip(new_instances, predicted_categories, true_labels):
    print(f"Title: '{title}' -> Predicted Category: '{predicted}', True Category: '{true}'")

# Calculate accuracy for new instances
new_accuracy = accuracy_score(true_labels, predicted_categories)
print("Accuracy for New Instances:", new_accuracy)


Unique Categories in Dataset: ['politics' 'sports' 'economics' 'social']
Best Parameters: {'C': 1, 'kernel': 'linear'}

Classification Report:
               precision    recall  f1-score   support

   economics       1.00      0.33      0.50         3
    politics       1.00      0.67      0.80         3
      social       0.25      1.00      0.40         2
      sports       1.00      0.25      0.40         4

    accuracy                           0.50        12
   macro avg       0.81      0.56      0.53        12
weighted avg       0.88      0.50      0.53        12

Accuracy: 0.5
Title: 'New legislation passed for climate change' -> Predicted Category: 'politics', True Category: 'politics'
Title: 'Local team wins championship game' -> Predicted Category: 'social', True Category: 'sports'
Title: 'Economic growth forecast for the next quarter' -> Predicted Category: 'economics', True Category: 'economics'
Title: 'Social media campaign raises awareness for mental health' -> Predicte