### The Stories JSON dataset is read with the encoding set to UTF-8

In [80]:
import json

# Load the stories dataset, decoding the file explicitly as UTF-8 so the
# result does not depend on the platform's default encoding
with open('stories.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Collect the topic name of every story: each story carries a "topic" object
# whose "name" property is the label we want. The resulting list follows the
# iteration order of `data`, one entry per story.
topic_names = [story['topic']['name'] for story in data]


In [81]:
# Inspect the topic names collected above (one entry per story)
topic_names

['People',
 'Odd Stuff',
 'Pets & Animals',
 'General Sciences',
 'Travel & Places',
 'Software',
 'Political News',
 'Odd Stuff',
 'Odd Stuff',
 'Space',
 'Space',
 'Hardware',
 'Tech Industry News',
 'Comedy',
 'Music',
 'Tech Industry News',
 'Television',
 'Microsoft',
 'PC Games',
 'Space',
 'Environment',
 'Music',
 'Tech Industry News',
 'Tech Industry News',
 'Travel & Places',
 'Political News',
 'Microsoft',
 'People',
 'People',
 'World News',
 'Arts & Culture',
 'Business & Finance',
 'World News',
 'Comics & Animation',
 'Television',
 'Basketball',
 'Movies',
 'Apple',
 'Hardware',
 'Business & Finance',
 'Odd Stuff',
 'Business & Finance',
 'Travel & Places',
 'World News',
 'Space',
 'Tech Industry News',
 'General Sciences',
 'General Sciences',
 'Political News',
 'Celebrity',
 'Health',
 'Comedy',
 'Apple',
 'Odd Stuff',
 'Autos',
 'World News',
 'People',
 'Pets & Animals',
 'Political News',
 'Health',
 'Tech Industry News',
 'Political Opinion',
 'Television',
 'P

In [82]:
from sklearn.feature_extraction.text import CountVectorizer

# Extract the story descriptions (one text per story, same order as `data`)
descriptions = [story['description'] for story in data]

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the descriptions and transform them into a
# bag-of-words representation (a sparse document-term count matrix)
features = vectorizer.fit_transform(descriptions)

# Print the feature names (words).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old method
# only on installations that predate the new one. `.tolist()` keeps
# `feature_names` a plain list of str, as the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print("Feature Names:")
print(feature_names)

# Print the feature matrix.
# NOTE: `toarray()` materialises the full dense matrix (documents x vocabulary)
# in memory; it is used here only for display. Keep `features` sparse for
# any downstream modelling.
print("Feature Matrix:")
print(features.toarray())



Feature Names:
Feature Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [83]:
from sklearn.preprocessing import LabelEncoder

# Convert the string topic names into integer class ids
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(topic_names)

# Build a name -> id lookup by pairing every topic name with its encoded
# value; a topic that occurs many times is simply re-assigned the same id,
# so the dict ends up with one entry per distinct topic (in first-seen order)
label_mapping = dict(zip(topic_names, encoded_labels))
print("Label Mapping:")
print(label_mapping)


Label Mapping:
{'People': 33, 'Odd Stuff': 29, 'Pets & Animals': 34, 'General Sciences': 17, 'Travel & Places': 47, 'Software': 42, 'Political News': 37, 'Space': 43, 'Hardware': 19, 'Tech Industry News': 44, 'Comedy': 8, 'Music': 27, 'Television': 45, 'Microsoft': 23, 'PC Games': 32, 'Environment': 12, 'World News': 48, 'Arts & Culture': 2, 'Business & Finance': 6, 'Comics & Animation': 9, 'Basketball': 5, 'Movies': 26, 'Apple': 1, 'Celebrity': 7, 'Health': 20, 'Autos': 3, 'Political Opinion': 38, 'Educational': 11, 'Security': 40, 'Gadgets': 15, 'Gaming Industry News': 16, 'Other Sports': 31, 'Design': 10, 'Linux/Unix': 22, 'Olympics': 30, 'Soccer': 41, 'Food & Drink': 14, 'Nintendo': 28, 'Xbox': 49, 'American & Canadian Football': 0, 'PlayStation': 35, 'Hockey': 21, 'Baseball': 4, 'Tennis': 46, 'Extreme': 13, 'Programming': 39, 'Playable Web Games': 36, 'Mods': 24, 'Motorsport': 25, 'Golf': 18}


In [84]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


# Extract the story descriptions and their topic names in parallel lists,
# so that descriptions[i] is labelled by topic_names[i]
descriptions = []
topic_names = []

for story in data:
    description = story['description']
    topic_name = story['topic']['name']
    descriptions.append(description)
    topic_names.append(topic_name)

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on the descriptions and transform them into a bag-of-words representation
features = vectorizer.fit_transform(descriptions)

# Create an instance of the Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier on the features and the topic names.
# The raw topic-name strings are used as class labels here (not the integer
# labels from a LabelEncoder), so predictions come back as readable names.
classifier.fit(features, topic_names)

# Example: Make predictions for new data instances
new_data = [
    "Lebron",
    "Cats",
    "Messi"
]
new_features = vectorizer.transform(new_data)  # Apply the same vectorizer used for training

# Use the trained classifier for prediction
predicted_topics = classifier.predict(new_features)

# Print the predicted topics.
# The loop variable is deliberately NOT called `data`: that name holds the
# loaded stories, and rebinding it here would break any later re-run of the
# cells that iterate over `data`.
print("Predicted Topics:")
for text, predicted_topic in zip(new_data, predicted_topics):
    print(f"Data: {text} | Predicted Topic: {predicted_topic}")


Predicted Topics:
Data: Lebron | Predicted Topic: Basketball
Data: Cats | Predicted Topic: Pets & Animals
Data: Messi | Predicted Topic: World News


In [71]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into training and testing sets (80% / 20%, fixed seed so the
# split is reproducible).
# NOTE(review): `features` was produced by a vectorizer fitted on the whole
# corpus, so the vocabulary has already seen the test descriptions. For a
# stricter evaluation, split the raw descriptions first and fit the
# vectorizer on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(features, topic_names, test_size=0.2, random_state=42)

# Create an instance of the Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier on the training set
classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = classifier.predict(X_test)

# Calculate evaluation metrics.
# With many topics, some labels are never predicted (or never occur) in the
# test split, which makes their per-class precision/recall undefined.
# `zero_division=0` scores those classes as 0 -- the same value the default
# uses -- without emitting an UndefinedMetricWarning on every run.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print the evaluation metrics
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Evaluation Metrics:
Accuracy: 0.4393092940578974
Precision: 0.6004689723030269
Recall: 0.4393092940578974
F1-score: 0.39898164562144206


  _warn_prf(average, modifier, msg_start, len(result))
