# Predicting news categories using Naive Bayes
---
In this project, our main objective is to predict news categories using only headline texts. To achieve this objective, we will use the Naive Bayes classifier.<br/>
The dataset can be found [here](https://www.kaggle.com/datasets/rmisra/news-category-dataset)<br/>
Author: [Gabriel Lins](https://github.com/gabrielblins)


In [1]:
import pandas as pd
import numpy as np
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import classification_report

In [2]:
# Load the news dataset from a parquet file stored relative to the notebook.
DATASET_PATH = 'datasets/news-dataset.parquet'
df = pd.read_parquet(DATASET_PATH)

In [3]:
# Inspect the column names of the loaded dataset.
df.columns

Index(['category', 'headline', 'authors', 'link', 'short_description', 'date'], dtype='object')

In [231]:
# Count the distinct news categories present in the full dataset.
df['category'].nunique()

41

In [230]:
# List every distinct category name in the full dataset.
df['category'].unique()

array(['CRIME', 'ENTERTAINMENT', 'WORLD NEWS', 'IMPACT', 'POLITICS',
       'WEIRD NEWS', 'BLACK VOICES', 'WOMEN', 'COMEDY', 'QUEER VOICES',
       'SPORTS', 'BUSINESS', 'TRAVEL', 'MEDIA', 'TECH', 'RELIGION',
       'SCIENCE', 'LATINO VOICES', 'EDUCATION', 'COLLEGE', 'PARENTS',
       'ARTS & CULTURE', 'STYLE', 'GREEN', 'TASTE', 'HEALTHY LIVING',
       'THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST', 'FIFTY', 'ARTS',
       'WELLNESS', 'PARENTING', 'HOME & LIVING', 'STYLE & BEAUTY',
       'DIVORCE', 'WEDDINGS', 'FOOD & DRINK', 'MONEY', 'ENVIRONMENT',
       'CULTURE & ARTS'], dtype=object)

In [271]:
# For teaching purposes, only six categories are kept.
classes = [
    'COMEDY',
    'SPORTS',
    'BUSINESS',
    'TRAVEL',
    'FOOD & DRINK',
    'STYLE & BEAUTY',
]

In [324]:
# Keep only the rows whose category is one of the selected classes, and only the
# two columns the model needs (label and headline text).
# A boolean mask from `isin` selects rows by position, so it stays correct even if
# the source index contains duplicate labels, and it avoids a Python-level call per row.
selected_rows = df['category'].isin(classes)
df_news = (
    df.loc[selected_rows, ['category', 'headline']]
    .reset_index(drop=True)  # fresh 0..n-1 index; returns a new frame instead of mutating in place
)

In [325]:
# Confirm that only the selected categories remain after filtering.
df_news['category'].unique()

array(['COMEDY', 'SPORTS', 'BUSINESS', 'TRAVEL', 'STYLE & BEAUTY',
       'FOOD & DRINK'], dtype=object)

In [326]:
# Show how many headlines each selected category has (most frequent first).
df_news['category'].value_counts()

TRAVEL            9887
STYLE & BEAUTY    9649
FOOD & DRINK      6226
BUSINESS          5937
COMEDY            5175
SPORTS            4884
Name: category, dtype: int64

In [327]:
# Category names ordered from most to least frequent (captured before encoding).
category_counts = df_news['category'].value_counts()
top_cols = category_counts.index.tolist()

In [328]:
# Encode the category names as integer labels.
# Plain column assignment is used instead of `df_news.loc[:, 'category'] = ...`:
# on pandas >= 2.0 the `.loc[:, col]` form writes into the existing object-dtype
# column, so the labels would stay dtype=object, which scikit-learn classifiers
# can reject as an unknown label type. Assigning the column replaces it with the
# integer array returned by the encoder.
# NOTE: this cell overwrites the text labels, so re-create `df_news` before
# running it a second time; otherwise the encoder is fitted on the integers.
encoder = LabelEncoder()
df_news['category'] = encoder.fit_transform(df_news['category'])
df_news['category'].value_counts()

5    9887
4    9649
2    6226
0    5937
1    5175
3    4884
Name: category, dtype: int64

In [329]:
# Integer labels ordered from most to least frequent (captured after encoding).
label_counts = df_news['category'].value_counts()
top_labels = label_counts.index.tolist()

In [330]:
# Map each integer label back to its category name.
# The fitted encoder is the authoritative source: label i corresponds to
# encoder.classes_[i]. This avoids pairing two separately sorted value_counts()
# listings, which is only correct when both come back in exactly the same order.
# `.tolist()` yields plain Python strings as the dictionary values.
label_category = dict(enumerate(encoder.classes_.tolist()))
label_category

{5: 'TRAVEL',
 4: 'STYLE & BEAUTY',
 2: 'FOOD & DRINK',
 0: 'BUSINESS',
 1: 'COMEDY',
 3: 'SPORTS'}

In [331]:
# Normalise the headlines: lower-case them and delete punctuation.
# The translation table is built once instead of once per row, and the
# vectorised `.str` accessor replaces the per-row lambda.
# NOTE: string.punctuation only contains ASCII punctuation, so typographic
# characters such as ’ are left in the text. Missing headlines stay missing
# instead of raising here.
punct_table = str.maketrans('', '', string.punctuation)
df_news['headline'] = (
    df_news['headline']
    .str.lower()
    .str.translate(punct_table)
)

df_news.head()

Unnamed: 0,category,headline
0,1,trumps new magathemed swimwear sinks on twitter
1,1,seth meyers has 1 funny regret after trump can...
2,1,colbert wants to turn nyc subway rides into a ...
3,1,jimmy kimmel knows why irans supreme leader wa...
4,1,late night writers breathless royal wedding re...


In [332]:
# Features are the cleaned headline texts; targets are the category labels.
X = df_news['headline']
y = df_news['category']

In [333]:
# Hold out 10% of the headlines for testing. Stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=1,
)

print("Training dataset: ", len(X_train))
print("Test dataset: ", len(X_test))

Training dataset:  37582
Test dataset:  4176


In [334]:
# Turn headlines into word-count vectors, ignoring English stop words.
# The vocabulary is learned from the training headlines only and then reused,
# unchanged, to encode the test headlines.
count_vector = CountVectorizer(stop_words='english')
training_data = count_vector.fit_transform(X_train)  # learn vocabulary + encode
testing_data = count_vector.transform(X_test)        # encode with the learned vocabulary

In [368]:
# Train a Complement Naive Bayes classifier on the word counts.
# alpha is the additive smoothing parameter; 0.78 is presumably a hand-tuned
# value -- TODO confirm how it was chosen.
naive_bayes = ComplementNB(alpha=0.78)
naive_bayes.fit(training_data, y_train)

ComplementNB(alpha=0.78)

In [369]:
# Predict a category label for every headline in the held-out test matrix.
predictions = naive_bayes.predict(testing_data)
predictions

array([5, 4, 4, ..., 4, 2, 5])

In [370]:
# Per-class precision, recall and F1 on the held-out set, shown with 4 decimals.
report = classification_report(y_test, predictions, digits=4)
print(report)

              precision    recall  f1-score   support

           0     0.8564    0.7828    0.8179       594
           1     0.8239    0.7331    0.7758       517
           2     0.8544    0.8668    0.8606       623
           3     0.8124    0.9139    0.8602       488
           4     0.8878    0.9264    0.9067       965
           5     0.8751    0.8716    0.8734       989

    accuracy                         0.8587      4176
   macro avg     0.8517    0.8491    0.8491      4176
weighted avg     0.8586    0.8587    0.8577      4176



In [371]:
# Check in practice whether the model can predict the category of headlines it
# has never seen before. Each entry pairs a headline with its expected integer
# label (0=Business, 1=Comedy, 2=Food & Drink, 3=Sports, 4=Style & Beauty, 5=Travel).
labelled_headlines = [
    ('How To Solve Every Type Of Body Odor: What Works And What Doesn’t', 4),      # Style & Beauty
    ('How You Eat A Hot Dog Can Say A Lot About Where You’re From', 2),            # Food & Drink
    ('Want Clear Skin This Summer? Make These Updates To Your Routine.', 4),       # Style & Beauty
    ('Inflation Might Be Easing, But Gear Up For What’s To Come', 0),              # Business
    ('How To Calm Anxiety During Turbulence, According To Flight Attendants', 5),  # Travel
    ('Lewandowski scores twice to take Barca top', 3),                             # Sports
    ('Laughter in the dark: Behind the scenes on the UK stand-up circuit', 1),     # Comedy
]
headline_test = [headline for headline, _ in labelled_headlines]
category_test = [label for _, label in labelled_headlines]

In [372]:
# Apply the same cleaning used for the training headlines (lower-case, ASCII
# punctuation removed), encode with the already-fitted vectorizer, and predict.
# Typographic apostrophes (’) are not part of string.punctuation and are kept.
strip_punct = str.maketrans('', '', string.punctuation)
headline_test = [text.lower().translate(strip_punct) for text in headline_test]
test_headline = count_vector.transform(headline_test)
test_pred = naive_bayes.predict(test_headline)

In [377]:
# Print the expected and predicted category names side by side.
# The first column is padded to a width computed from the longest name, so the
# columns line up for any category. The previous tab-count heuristic misaligned
# 7-character names and depended on the viewer's tab width.
header = 'True Category'
col_width = max([len(header)] + [len(name) for name in label_category.values()]) + 2
print(header.ljust(col_width) + 'Predicted Category')
for tr, pr in zip(category_test, test_pred):
    print(label_category[tr].ljust(col_width) + label_category[pr])


True Category		Predicted Category
STYLE & BEAUTY		SPORTS
FOOD & DRINK		FOOD & DRINK
STYLE & BEAUTY		STYLE & BEAUTY
BUSINESS		BUSINESS
TRAVEL			TRAVEL
SPORTS			SPORTS
COMEDY			COMEDY
