In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.neural_network import MLPClassifier

In [2]:
# Read the dataset as line-delimited JSON (one JSON record per line).
df = pd.read_json(
    './News_Category_Dataset_v3.json',
    lines=True,
    encoding='utf-8',
)

# Count missing values per column; the bare expression is the cell's output.
missing_per_column = df.isnull().sum()
missing_per_column

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [4]:
# Group rows by their category label, then report how many distinct labels
# exist and how many rows fall under each one.
categories = df.groupby('category')
rows_per_category = categories.size()

print(f'Total Categories: {categories.ngroups}')
print(rows_per_category)

Total Categories: 42
category
ARTS               1509
ARTS & CULTURE     1339
BLACK VOICES       4583
BUSINESS           5992
COLLEGE            1144
COMEDY             5400
CRIME              3562
CULTURE & ARTS     1074
DIVORCE            3426
EDUCATION          1014
ENTERTAINMENT     17362
ENVIRONMENT        1444
FIFTY              1401
FOOD & DRINK       6340
GOOD NEWS          1398
GREEN              2622
HEALTHY LIVING     6694
HOME & LIVING      4320
IMPACT             3484
LATINO VOICES      1130
MEDIA              2944
MONEY              1756
PARENTING          8791
PARENTS            3955
POLITICS          35602
QUEER VOICES       6347
RELIGION           2577
SCIENCE            2206
SPORTS             5077
STYLE              2254
STYLE & BEAUTY     9814
TASTE              2096
TECH               2104
THE WORLDPOST      3664
TRAVEL             9900
U.S. NEWS          1377
WEDDINGS           3653
WEIRD NEWS         2777
WELLNESS          17945
WOMEN              3572
WORLD NEWS

In [5]:
# Merge near-duplicate category labels into one canonical label each, with
# the aim of improving classifier accuracy. Keys are the labels to fold away;
# values are the labels that remain. Labels not listed are left unchanged.
category_aliases = {
    "THE WORLDPOST": "WORLDPOST",
    "ARTS": "ARTS & CULTURE",
    "CULTURE & ARTS": "ARTS & CULTURE",
    "STYLE": "STYLE & BEAUTY",
}
df['category'] = df['category'].replace(category_aliases)

In [6]:
# Combine each row's headline and short description (separated by a single
# space) into one 'text' column, in an attempt to improve accuracy.
df['text'] = df['headline'].str.cat(df['short_description'], sep=" ")

In [7]:
# Model input is the combined text column; the target is the category label.
X, y = df['text'], df['category']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [8]:
# Build several candidate text classifiers so they can be compared and the
# best-performing one chosen. Every candidate shares the same front end: a
# TF-IDF vectorizer with English stop words removed.


def _tfidf_pipeline(estimator):
    """Return a Pipeline of a fresh TF-IDF vectorizer followed by `estimator`.

    The steps are named 'tfidf' and 'clf'. A new vectorizer is created on
    every call, so no two pipelines share a fitted vocabulary.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', estimator),
    ])


# Naïve Bayes
naive_bayes_classifier = _tfidf_pipeline(MultinomialNB())

# Linear SVC (the variable name is misspelled "liner"; kept as-is because
# other cells refer to it by this name)
liner_svc_classifier = _tfidf_pipeline(LinearSVC())

# Multi-layer perceptron
neural_net_classifier = _tfidf_pipeline(
    MLPClassifier(alpha=1, max_iter=100, random_state=42)
)

# Decision tree, limited to depth 5
decision_tree_classifier = _tfidf_pipeline(
    DecisionTreeClassifier(max_depth=5, random_state=42)
)

# Random forest, trees limited to depth 5
# NOTE(review): max_features=1 means each split considers a single feature;
# confirm that is intended for a TF-IDF vocabulary.
random_forest_classifier = _tfidf_pipeline(
    RandomForestClassifier(max_depth=5, random_state=42, max_features=1)
)

In [9]:
# Fit every candidate pipeline on the training split, in the same order as
# they were defined.
for text_classifier in (
    naive_bayes_classifier,
    liner_svc_classifier,
    neural_net_classifier,
    decision_tree_classifier,
    random_forest_classifier,
):
    text_classifier.fit(X_train, y_train)

# fit() returns the pipeline itself, so ending on the last fitted pipeline
# keeps the cell's displayed value the same as a trailing fit() call.
random_forest_classifier

In [None]:
# Evaluate the Naive Bayes pipeline on the held-out split and print its
# accuracy.
predictions = naive_bayes_classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

[[ 21   0   0 ...   0   0   1]
 [  0   2   0 ...   0   0   0]
 [  0   0  30 ...   0   0   1]
 ...
 [  0   0   0 ...  29   0   0]
 [  0   0   0 ...   0   1  35]
 [  0   0   0 ...   0   0 157]]
0.41370183963901425


In [None]:
# Evaluate the Linear SVC pipeline on the held-out split and print its
# accuracy.
predictions = liner_svc_classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

[[ 607   19   13 ...   22    5   16]
 [  18  598   14 ...   12    3    6]
 [  16    9  902 ...   28    5   36]
 ...
 [  12   20   25 ...  341    1    6]
 [   4    3   13 ...    9  342  306]
 [  16   11   39 ...   10  194 1124]]
0.6110436191137336


In [None]:
# Evaluate the multi-layer perceptron pipeline on the held-out split and
# print its accuracy.
predictions = neural_net_classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

In [None]:
# Evaluate the decision tree pipeline on the held-out split and print its
# accuracy.
predictions = decision_tree_classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)

In [None]:
# Evaluate the random forest pipeline on the held-out split and print its
# accuracy.
predictions = random_forest_classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print(accuracy)