In [27]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a unigram + bigram vectorizer on a single sentence (fit() returns the
# fitted estimator itself) and display the learned term -> column-index map.
v = CountVectorizer(ngram_range=(1, 2)).fit(
    ["Thor Hathodawala is looking for a job"]
)
v.vocabulary_

{'thor': 9,
 'hathodawala': 2,
 'is': 4,
 'looking': 7,
 'for': 0,
 'job': 6,
 'thor hathodawala': 10,
 'hathodawala is': 3,
 'is looking': 5,
 'looking for': 8,
 'for job': 1}

In [21]:
# Three toy sentences used to demonstrate the text pre-processing step.
corpus = ["Thor ate pizza", "Loki is tall", "Loki is eating pizza"]

In [8]:
import spacy

# Load the "en_core_web_sm" spaCy pipeline once; preprocess() below reuses it
# for every text it is given.
nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    """Return ``text`` reduced to its space-joined content-word lemmas.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are dropped; every
    remaining token contributes its lemma.

    Args:
        text: Raw string to clean.

    Returns:
        A single string with the kept lemmas separated by one space; an
        empty string when no token survives the filter.
    """
    doc = nlp(text)
    # Whitespace tokens (newlines, runs of spaces) are neither stop words nor
    # punctuation, so they have to be excluded explicitly or they leak into
    # the joined output.
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(kept_lemmas)

In [16]:
# Sanity-check preprocess() on a single sentence.
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [18]:
# Run every sentence of the toy corpus through preprocess() and show the result.
corpus_processed = list(map(preprocess, corpus))
corpus_processed

['thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [29]:
# Encode a new sentence with the already-fitted vectorizer `v`.
# NOTE(review): `v` appears to be the vectorizer fitted on the single
# "Thor Hathodawala ..." sentence, not on corpus_processed - confirm that this
# is intended.
v.transform(["Thor eat pizza"]).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]])

In [30]:
# Same check with a sentence that shares no term with the sentence `v` was
# fitted on (assuming `v` has not been refitted since).
v.transform(["Hulk eat pizza"]).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [34]:
import pandas as pd
import numpy as np

In [37]:
# Load the news dataset; lines=True reads the file as one JSON object per line.
df = pd.read_json("News_Category_Dataset_v3.json", lines=True)

In [38]:
# (rows, columns) of the loaded frame.
df.shape

(209527, 6)

In [39]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22


In [41]:
# Peek at the first three descriptions - the text used as model input later on.
df["short_description"].head(3)


0    Health experts said it is too early to predict...
1    He was subdued by passengers and crew when he ...
2    "Until you have a dog you don't understand wha...
Name: short_description, dtype: object

In [43]:
# Number of articles per category, most frequent first.
df["category"].value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

In [52]:
# Draw the same number of rows from each of the four categories of interest;
# the fixed seed makes each draw repeatable.
min_samples = 2000

df_business = df.loc[df['category'].eq('BUSINESS')].sample(n=min_samples, random_state=42)
df_crime = df.loc[df['category'].eq('CRIME')].sample(n=min_samples, random_state=42)
df_sports = df.loc[df['category'].eq('SPORTS')].sample(n=min_samples, random_state=42)
df_science = df.loc[df['category'].eq('SCIENCE')].sample(n=min_samples, random_state=42)

In [54]:
# Stack the four per-category samples row-wise, then confirm the class counts.
df_balanced = pd.concat(
    [df_business, df_crime, df_sports, df_science],
    axis='index',
)
df_balanced['category'].value_counts()

category
BUSINESS    2000
CRIME       2000
SPORTS      2000
SCIENCE     2000
Name: count, dtype: int64

In [55]:
# Encode each category name as an integer class label for the classifier.
category_to_num = {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}
df_balanced['category_num'] = df_balanced['category'].map(category_to_num)

In [57]:
# Spot-check two rows, selected by position, including the new label column.
df_balanced.iloc[2003:2005]

Unnamed: 0,link,headline,category,short_description,authors,date,category_num
72193,https://www.huffingtonpost.com/entry/waldo-sou...,'Very Disorderly' Waldo Makes It Easy For Poli...,CRIME,He should have stuck to blending in.,Lee Moran,2016-03-21,2
123271,https://www.huffingtonpost.com/entry/grand-cen...,3 Seriously Injured In Grand Central Station S...,CRIME,,Ryan Grenoble,2014-08-16,2


In [58]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced rows for evaluation; the fixed seed keeps the
# split reproducible. Features are the raw short descriptions.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['short_description'],
    df_balanced['category_num'],
    test_size=0.2,
    random_state=42,
)

In [61]:
# Sizes of the training and test feature sets.
X_train.shape, X_test.shape

((6400,), (1600,))

In [65]:
# Attempt 1: unigrams only, i.e. a plain Bag Of Words (BOW) model
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer()),
    ('MultiNB', MultinomialNB()),
])

# fit() returns the fitted pipeline, so training and prediction are chained.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.78      0.68       425
           1       0.82      0.52      0.64       397
           2       0.54      0.78      0.64       379
           3       0.77      0.51      0.61       399

    accuracy                           0.65      1600
   macro avg       0.68      0.65      0.64      1600
weighted avg       0.69      0.65      0.64      1600



In [68]:
# Attempt 2: unigrams and bigrams together
clf = Pipeline(steps=[
    ('vectorizer_1_2_gram', CountVectorizer(ngram_range=(1, 2))),
    ('MultiNB', MultinomialNB()),
])

# Train on the training split, then score the held-out split.
y_pred = clf.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.80      0.66       425
           1       0.84      0.46      0.60       397
           2       0.55      0.77      0.64       379
           3       0.77      0.46      0.58       399

    accuracy                           0.62      1600
   macro avg       0.68      0.62      0.62      1600
weighted avg       0.68      0.62      0.62      1600



In [71]:
# Clean every short description with preprocess() (stop-word and punctuation
# removal plus lemmatization) and keep the result in a new column.

df_balanced['preprocessed_text'] = df_balanced['short_description'].map(preprocess)

In [72]:
# Preview the frame, now including the preprocessed_text column.
df_balanced.head()


Unnamed: 0,link,headline,category,short_description,authors,date,category_num,preprocessed_text
101848,https://www.huffingtonpost.com/entry/how-to-ma...,How to Manage Your Personal Brand,BUSINESS,Make no mistake: If you have a Facebook accoun...,"Kevin O'Leary, Contributor",2015-04-20,0,mistake Facebook account Instagram page Twitte...
93655,https://www.huffingtonpost.com/entry/uber-ad-n...,It Looks Like Uber's Winning Its War With New ...,BUSINESS,Grab the popcorn.,Alexander C. Kaufman,2015-07-22,0,grab popcorn
103195,https://www.huffingtonpost.com/entry/the-progr...,The Progressive Promise of Today's Technology,BUSINESS,"A digital policy for the new century, tailored...","Andrei Cherny, ContributorCEO, Aspiration.com",2015-04-04,0,digital policy new century tailor moment futur...
103321,https://www.huffingtonpost.com/entry/dont-let-...,Don't Let These 5 Confusing Words Mar Your Image,BUSINESS,"Tom's an articulate physician, totally able to...","Dianna Booher, ContributorCEO, BooherResearch....",2015-04-02,0,Tom articulate physician totally able speak mi...
96214,https://www.huffingtonpost.com/entry/what-you-...,What You Don't Know About Overnight Success,BUSINESS,"I've been fighting this thing for 32 years. ""O...","Grant Cardone, ContributorNew York Times bests...",2015-06-23,0,fight thing 32 year overnight success happen o...


In [73]:
# Redo the 80/20 split with the same seed, this time on the pre-processed text.
X_train, X_test, y_train, y_test = train_test_split(
    df_balanced['preprocessed_text'],
    df_balanced['category_num'],
    test_size=0.2,
    random_state=42,
)

In [74]:
# Step 1: build the pipeline - unigram + bigram counts (via ngram_range)
# feeding a multinomial Naive Bayes classifier
clf = Pipeline(steps=[
    ('vectorizer_bow', CountVectorizer(ngram_range=(1, 2))),
    ('Multi NB', MultinomialNB()),
])

# Steps 2-3: fit on X_train / y_train, then store the predictions for X_test
# in y_pred (fit() returns the fitted pipeline, so the calls are chained)
y_pred = clf.fit(X_train, y_train).predict(X_test)

# Step 4: print the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.78      0.70       425
           1       0.84      0.52      0.64       397
           2       0.53      0.81      0.64       379
           3       0.79      0.50      0.61       399

    accuracy                           0.65      1600
   macro avg       0.70      0.65      0.65      1600
weighted avg       0.70      0.65      0.65      1600

