In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
v=CountVectorizer(ngram_range=(1,3))
v.fit(["Thor Hathodwala is looking for a job"])
v.vocabulary_

{'thor': 12,
 'hathodwala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodwala': 13,
 'hathodwala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodwala is': 14,
 'hathodwala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [4]:
corpus={
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
}

In [5]:
import spacy
nlp=spacy.load("en_core_web_sm")
def preprocess(text):
    doc=nlp(text)
    filtered_token=[]
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_token.append(token.lemma_)
    return " ".join(filtered_token)

In [6]:
corpus_processed=[preprocess(text) for text in corpus]

In [7]:
corpus_processed

['Thor eat pizza', 'Loki eat pizza', 'Loki tall']

In [8]:
v.fit(corpus_processed) 
v.vocabulary_

{'thor': 8,
 'eat': 0,
 'pizza': 6,
 'thor eat': 9,
 'eat pizza': 1,
 'thor eat pizza': 10,
 'loki': 2,
 'loki eat': 3,
 'loki eat pizza': 4,
 'tall': 7,
 'loki tall': 5}

In [12]:
df=pd.read_json("news_dataset.json")

In [13]:
df

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME
...,...,...
12690,Coach Shakes Hands Of Imaginary Players After ...,SPORTS
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE
12692,RECAP: Dramatic Eclipse Photos Don't miss the ...,SCIENCE
12693,Richard Sherman Wants To Talk About Police Sho...,SPORTS


In [16]:
df["category"].value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [20]:
df["category"].isnull().sum()

0

In [46]:
import spacy
nlp=spacy.load("en_core_web_sm")

In [47]:
min_sample=1381
df_BUSINESS=df[df.category=="BUSINESS"].sample(min_sample,random_state=2022)
df_SPORTS=df[df.category=="SPORTS"].sample(min_sample,random_state=2022)
df_CRIME=df[df.category=="CRIME"].sample(min_sample,random_state=2022)
df_SCIENCE=df[df.category=="SCIENCE"].sample(min_sample,random_state=2022)

In [48]:
FIN_DATA=pd.concat([df_BUSINESS,df_SPORTS,df_CRIME,df_SCIENCE],axis=0)

In [49]:
from collections import Counter
Counter(FIN_DATA["category"])

Counter({'BUSINESS': 1381, 'SPORTS': 1381, 'CRIME': 1381, 'SCIENCE': 1381})

In [50]:
FIN_DATA["category"].replace({
   'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3 
},  inplace=True)

In [51]:
FIN_DATA.head(5)

Unnamed: 0,text,category
11967,GCC Business Leaders Remain Confident in the F...,0
2912,From the Other Side; an Honest Review from Emp...,0
3408,"Mike McDerment, CEO of FreshBooks, Talks About...",0
502,How to Market Your Business While Traveling th...,0
5279,How to Leverage Intuition in Decision-making I...,0


In [52]:
from sklearn.model_selection import train_test_split

In [54]:
x=FIN_DATA["text"]
y=FIN_DATA["category"]
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.20, random_state=2022,stratify=FIN_DATA["category"])

In [55]:
new_xtrain=[preprocess(text) for text in xtrain] 

In [56]:
new_xtrain[:4]

['ovulate Women Prefer Images Penetration Images oral sex canadian researcher find ovulation female genitalia responsive image penetration image oral sex difference significantly reduce non fertile phase menstrual cycle',
 'scientist discover Spooky Influence Baby Choices',
 'Olympic Race Walker step propose boyfriend Rio Winners',
 'beloved Bipedal Bear Named Pedals Believed kill Hunter upright walk bear see walk New Jersey injure paw']

In [57]:
xtrain[:4]

7589     Ovulating Women Prefer Images of Penetration O...
10442    Scientists Discover Spooky Influence On Baby N...
8792     Olympic Race Walker Steps Up To Propose To His...
1733     Beloved Bipedal Bear Named Pedals Believed Kil...
Name: text, dtype: object

In [58]:
Counter(ytest)

Counter({0: 276, 3: 276, 2: 276, 1: 277})

In [62]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
clf=Pipeline([
    ('vectorizer_bow',CountVectorizer()),
    ('Multi NB',MultinomialNB())
])
clf.fit(xtrain,ytrain)
ypred=clf.predict(xtest)
print(classification_report(ypred,ytest))

              precision    recall  f1-score   support

           0       0.87      0.75      0.81       321
           1       0.80      0.93      0.86       240
           2       0.90      0.83      0.86       300
           3       0.80      0.90      0.85       244

    accuracy                           0.84      1105
   macro avg       0.84      0.85      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [63]:
FIN_DATA["new text"]=FIN_DATA["text"].apply(preprocess)

In [64]:
x=FIN_DATA["new text"]
y=FIN_DATA["category"]
nxtrain,nxtest,nytrain,nytest=train_test_split(x,y,test_size=0.20, random_state=2022,stratify=FIN_DATA["category"])

In [65]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
clf=Pipeline([
    ('vectorizer_bow',CountVectorizer()),
    ('Multi NB',MultinomialNB())
])
clf.fit(nxtrain,nytrain)
ypred=clf.predict(nxtest)
print(classification_report(ypred,nytest))

              precision    recall  f1-score   support

           0       0.88      0.80      0.84       302
           1       0.82      0.94      0.87       241
           2       0.92      0.82      0.86       309
           3       0.83      0.91      0.87       253

    accuracy                           0.86      1105
   macro avg       0.86      0.87      0.86      1105
weighted avg       0.86      0.86      0.86      1105



In [66]:
clf=Pipeline([
    ('vectorizer_bow',CountVectorizer(ngram_range=(1,3))),
    ('Multi NB',MultinomialNB())
])
clf.fit(nxtrain,nytrain)
ypred=clf.predict(nxtest)
print(classification_report(ypred,nytest))

              precision    recall  f1-score   support

           0       0.89      0.79      0.84       313
           1       0.82      0.94      0.87       242
           2       0.92      0.83      0.87       306
           3       0.81      0.92      0.86       244

    accuracy                           0.86      1105
   macro avg       0.86      0.87      0.86      1105
weighted avg       0.86      0.86      0.86      1105

