<a href="https://colab.research.google.com/github/deepak1195/NaturalLanguageProcessing/blob/main/009_BagOfN_Grams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Mount Google Drive into the Colab filesystem so the dataset under MyDrive can be read.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): `shape` is not referenced in any of the visible cells — this looks
# like an accidental auto-import; confirm nothing else needs it before removing.
from numpy.ma.core import shape
# Directory prefix for data files. The trailing slash matters: the loading cell
# appends the file name directly to this string.
path="/content/drive/MyDrive/myWork/Data/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the news dataset (JSON) from the Drive folder configured in `path`.
df=pd.read_json(f'{path}news_dataset.json')
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME
...,...,...
12690,Coach Shakes Hands Of Imaginary Players After ...,SPORTS
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE
12692,RECAP: Dramatic Eclipse Photos Don't miss the ...,SCIENCE
12693,Richard Sherman Wants To Talk About Police Sho...,SPORTS


In [4]:
# Give every distinct category label an integer code (0, 1, 2, ...) in the
# order the labels come back from `unique()`.
ctg = df['category'].unique()
categories = {label: code for code, label in enumerate(ctg)}
categories

{'SCIENCE': 0, 'BUSINESS': 1, 'CRIME': 2, 'SPORTS': 3}

In [5]:
# Add a numeric label column by looking each category name up in `categories`.
df['categories'] = df['category'].map(categories)
df

Unnamed: 0,text,category,categories
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,0
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,0
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,1
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,1
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2
...,...,...,...
12690,Coach Shakes Hands Of Imaginary Players After ...,SPORTS,3
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE,0
12692,RECAP: Dramatic Eclipse Photos Don't miss the ...,SCIENCE,0
12693,Richard Sherman Wants To Talk About Police Sho...,SPORTS,3


In [6]:
import re
import spacy
# NOTE(review): STOP_WORDS is not used in the visible cells (stop-word filtering
# in preprocessText goes through token.is_stop) — confirm before removing.
from spacy.lang.en.stop_words import STOP_WORDS
# Load the spaCy pipeline once at cell level; preprocessText reuses it on every call.
nlp=spacy.load('en_core_web_sm')
def preprocessText(txt):
  """Return ``txt`` reduced to a space-separated string of lemmas.

  Every character that is neither a word character nor whitespace is deleted
  first (so apostrophes and hyphens vanish and the surrounding letters are
  fused). The result is run through the module-level spaCy pipeline ``nlp``,
  tokens flagged as stop words are discarded, and the lemmas of the remaining
  tokens are joined with single spaces.

  Args:
    txt: Raw text to clean; passed straight to ``re.sub``, so it must be a string.

  Returns:
    The cleaned text as one string.
  """
  stripped = re.sub(r'[^\w\s]', '', txt)
  lemmas = []
  for token in nlp(stripped):
    if token.is_stop:
      continue
    lemmas.append(token.lemma_)
  joined = ' '.join(lemmas)
  # Punctuation was already stripped above, so this is presumably a no-op
  # safeguard against a stray " ," surviving in the joined lemmas.
  return joined.replace(' ,','')

In [7]:
# Run the spaCy-based cleaner over every article text and keep the result
# next to the raw text in a new `pText` column.
df['pText'] = df['text'].apply(preprocessText)
df

Unnamed: 0,text,category,categories,pText
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,0,watch Schrödingers Cat die University Californ...
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,0,WATCH Freaky vortex open Flooded Lake
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,1,entrepreneur today not need Big Budget start n...
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,1,road recharge electric Car drive hightech high...
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2,Civilian Guard Fires Gun protect Recruiting Ce...
...,...,...,...,...
12690,Coach Shakes Hands Of Imaginary Players After ...,SPORTS,3,Coach Shakes Hands Imaginary Players oppose Te...
12691,This Minivan-Sized Sea Sponge Is Thought To Be...,SCIENCE,0,MinivanSized Sea Sponge think planet large anc...
12692,RECAP: Dramatic Eclipse Photos Don't miss the ...,SCIENCE,0,recap Dramatic Eclipse Photos not miss
12693,Richard Sherman Wants To Talk About Police Sho...,SPORTS,3,Richard Sherman want talk Police Shootings Gam...


In [8]:
# Class balance: number of rows per integer category code.
df['categories'].value_counts()

1    4254
3    4167
2    2893
0    1381
Name: categories, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation. Stratifying on the label column and
# fixing the seed are passed through to train_test_split unchanged.
xTrain, xTest, yTrain, yTest = train_test_split(
    df['pText'],
    df['categories'],
    test_size=0.25,
    stratify=df['categories'],
    random_state=123,
)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Two-stage text classifier: n-gram counts feeding a multinomial Naive Bayes.
model = Pipeline(steps=[
    ('ngCV', CountVectorizer(ngram_range=(1, 2))),  # n-gram range: unigrams and bigrams
    ('NB', MultinomialNB()),
])

# Kept as the last expression so the fitted pipeline is displayed by the notebook.
model.fit(xTrain, yTrain)

In [11]:
from sklearn.metrics import classification_report
# Predict integer category codes for the held-out texts.
yPred=model.predict(xTest)
# Label the report rows with category names instead of the opaque codes 0..n-1.
# `categories` maps name -> code in insertion order, so its values and keys line
# up one-to-one; passing `labels` explicitly pins that pairing even if a class
# happens to be missing from the test split.
print(classification_report(
    yTest,
    yPred,
    labels=list(categories.values()),
    target_names=list(categories.keys()),
))

              precision    recall  f1-score   support

           0       0.99      0.44      0.61       345
           1       0.82      0.94      0.88      1064
           2       0.89      0.89      0.89       723
           3       0.90      0.92      0.91      1042

    accuracy                           0.87      3174
   macro avg       0.90      0.80      0.82      3174
weighted avg       0.88      0.87      0.86      3174



In [12]:
from sklearn.metrics import confusion_matrix
# Confusion matrix over the held-out split: rows are the true labels,
# columns the predicted labels.
cm = confusion_matrix(y_true=yTest, y_pred=yPred)
cm

array([[ 153,  130,   13,   49],
       [   1, 1002,   30,   31],
       [   1,   50,  647,   25],
       [   0,   42,   39,  961]])

In [13]:
import plotly.express as px
# Heatmap of the confusion matrix. Both axes are ticked with the category names
# rather than bare indices: `categories` was built with codes 0..n-1 in insertion
# order, which is the same order the matrix rows/columns use.
# Assumes every category appears in the test split or predictions, so that `cm`
# has one row/column per entry in `categories`.
px.imshow(
    cm,
    labels=dict(x="Predicted", y="Truth"),
    x=list(categories),
    y=list(categories),
    text_auto=True,
    title="Confusion matrix: bag of n-grams + Naive Bayes",
)