In [1]:
import pandas as pd
import numpy as np
import spacy 

In [3]:
# Load the news dataset from a JSON file in the working directory into a DataFrame.
df = pd.read_json('news_dataset.json')

In [4]:
# Preview the first five rows (columns: text, category).
df.head()

Unnamed: 0,text,category
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS
3,This Richard Sherman Interception Literally Sh...,SPORTS
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS


In [5]:
# (rows, columns) of the raw frame, before any cleaning.
df.shape

(7500, 2)

## Introductory Analysis

In [6]:
# Column dtypes, non-null counts and memory usage; used here to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7500 entries, 0 to 7499
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      7500 non-null   object
 1   category  7500 non-null   object
dtypes: object(2)
memory usage: 175.8+ KB


In [8]:
# Number of rows that exactly repeat an earlier row (all columns compared).
df.duplicated().sum()

3

In [9]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [10]:
# Dropping duplicates leaves gaps in the row labels; renumber them 0..n-1.
# drop=True discards the old labels directly instead of first materialising
# them as an 'index' column and deleting it afterwards (which would also fail
# if the frame ever gained a real column called 'index').
df = df.reset_index(drop=True)

In [11]:
# Preview the first rows again after de-duplication and the index reset.
df.head()

Unnamed: 0,text,category
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS
3,This Richard Sherman Interception Literally Sh...,SPORTS
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS


In [12]:
# Class distribution: how many rows belong to each category.
df['category'].value_counts()

category
SPORTS      2500
BUSINESS    2499
CRIME       2498
Name: count, dtype: int64

In [13]:
## No resampling needed: the three classes are almost perfectly balanced

In [16]:
# Encode each category label as an integer class id for the classifier.
category_ids = {'SPORTS': 0, 'BUSINESS': 1, 'CRIME': 2}
df['cat_num'] = df['category'].map(category_ids)

In [17]:
# Preview the rows again to see cat_num next to its category label.
df.head()

Unnamed: 0,text,category,cat_num
0,"Larry Nassar Blames His Victims, Says He 'Was ...",CRIME,2
1,"Woman Beats Cancer, Dies Falling From Horse",CRIME,2
2,Vegas Taxpayers Could Spend A Record $750 Mill...,SPORTS,0
3,This Richard Sherman Interception Literally Sh...,SPORTS,0
4,7 Things That Could Totally Kill Weed Legaliza...,BUSINESS,1


In [18]:
# Load the large English spaCy pipeline; its Doc.vector is used below as a
# fixed-length embedding of each text.
nlp = spacy.load('en_core_web_lg')

# nlp.pipe streams the texts through the pipeline in batches, which is
# considerably faster than calling nlp(text) once per row via Series.apply
# while producing the same Doc objects. The resulting list is assigned
# positionally, so row i receives the vector of text i.
df['vector'] = [doc.vector for doc in nlp.pipe(df['text'])]

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['vector'],
    df['cat_num'],
    test_size=0.2,
    random_state=2022,
)
X_train.shape, X_test.shape

((5997,), (1500,))

In [22]:
# Each element of the split Series is one 1-D embedding array (hence the
# 1-D shapes printed by the split cell). Stack them along a new leading axis
# to get the 2-D matrices that the scaler and classifier are fitted on.
X_train_2d = np.stack(X_train, axis=0)
X_test_2d = np.stack(X_test, axis=0)

In [24]:
# Inspect the stacked training matrix (NumPy abbreviates the repr of large arrays).
X_train_2d

array([[-1.9859638 ,  0.17000434,  0.8270687 , ..., -0.71753645,
        -0.47226351,  0.8782776 ],
       [-1.7251331 ,  0.45386147, -0.24800351, ..., -0.4730072 ,
        -1.3516762 ,  0.95935327],
       [-0.7969609 ,  1.2998524 , -3.1629155 , ..., -0.5513114 ,
        -2.087531  ,  0.6071922 ],
       ...,
       [-2.217028  , -2.183671  ,  0.7241125 , ..., -0.90326166,
         0.34939885, -0.54419595],
       [-2.419291  ,  0.56445277, -1.4122539 , ...,  0.5097589 ,
        -1.1084539 ,  0.65251994],
       [-1.3108674 , -0.01383762,  0.18667895, ..., -2.320133  ,
        -2.3822267 ,  0.35827178]], dtype=float32)

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report


# The embedding components can be negative, while MultinomialNB expects
# non-negative features, so rescale every feature to the [0, 1] range.
# The scaler learns its min/max from the training split only and the same
# transformation is then applied to both splits, keeping test data out of
# the fitting step.
scaler = MinMaxScaler()
scaler.fit(X_train_2d)
scaled_X_train = scaler.transform(X_train_2d)
scaled_X_test = scaler.transform(X_test_2d)

# NOTE(review): test values outside the training min/max end up outside
# [0, 1] after transform; consider MinMaxScaler(clip=True) if that matters.
model = MultinomialNB()
model.fit(scaled_X_train, y_train)

In [29]:
# Score the classifier on the held-out split. classification_report returns
# a pre-formatted multi-line string, so it is printed rather than displayed.
y_pred = model.predict(scaled_X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.72      0.63      0.67       506
           1       0.68      0.72      0.70       519
           2       0.68      0.72      0.70       475

    accuracy                           0.69      1500
   macro avg       0.69      0.69      0.69      1500
weighted avg       0.69      0.69      0.69      1500

