In [None]:
#handling imbalanced dataset

import spacy
import pandas as pd


In [None]:
#dataset link https://github.com/codebasics/nlp-tutorials/blob/main/11_bag_of_n_grams/news_dataset.json


# Read the news dataset from a JSON file in the current working directory,
# then show its dimensions and its first rows.
df = pd.read_json("news_dataset.json")
print(df.shape)
df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [5]:
# Number of rows per category in the full dataset (shows how imbalanced the classes are).
df.category.value_counts()

category
BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: count, dtype: int64

In [6]:
# Number of rows drawn from each category when building the balanced subset.
# The draws below sample without replacement, so this value must not exceed
# the row count of the smallest category.
min_sample = 100

In [8]:
# Undersample every category to the same size (min_sample rows each),
# using a fixed seed so the draw is reproducible.
def _draw_category_rows(label):
    """Return min_sample randomly chosen rows of df whose category equals label."""
    rows_for_label = df.loc[df.category == label]
    return rows_for_label.sample(n=min_sample, random_state=111)


min_business = _draw_category_rows("BUSINESS")
min_SPORTS = _draw_category_rows("SPORTS")
min_CRIME = _draw_category_rows("CRIME")
min_SCIENCE = _draw_category_rows("SCIENCE")

In [14]:
# Stack the four equally sized per-category samples into one balanced frame,
# then display the per-category row counts to confirm they are equal.
df_min_balanced = pd.concat(
    objs=[min_business, min_SPORTS, min_CRIME, min_SCIENCE],
    axis="index",
)
df_min_balanced["category"].value_counts()

category
BUSINESS    100
SPORTS      100
CRIME       100
SCIENCE     100
Name: count, dtype: int64

In [None]:
# Encode the category labels as integers for model training.
# Series.map leaves NaN for any label that is missing from the mapping, which
# would silently corrupt the target column, so verify that every row received
# a code before the column is used for splitting and training.
category_to_num = {'BUSINESS': 0, 'CRIME': 1, 'SCIENCE': 2, 'SPORTS': 3}
df_min_balanced['category_num'] = df_min_balanced.category.map(category_to_num)
assert df_min_balanced['category_num'].notna().all(), "unmapped category label found"
df_min_balanced.head()

Unnamed: 0,text,category,category_num
289,"Toys R Us May Shut Down All U.S. Operations, I...",BUSINESS,0
10628,The Winners And Losers Of Plummeting Oil Prices,BUSINESS,0
7032,Obstacles to Creative Disruption,BUSINESS,0
7694,"I'll Have My Beer For-Profit, Please I was in ...",BUSINESS,0
6755,If Your Office Put A Calorie Counter Next To T...,BUSINESS,0


In [None]:
from sklearn.model_selection import train_test_split


In [17]:
# Hold out 20% of the balanced data for testing, using the raw text as input.
# Stratifying on category_num keeps the class proportions of df_min_balanced
# identical in the train and test parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_min_balanced["text"],
    df_min_balanced["category_num"],
    test_size=0.2,
    random_state=111,
    stratify=df_min_balanced["category_num"],
)


In [18]:
# Sanity check: sizes of the train/test inputs and labels.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(320,) (80,) (320,) (80,)


In [19]:
# Per-class counts in the training labels; equal counts confirm the stratified split.
y_train.value_counts()

category_num
0    80
1    80
3    80
2    80
Name: count, dtype: int64

In [23]:
#creating a pipeline with countvectorizer and multinomialNB

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report




In [22]:
# Bag-of-n-grams features (unigrams and bigrams) fed into a multinomial
# Naive Bayes classifier.
classifier = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('nb', MultinomialNB()),
    ]
)

In [24]:
classifier.fit(X_train, y_train)  # fit the vectorizer and the classifier on the raw-text training split


0,1,2
,steps,"[('vectorizer', ...), ('nb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [None]:
y_predct = classifier.predict(X_test)  # predicted category_num for each held-out raw text



In [26]:
# Per-class precision/recall/F1 for the model trained on raw text.
# Label legend: 0 = BUSINESS, 1 = CRIME, 2 = SCIENCE, 3 = SPORTS.
print(classification_report(y_test, y_predct))

              precision    recall  f1-score   support

           0       0.46      0.85      0.60        20
           1       0.73      0.55      0.63        20
           2       0.75      0.45      0.56        20
           3       0.94      0.75      0.83        20

    accuracy                           0.65        80
   macro avg       0.72      0.65      0.66        80
weighted avg       0.72      0.65      0.66        80



In [None]:
"""In the code above we did not use any preprocessing techniques,
so below we will apply preprocessing and compare the results.
    
    """

In [28]:
# Load the spaCy English pipeline once; preprocess_text below uses it to
# tokenize text and to read each token's stop-word/punctuation flags and lemma.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Return `text` reduced to its content-word lemmas, joined by single spaces.

    The text is run through the module-level spaCy pipeline `nlp`. Stop words,
    punctuation and whitespace-only tokens are dropped; the lemma of every
    remaining token is kept in its original order.

    Args:
        text: The raw string to clean.

    Returns:
        A single string with the kept lemmas separated by one space
        (an empty string when no token survives the filter).
    """
    doc = nlp(text)
    # spaCy emits runs of extra whitespace (double spaces, newlines) as their
    # own tokens. They are neither stop words nor punctuation, so they have to
    # be filtered explicitly or they leak into the joined output.
    return " ".join(
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    )

In [29]:
# Store a cleaned version of every text in a new column; preprocess_text is
# called once per row.
df_min_balanced["preprocessed_text"] = df_min_balanced["text"].apply(preprocess_text)

In [30]:
# Preview the frame to compare the original text with the new preprocessed_text column.
df_min_balanced.head()

Unnamed: 0,text,category,category_num,preprocessed_text
289,"Toys R Us May Shut Down All U.S. Operations, I...",BUSINESS,0,toy r shut U.S. Operations impact thousand Wor...
10628,The Winners And Losers Of Plummeting Oil Prices,BUSINESS,0,Winners Losers plummet Oil price
7032,Obstacles to Creative Disruption,BUSINESS,0,obstacle Creative Disruption
7694,"I'll Have My Beer For-Profit, Please I was in ...",BUSINESS,0,beer Profit northeast Portland weekend decide ...
6755,If Your Office Put A Calorie Counter Next To T...,BUSINESS,0,office calorie Counter staircase elevator Alab...


In [31]:
# Repeat the 80/20 stratified split with the same seed, this time using the
# preprocessed text as input. Stratifying on category_num keeps the class
# proportions of df_min_balanced identical in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    df_min_balanced["preprocessed_text"],
    df_min_balanced["category_num"],
    test_size=0.2,
    random_state=111,
    stratify=df_min_balanced["category_num"],
)

In [32]:
# Fit the same pipeline object again, now on the preprocessed training text.
# NOTE(review): this reuses `classifier`, so the earlier raw-text fit is not kept separately.
classifier.fit(X_train, y_train)


0,1,2
,steps,"[('vectorizer', ...), ('nb', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [33]:
# Predicted category_num for each held-out preprocessed text.
y_predict_new = classifier.predict(X_test)

In [34]:
# Per-class precision/recall/F1 for the model trained on preprocessed text.
# Label legend: 0 = BUSINESS, 1 = CRIME, 2 = SCIENCE, 3 = SPORTS.
print(classification_report(y_test, y_predict_new))

              precision    recall  f1-score   support

           0       0.60      0.90      0.72        20
           1       0.70      0.80      0.74        20
           2       0.82      0.45      0.58        20
           3       0.94      0.75      0.83        20

    accuracy                           0.72        80
   macro avg       0.76      0.73      0.72        80
weighted avg       0.76      0.72      0.72        80

