In [4]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer(ngram_range = (1,3))
v.fit(["Thor Hathodawala is looking for a job"])
v.vocabulary_

{'thor': 12,
 'hathodawala': 2,
 'is': 5,
 'looking': 9,
 'for': 0,
 'job': 8,
 'thor hathodawala': 13,
 'hathodawala is': 3,
 'is looking': 6,
 'looking for': 10,
 'for job': 1,
 'thor hathodawala is': 14,
 'hathodawala is looking': 4,
 'is looking for': 7,
 'looking for job': 11}

In [5]:
corpus = [
    "Thor ate pizza",
    "Loki is tall",
    "Loki is eating pizza"
]

In [6]:
import spacy 

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    
    filtered_tokens = []
    
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return ' '.join(filtered_tokens)

2022-12-22 11:11:43.737444: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-22 11:11:43.940424: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-22 11:11:43.940444: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-22 11:11:44.797390: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [7]:
preprocess("Loki is eating pizza")

'Loki eat pizza'

In [8]:
corpus_processed = [preprocess(text) for text in corpus]
corpus_processed

['Thor eat pizza', 'Loki tall', 'Loki eat pizza']

In [10]:
v = CountVectorizer(ngram_range = (1,2))
v.fit(corpus_processed)
v.vocabulary_

{'thor': 7,
 'eat': 0,
 'pizza': 5,
 'thor eat': 8,
 'eat pizza': 1,
 'loki': 2,
 'tall': 6,
 'loki tall': 4,
 'loki eat': 3}

In [12]:
v.transform(["Thor eat pizza"]).toarray()

array([[1, 1, 0, 0, 0, 1, 0, 1, 1]])

In [14]:
import pandas as pd

df = pd.read_json("news_dataset.json")

print(df.shape)

df.head()

(12695, 2)


Unnamed: 0,text,category
0,Watching Schrödinger's Cat Die University of C...,SCIENCE
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS
3,These Roads Could Recharge Your Electric Car A...,BUSINESS
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME


In [17]:
df.category.value_counts()

BUSINESS    4254
SPORTS      4167
CRIME       2893
SCIENCE     1381
Name: category, dtype: int64

In [19]:
min_samples = 1381

df_business = df[df.category == "BUSINESS"].sample(min_samples, random_state = 1000)
df_business
df_sports = df[df.category == "SPORTS"].sample(min_samples, random_state = 1000)
df_crime = df[df.category == "CRIME"].sample(min_samples, random_state = 1000)
df_science = df[df.category == "SCIENCE"].sample(min_samples, random_state = 1000)

In [21]:
df_balanced = pd.concat([df_business, df_sports, df_crime, df_science], axis = 0)
df_balanced.category.value_counts()

BUSINESS    1381
SPORTS      1381
CRIME       1381
SCIENCE     1381
Name: category, dtype: int64

In [22]:
target = {'BUSINESS': 0, 'SPORTS': 1, 'CRIME': 2, 'SCIENCE': 3}

df_balanced['category_number'] = df_balanced.category.map(target)

In [44]:
df['category_number'] = df.category.map(target)

In [23]:
df_balanced.head()

Unnamed: 0,text,category,category_number
5318,The Job Market Is Still Years Away From A Full...,BUSINESS,0
6286,Establishing a Solid Legal Foundation for Your...,BUSINESS,0
3320,"Gender Diversity on Boards: Good, Bad or Indif...",BUSINESS,0
2844,Volunteering Surprisingly Makes You Feel Like ...,BUSINESS,0
9903,How Managers Can Hire Employees More Effectively,BUSINESS,0


In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_balanced.text, 
                     df_balanced.category_number, 
                     test_size =0.2, 
                     random_state = 1000, 
                     stratify = df_balanced.category_number)

In [35]:
y_train.head()

8669     3
10962    1
9543     0
6728     0
588      1
Name: category_number, dtype: int64

In [37]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range= (1,1))),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.89      0.76      0.82       324
           1       0.81      0.87      0.84       257
           2       0.90      0.85      0.87       292
           3       0.77      0.91      0.83       232

    accuracy                           0.84      1105
   macro avg       0.84      0.85      0.84      1105
weighted avg       0.85      0.84      0.84      1105



In [38]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range= (1,2))),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.92      0.70      0.80       365
           1       0.79      0.89      0.84       243
           2       0.89      0.86      0.88       285
           3       0.71      0.93      0.81       212

    accuracy                           0.83      1105
   macro avg       0.83      0.85      0.83      1105
weighted avg       0.85      0.83      0.83      1105



In [39]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range= (1,3))),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.93      0.69      0.79       374
           1       0.77      0.89      0.83       237
           2       0.89      0.87      0.88       284
           3       0.71      0.94      0.81       210

    accuracy                           0.83      1105
   macro avg       0.83      0.85      0.83      1105
weighted avg       0.84      0.83      0.82      1105



In [40]:
df_balanced['preprocessed_txt'] = df_balanced.text.apply(preprocess)

In [41]:
df_balanced.head()

Unnamed: 0,text,category,category_number,preprocessed_txt
5318,The Job Market Is Still Years Away From A Full...,BUSINESS,0,Job market year away Recovery
6286,Establishing a Solid Legal Foundation for Your...,BUSINESS,0,establish Solid Legal Foundation business trad...
3320,"Gender Diversity on Boards: Good, Bad or Indif...",BUSINESS,0,Gender Diversity board good bad Indifferent
2844,Volunteering Surprisingly Makes You Feel Like ...,BUSINESS,0,volunteer surprisingly make feel like free time
9903,How Managers Can Hire Employees More Effectively,BUSINESS,0,manager hire employee effectively


In [42]:
X_train, X_test, y_train, y_test = train_test_split(
                    df_balanced.preprocessed_txt, 
                     df_balanced.category_number, 
                     test_size =0.2, 
                     random_state = 1000, 
                     stratify = df_balanced.category_number)

In [43]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range= (1,2))),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.91      0.81      0.86       307
           1       0.82      0.89      0.85       254
           2       0.92      0.84      0.88       302
           3       0.80      0.92      0.86       242

    accuracy                           0.86      1105
   macro avg       0.86      0.87      0.86      1105
weighted avg       0.87      0.86      0.86      1105



In [47]:
df.head()

Unnamed: 0,text,category,category_number
0,Watching Schrödinger's Cat Die University of C...,SCIENCE,3
1,WATCH: Freaky Vortex Opens Up In Flooded Lake,SCIENCE,3
2,Entrepreneurs Today Don't Need a Big Budget to...,BUSINESS,0
3,These Roads Could Recharge Your Electric Car A...,BUSINESS,0
4,Civilian 'Guard' Fires Gun While 'Protecting' ...,CRIME,2


In [51]:
df['preprocessed_txt'] = df.text.apply(preprocess)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
                    df.text, 
                     df.category_number, 
                     test_size =0.2, 
                     stratify = df.category_number)

In [49]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range= (1,2))),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.96      0.75      0.84      1082
           1       0.89      0.91      0.90       814
           2       0.85      0.91      0.88       541
           3       0.36      0.98      0.53       102

    accuracy                           0.84      2539
   macro avg       0.76      0.89      0.79      2539
weighted avg       0.89      0.84      0.86      2539



In [50]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range= (1,1))),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.94      0.84      0.88       951
           1       0.91      0.91      0.91       833
           2       0.90      0.89      0.90       587
           3       0.59      0.98      0.74       168

    accuracy                           0.88      2539
   macro avg       0.84      0.90      0.86      2539
weighted avg       0.90      0.88      0.89      2539



In [52]:
X_train, X_test, y_train, y_test = train_test_split(
                    df.preprocessed_txt, 
                     df.category_number, 
                     test_size =0.2, 
                     stratify = df.category_number)

In [53]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range= (1,2))),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.94      0.80      0.87      1003
           1       0.92      0.90      0.91       856
           2       0.88      0.90      0.89       562
           3       0.43      1.00      0.60       118

    accuracy                           0.87      2539
   macro avg       0.79      0.90      0.82      2539
weighted avg       0.90      0.87      0.87      2539



In [54]:
clf = Pipeline([
    ('vectorizer_bow', CountVectorizer(ngram_range= (1,1))),
    ('Multi NB', MultinomialNB())
])

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89       912
           1       0.91      0.92      0.92       830
           2       0.90      0.86      0.88       607
           3       0.63      0.92      0.75       190

    accuracy                           0.88      2539
   macro avg       0.84      0.89      0.86      2539
weighted avg       0.89      0.88      0.88      2539

