In [3]:
!pip install gensim nltk scikit-learn pandas joblib



In [4]:
import kagglehub
rmisra_news_category_dataset_path = kagglehub.dataset_download('rmisra/news-category-dataset')

print('Data source import complete.')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


Using Colab cache for faster access to the 'news-category-dataset' dataset.
Data source import complete.
/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json


In [5]:
import os
os.getcwd()

'/content'

In [6]:
train_data = pd.read_json('../kaggle/input/news-category-dataset/News_Category_Dataset_v3.json', lines=True)
train_data.shape

(209527, 6)

# **Observation:**
- There are 6 features in the given data.
- The only useful ones in this scenario are `headline` and the `short_description`.
- Dropping the rest of them would help reduce noise.
- Next step should be to find any missing values.

In [7]:
train_data = train_data[['headline', 'short_description']]

train_data.head()

Unnamed: 0,headline,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,Amy Cooper accused investment firm Franklin Te...


In [8]:
for feature in train_data.columns:
    print(f'{feature}: {train_data[feature].isnull().sum()}')

headline: 0
short_description: 0


# **Observation:**
- No missing values. Hence proceeding with the next steps.
- The only thing I need is the natural text. There is no requirement of two separate features like `headline` and `short_description`

## Next Step:
- Merging both the features to get a simpler structure of dataset.

In [9]:
train_data['document'] = train_data['headline'] + ' ' + train_data['short_description']
train_data.drop(columns=['headline', 'short_description'], inplace=True)

In [10]:
train_data.head()

Unnamed: 0,document
0,Over 4 Million Americans Roll Up Sleeves For O...
1,"American Airlines Flyer Charged, Banned For Li..."
2,23 Of The Funniest Tweets About Cats And Dogs ...
3,The Funniest Tweets From Parents This Week (Se...
4,Woman Who Called Cops On Black Bird-Watcher Lo...


# **Observation:**
- The Dataset is good to be proceeded for preprocessing part.

# Next Step:
- **Preprocessing:**
  - Lowercasing
  - Link removal
  - Numeric value removal
  - HTML Tags removal
  - Emoji removal
  - Stopword removal
  - Punctuations removal
  - Lemmatization

In [11]:
import numpy as np
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize # for stopword removal we first need to tokenize
from nltk.stem import WordNetLemmatizer

In [12]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt') # required for tokenization
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
row_count = train_data.shape[0]
print(f'Total number of rows: {row_count}')

Total number of rows: 209527


In [14]:
total_count = row_count
curr_count = 0

def text_preprocessing(text):
    if not isinstance(text, str):
      return ""

    global curr_count

    text = text.lower()

    text = text.translate(str.maketrans('', '', string.punctuation))

    text = re.sub(r"http\S+", "", text)

    text = re.sub(r"<.*?>", "", text)

    # numeric value removal
    text = re.sub(r'\d+', '', text)

    # emoji removal
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # stopword removal
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)

    cleaned_tokens = [word for word in word_tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in cleaned_tokens]

    curr_count += 1
    print(f'\rProcessed: {(curr_count/total_count) * 100}%', end='', flush=True)

    return " ".join(lemmatized_tokens)


train_data['cleaned_document'] = train_data['document'].apply(text_preprocessing)

print("Preprocessing Completed...")

train_data.head()

Processed: 100.0%Preprocessing Completed...


Unnamed: 0,document,cleaned_document
0,Over 4 Million Americans Roll Up Sleeves For O...,million american roll sleeve omicrontargeted c...
1,"American Airlines Flyer Charged, Banned For Li...",american airline flyer charged banned life pun...
2,23 Of The Funniest Tweets About Cats And Dogs ...,funniest tweet cat dog week sept dog dont unde...
3,The Funniest Tweets From Parents This Week (Se...,funniest tweet parent week sept accidentally p...
4,Woman Who Called Cops On Black Bird-Watcher Lo...,woman called cop black birdwatcher loses lawsu...


# **Observation:**
- The data has now been cleaned.

## Next Step:
- We have to create vector embeddings of words using the Word2Vec.
- The vocabulary needs to be stored along with the vectorizer model in order to provide the simantc similarity.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=20000,
    min_df=5,
    max_df=0.9
)

tfidf_matrix = tfidf.fit_transform(train_data['cleaned_document'])

print("Vocab size: ", len(tfidf.get_feature_names_out()))

Vocab size:  20000


In [19]:
from gensim.models import Word2Vec

tokenized_docs = [doc.split() for doc in train_data['cleaned_document']]

w2v_cbow = Word2Vec(
    sentences=tokenized_docs,
    vector_size=200,
    window=8,
    min_count=5,
    sg=0,
    workers=4,
    sample=1e-3,
    epochs=15
)

In [20]:
w2v_sg = Word2Vec(
    sentences=tokenized_docs,
    vector_size=200,
    window=8,
    min_count=5,
    sg=1,
    workers=4,
    sample=1e-3,
    epochs=15
)

In [21]:
print("CBOW:", w2v_cbow.wv.most_similar("technology", topn=5))
print("SG  :", w2v_sg.wv.most_similar("technology", topn=5))


CBOW: [('technological', 0.6013023853302002), ('digital', 0.5953191518783569), ('innovation', 0.5764516592025757), ('device', 0.5745869874954224), ('smartphones', 0.5589178204536438)]
SG  : [('technological', 0.5771042704582214), ('automation', 0.5734493136405945), ('interface', 0.5652498602867126), ('telehealth', 0.5634194612503052), ('computing', 0.5630481839179993)]


In [26]:
import joblib

joblib.dump(tfidf, "tfidf_vectorizer.pkl")
w2v_cbow.save("word2vec_cbow.model")
w2v_sg.save("word2vec_skipgram.model")

In [27]:
from collections import Counter

word_freq = Counter([w for doc in tokenized_docs for w in doc])
joblib.dump(word_freq, "word_frequencies.pkl")

from google.colab import files
files.download("word_frequencies.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
from google.colab import files

files.download("tfidf_vectorizer.pkl")
files.download("word2vec_cbow.model")
files.download("word2vec_skipgram.model")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>