In [8]:
import os
import json
import spacy
import pandas as pd
from spacy_download import load_spacy


# Run the pipeline on the GPU; require_gpu() raises if no GPU can be allocated.
spacy.require_gpu()
# load_spacy (from spacy_download) installs the model on first use and then
# loads it. Plain spacy.load() would raise OSError when the model is missing.
nlp = load_spacy("en_core_web_trf")

In [9]:
# Read the dataset as JSON Lines (one JSON record per line of the file) and
# attach a 'cleaned_text' column that holds None for every row for now.
df = pd.read_json(
    './News_Category_Dataset_v3.json',
    lines=True,
    encoding='utf-8',
).assign(cleaned_text=None)

In [10]:
# Near-duplicate category labels are folded into one canonical label each
# to improve accuracy. Labels missing from the table are left unchanged.
category_aliases = {
    "THE WORLDPOST": "WORLDPOST",
    "ARTS": "ARTS & CULTURE",
    "CULTURE & ARTS": "ARTS & CULTURE",
    "STYLE": "STYLE & BEAUTY",
}
df['category'] = df['category'].map(lambda label: category_aliases.get(label, label))

In [11]:
# Group the articles by their (merged) category label.
categories = df.groupby(by='category')
# NOTE: despite its name this is a Series of row counts per category,
# not a single number.
num_topics = categories.size()

In [12]:
# Give the model more signal per article: headline and short description are
# joined with a single space into one 'text' field.
df['text'] = df['headline'].str.cat(df['short_description'], sep=" ")

In [13]:
def pre_process_document(text):
    """Normalize one document into a space-separated string of lowercase lemmas.

    The text is run through the module-level ``nlp`` pipeline; stop words and
    punctuation tokens are dropped, and the lemma of every remaining token is
    lowercased.

    Args:
        text: Raw document text to clean.

    Returns:
        str: The surviving lemmas joined by single spaces (empty string when
        no token survives the filter).
    """
    # Only skip components whose output is not used here. The tagger must stay
    # enabled: spaCy's rule-based English lemmatizer relies on the POS tags it
    # feeds, and without them token.lemma_ is just the lowercased surface form.
    doc = nlp(text, disable=['parser', 'ner'])
    lemmas = [
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct)
    ]
    return ' '.join(lemmas)

In [14]:
# The topics generated from the raw text used a lot of stop words, so every
# document is stripped of stop words and punctuation and then lemmatized
# before topic modeling.
# The result is written to the 'cleaned_text' column: the raw 'text' column
# stays intact, and re-running this cell never re-processes cleaned text.
# NOTE(review): this makes one nlp() call per row; if it is too slow,
# consider batching the documents with nlp.pipe().
df['cleaned_text'] = df['text'].apply(pre_process_document)

KeyboardInterrupt: 

In [None]:
# Sanity check: preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,link,headline,category,short_description,authors,date,cleaned_text,text
0,https://www.huffpost.com/entry/covid-boosters-...,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...,"Carla K. Johnson, AP",2022-09-23,4 million Americans roll sleeve Omicron target...,Over 4 Million Americans Roll Up Sleeves For O...
1,https://www.huffpost.com/entry/american-airlin...,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...,Mary Papenfuss,2022-09-23,American Airlines Flyer charge ban life Punchi...,"American Airlines Flyer Charged, Banned For Li..."
2,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23,23 Funniest tweet Cats Dogs Week Sept. 17 23 d...,23 Of The Funniest Tweets About Cats And Dogs ...
3,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23,Funniest Tweets parent Week Sept. 17 23 accide...,The Funniest Tweets From Parents This Week (Se...
4,https://www.huffpost.com/entry/amy-cooper-lose...,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...,Nina Golgowski,2022-09-22,woman call cop Black Bird Watcher lose Lawsuit...,Woman Who Called Cops On Black Bird-Watcher Lo...


In [None]:
# Persist the frame as JSON Lines: one record (row) per line of the file.
df.to_json(
    path_or_buf='cleaned_kaggle_data.json',
    lines=True,
    orient='records',
)