In [1]:
import json
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset

In [2]:
data = pd.read_csv('./data/news_train.csv')
data.head()

Unnamed: 0,category,headline
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160735 entries, 0 to 160734
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   category  160735 non-null  object
 1   headline  160730 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [4]:
data.dropna(axis = 0, inplace = True)

In [5]:
json_data = pd.read_json('./data/News_Category_Dataset_v3.json', lines = True)[['headline', 'category']]

In [6]:
data.info(), json_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160730 entries, 0 to 160734
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   category  160730 non-null  object
 1   headline  160730 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  209527 non-null  object
 1   category  209527 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


(None, None)

In [7]:
categories = ['CRIME', 'POLITICS', 'ENTERTAINMENT', 'WELLNESS', 'BUSINESS', 'SPORTS', 'WORLD NEWS', 'SCIENCE', 'TECH', 'MONEY']

In [8]:
data['Flag'] = data.category.apply(lambda x : x in categories)
data = data.loc[data.Flag]
json_data['Flag'] = json_data.category.apply(lambda x: x in categories)
json_data = json_data.loc[json_data.Flag]

In [9]:
json_data.category.value_counts()

category
POLITICS         35602
WELLNESS         17945
ENTERTAINMENT    17362
BUSINESS          5992
SPORTS            5077
CRIME             3562
WORLD NEWS        3299
SCIENCE           2206
TECH              2104
MONEY             1756
Name: count, dtype: int64

In [10]:
data.category.value_counts()

category
POLITICS         26273
WELLNESS         14289
ENTERTAINMENT    12744
BUSINESS          4750
SPORTS            3941
CRIME             2687
SCIENCE           1770
WORLD NEWS        1756
TECH              1639
MONEY             1374
Name: count, dtype: int64

In [11]:
data = pd.concat([data, json_data])
data.reset_index(drop = True, inplace= True)
data.drop(columns=['Flag'], inplace=True)
data.head()

Unnamed: 0,category,headline
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [12]:
len(data)

166128

In [13]:
data.drop_duplicates(inplace=True, ignore_index=True)
len(data)

94611

In [14]:
data['category'] = data['category'].apply(lambda x : x.lower().split()[0])
data.head()

Unnamed: 0,category,headline
0,crime,There Were 2 Mass Shootings In Texas Last Week...
1,entertainment,Will Smith Joins Diplo And Nicky Jam For The 2...
2,entertainment,Hugh Grant Marries For The First Time At Age 57
3,entertainment,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,entertainment,Julianna Margulies Uses Donald Trump Poop Bags...


In [15]:
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
tqdm.pandas()
stop = stopwords.words('english')

In [16]:
def clean_text(text):
    tokeniser = WordPunctTokenizer()
    lemmatiser = WordNetLemmatizer()
    tokens = tokeniser.tokenize(text)
    return ' '.join([lemmatiser.lemmatize(token.lower()) for token in tokens if token.isalpha() and len(token) > 3 and token not in stop])

In [17]:
data['clean'] = data['headline'].progress_apply(lambda x : clean_text(x))

100%|██████████| 94611/94611 [00:03<00:00, 25515.83it/s]


In [18]:
class HeadlineDataset(Dataset):
    def __init__(self, data):
        self.text = data['clean']
        self.label = data['category']
    
    def __len__(self):
        return len(self.text)
    


Unnamed: 0,category,headline,clean
0,crime,There Were 2 Mass Shootings In Texas Last Week...,there were mass shooting texas last week only
1,entertainment,Will Smith Joins Diplo And Nicky Jam For The 2...,will smith join diplo nicky world official song
2,entertainment,Hugh Grant Marries For The First Time At Age 57,hugh grant marries first time
3,entertainment,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,carrey blast castrato adam schiff democrat art...
4,entertainment,Julianna Margulies Uses Donald Trump Poop Bags...,julianna margulies us donald trump poop bag pi...
...,...,...,...
94606,science,Thomas Edison Voted Most Iconic Inventor In U....,thomas edison voted most iconic inventor history
94607,business,Four More Bank Closures Mark the Week of Janua...,four more bank closure mark week january
94608,business,Walmart Waving Goodbye To Some Greeters,walmart waving goodbye some greeter
94609,entertainment,'Girl With the Dragon Tattoo' India Release Ca...,girl with dragon tattoo india release canceled...
