In [3]:
import json
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader, Dataset
from google.colab import drive
import torch

In [4]:
drive.mount('/content/drive')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
data = pd.read_csv('/content/drive/My Drive/Headline Classification/data/news_train.csv')
data.head()

Unnamed: 0,category,headline
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160735 entries, 0 to 160734
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   category  160735 non-null  object
 1   headline  160730 non-null  object
dtypes: object(2)
memory usage: 2.5+ MB


In [10]:
data.dropna(axis = 0, inplace = True)

In [11]:
json_data = pd.read_json('/content/drive/My Drive/Headline Classification/data/News_Category_Dataset_v3.json', lines = True)[['headline', 'category']]

In [12]:
data.info(), json_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160730 entries, 0 to 160734
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   category  160730 non-null  object
 1   headline  160730 non-null  object
dtypes: object(2)
memory usage: 3.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   headline  209527 non-null  object
 1   category  209527 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


(None, None)

In [13]:
categories = ['CRIME', 'POLITICS', 'ENTERTAINMENT', 'WELLNESS', 'BUSINESS', 'SPORTS', 'WORLD NEWS', 'SCIENCE', 'TECH', 'MONEY']

In [14]:
data['Flag'] = data.category.apply(lambda x : x in categories)
data = data.loc[data.Flag]
json_data['Flag'] = json_data.category.apply(lambda x: x in categories)
json_data = json_data.loc[json_data.Flag]

In [15]:
json_data.category.value_counts()

POLITICS         35602
WELLNESS         17945
ENTERTAINMENT    17362
BUSINESS          5992
SPORTS            5077
CRIME             3562
WORLD NEWS        3299
SCIENCE           2206
TECH              2104
MONEY             1756
Name: category, dtype: int64

In [16]:
data.category.value_counts()

POLITICS         26273
WELLNESS         14289
ENTERTAINMENT    12744
BUSINESS          4750
SPORTS            3941
CRIME             2687
SCIENCE           1770
WORLD NEWS        1756
TECH              1639
MONEY             1374
Name: category, dtype: int64

In [17]:
data = pd.concat([data, json_data])
data.reset_index(drop = True, inplace= True)
data.drop(columns=['Flag'], inplace=True)
data.head()

Unnamed: 0,category,headline
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...


In [18]:
len(data)

166128

In [19]:
data.drop_duplicates(inplace=True, ignore_index=True)
len(data)

94611

In [20]:
data['category'] = data['category'].apply(lambda x : x.lower().split()[0])
data.head()

Unnamed: 0,category,headline
0,crime,There Were 2 Mass Shootings In Texas Last Week...
1,entertainment,Will Smith Joins Diplo And Nicky Jam For The 2...
2,entertainment,Hugh Grant Marries For The First Time At Age 57
3,entertainment,Jim Carrey Blasts 'Castrato' Adam Schiff And D...
4,entertainment,Julianna Margulies Uses Donald Trump Poop Bags...


In [31]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
tqdm.pandas()
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [32]:
def clean_text(text):
    tokeniser = WordPunctTokenizer()
    lemmatiser = WordNetLemmatizer()
    tokens = tokeniser.tokenize(text)
    return ' '.join([lemmatiser.lemmatize(token.lower()) for token in tokens if token.isalpha() and len(token) > 3 and token not in stop])

In [33]:
data['clean'] = data['headline'].progress_apply(lambda x : clean_text(x))

100%|██████████| 94611/94611 [00:08<00:00, 11090.56it/s]


In [48]:
class HeadlineDataset(Dataset):
    def __init__(self, data):
        self.text = data['clean']
        self.label = data['category']

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        return self.text[idx], self.label[idx]

In [49]:
dataset = HeadlineDataset(data)

In [61]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for i in range(len(data_iter)):
        text = data_iter[i][0]
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(dataset))

text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(device), text_list.to(device), offsets.to(device)

dataloader = DataLoader(
    dataset, batch_size=8, shuffle=False, collate_fn=collate_batch
)

In [62]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [68]:
class Classifier(nn.Module):
    def __init__(self, vocab_size, embedding_size, output_size, device):
        super(Classifier, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_size, sparse=False)
        self.fc1 = nn.Linear(embedding_size, embedding_size, device = device)
        self.fc2 = nn.Linear(embedding_size, output_size, device = device)

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, x):
        x = self.embedding(x)
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x))
        return x
num_class = len(set([label for (label, text) in dataset]))
vocab_size = len(vocab)
emsize = 64
model = Classifier(vocab_size, emsize, num_class, device)

KeyError: 94611

In [67]:
import time


def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(
                    epoch, idx, len(dataloader), total_acc / total_count
                )
            )
            total_acc, total_count = 0, 0
            start_time = time.time()


def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count

In [None]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 10  # epoch
LR = 5  # learning rate
BATCH_SIZE = 64  # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None
train_iter, test_iter = AG_NEWS()
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset, [num_train, len(train_dataset) - num_train]
)

train_dataloader = DataLoader(
    split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
valid_dataloader = DataLoader(
    split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)