In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from my_utils import save_to_pickle, load_from_pickle
import string
from tqdm import tqdm
from collections import OrderedDict
import torchtext
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader


In [2]:
dataset = load_dataset("json", data_files="data/News_Category_Dataset_v3.json", split=["train"])[0]
dataset

Using custom data configuration default-7e666937cb3173ed


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-7e666937cb3173ed/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-7e666937cb3173ed/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
    num_rows: 209527
})

In [3]:
ds1 = dataset.train_test_split(0.2, seed=42)
ds2 = ds1['train'].train_test_split(0.25, seed=42)
ds_train = ds2['train']
ds_val = ds2['test']
ds_test = ds1['test']

print(ds_train)
print(ds_val)
print(ds_test)

Dataset({
    features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
    num_rows: 125715
})
Dataset({
    features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
    num_rows: 41906
})
Dataset({
    features: ['link', 'headline', 'category', 'short_description', 'authors', 'date'],
    num_rows: 41906
})


In [4]:
ds_train[2]

{'link': 'https://www.huffingtonpost.com/entry/nicki-minaj-soulja-boy-yasss-bish_n_5260517.html',
 'headline': 'Nicki Minaj Addresses Donald Sterling Controversy In New Song',
 'category': 'ENTERTAINMENT',
 'short_description': '',
 'authors': 'Matthew Jacobs',
 'date': datetime.datetime(2014, 5, 3, 0, 0)}

In [5]:
categories = set(dataset['category'])
print(categories)
category_map = dict(zip(categories, range(len(categories))))

{'WELLNESS', 'GOOD NEWS', 'TASTE', 'IMPACT', 'ENTERTAINMENT', 'TRAVEL', 'TECH', 'PARENTING', 'WORLDPOST', 'U.S. NEWS', 'CRIME', 'HEALTHY LIVING', 'HOME & LIVING', 'SCIENCE', 'BLACK VOICES', 'EDUCATION', 'GREEN', 'WORLD NEWS', 'ENVIRONMENT', 'POLITICS', 'QUEER VOICES', 'WEIRD NEWS', 'PARENTS', 'STYLE', 'ARTS', 'BUSINESS', 'FIFTY', 'COLLEGE', 'ARTS & CULTURE', 'SPORTS', 'CULTURE & ARTS', 'THE WORLDPOST', 'MEDIA', 'DIVORCE', 'STYLE & BEAUTY', 'FOOD & DRINK', 'RELIGION', 'MONEY', 'COMEDY', 'WOMEN', 'WEDDINGS', 'LATINO VOICES'}


In [6]:
# calculate the amount of articles per category
# categories = set(dataset['category'])
# cat_count = {cat: dataset['category'].count(cat) for cat in categories}
# cat_count

{'FIFTY': 1401,
 'GREEN': 2622,
 'BLACK VOICES': 4583,
 'MONEY': 1756,
 'LATINO VOICES': 1130,
 'STYLE & BEAUTY': 9814,
 'TECH': 2104,
 'COLLEGE': 1144,
 'THE WORLDPOST': 3664,
 'WELLNESS': 17945,
 'WORLD NEWS': 3299,
 'ARTS & CULTURE': 1339,
 'COMEDY': 5400,
 'TRAVEL': 9900,
 'WORLDPOST': 2579,
 'ARTS': 1509,
 'DIVORCE': 3426,
 'WEDDINGS': 3653,
 'RELIGION': 2577,
 'FOOD & DRINK': 6340,
 'SCIENCE': 2206,
 'WEIRD NEWS': 2777,
 'CRIME': 3562,
 'PARENTING': 8791,
 'BUSINESS': 5992,
 'WOMEN': 3572,
 'STYLE': 2254,
 'ENTERTAINMENT': 17362,
 'HEALTHY LIVING': 6694,
 'EDUCATION': 1014,
 'QUEER VOICES': 6347,
 'U.S. NEWS': 1377,
 'TASTE': 2096,
 'SPORTS': 5077,
 'ENVIRONMENT': 1444,
 'PARENTS': 3955,
 'POLITICS': 35602,
 'HOME & LIVING': 4320,
 'GOOD NEWS': 1398,
 'MEDIA': 2944,
 'CULTURE & ARTS': 1074,
 'IMPACT': 3484}

In [7]:
# tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [8]:
class MyTokenizer():
    def __init__(self):
        pass
    
    def __call__(self, text):
        # lowercase, remove punctuation, and split by space
        out = text.lower().replace("\n","").replace('t','').translate(str.maketrans('', '', string.punctuation)).split(" ")
        res = []
        for i, word in enumerate(out):
            if (any(char.isnumeric() for char in word)):
                for letter in word:
                    res.append(letter)
            else:
                res.append(word)
        return res

In [9]:
sentence = '"Until you have a dog you1 don\'t understand\n what could be eaten."'
tokenizer = MyTokenizer()
print(tokenizer(sentence))

['unil', 'you', 'have', 'a', 'dog', 'y', 'o', 'u', '1', 'don', 'undersand', 'wha', 'could', 'be', 'eaen']


In [10]:
# find all words in the text
def find_all_tokens(dataset, tokenizer):
    try: all_tokens = load_from_pickle("pickle/all_tokens.pickle")
    except:
        all_tokens = set()
        for d in [dataset['headline'], dataset['short_description']]:
            for sentence in tqdm(d):
                for token in tokenizer(sentence):
                    all_tokens.add(token)
        all_tokens = list(all_tokens)
        all_tokens.sort()
        save_to_pickle(all_tokens, "pickle/all_tokens.pickle")
    return all_tokens

def calc_token_counts(dataset, tokenizer):
    try: token_counts = load_from_pickle("pickle/token_counts.pickle")
    except:
        token_counts = OrderedDict()
        for d in [dataset['headline'], dataset['short_description']]:
            for sentence in tqdm(d):
                for token in tokenizer(sentence):
                    if token in token_counts.keys():
                        token_counts[token] += 1
                    else:
                        token_counts[token] = 1
        save_to_pickle(token_counts, "pickle/token_counts.pickle")
    return token_counts


            
all_tokens = find_all_tokens(dataset, tokenizer)
token_counts = calc_token_counts(dataset, tokenizer)
print(len(all_tokens))
print(len(token_counts.keys()))

119119
119119


In [11]:
# create the vocab
unk_token = '<unk>'
vocab = torchtext.vocab.vocab(
    token_counts,
    min_freq=10,
    specials=[unk_token], 
    special_first=True
)
vocab.set_default_index(vocab[unk_token])

print(len(vocab))
vocab[unk_token], vocab['hi'], vocab['dog'], vocab['dawg'], vocab['doggo']

23975


(0, 2104, 1253, 14115, 0)

In [70]:
# first, a dataloader:

def collate_fn(data):
    # return list(map(lambda x: (x['headline'], x['short_description'], x['category']), data))
    batch = {'headline': [], 'short_description': []}
    targ = []
    for item in data:
        batch['headline'].append(item['headline'])
        batch['short_description'].append(item['short_description'])
        targ.append(item['category'])
    targ = torch.tensor(list(map(lambda x: category_map[x], targ)), dtype=torch.int64)
    return batch, targ

micro_loader = DataLoader(ds_train.select(range(64)), collate_fn=collate_fn, batch_size=64, shuffle=True)
train_loader = DataLoader(ds_train, collate_fn=collate_fn, batch_size=64, shuffle=True)
val_loader   = DataLoader(ds_val, collate_fn=collate_fn, batch_size=64, shuffle=True)
test_loader  = DataLoader(ds_test, collate_fn=collate_fn, batch_size=64, shuffle=True)   


In [65]:
class PreprocessingModule(nn.Module):
    def __init__(self, tokenizer, vocab):
        super().__init__()
        self.tokenizer = tokenizer
        self.vocab = vocab
    
    def forward(self, x):
        headlines, descriptions = x['headline'], x['short_description']
        h_tok, d_tok = [], []
        h_longest, d_longest = 0, 0
        bs = len(headlines)
        
        for i in range(bs):
            h_tok.append(self.vocab(self.tokenizer(headlines[i])))      
            d_tok.append(self.vocab(self.tokenizer(descriptions[i])))
            h_longest = max(h_longest, len(h_tok[-1]))
            d_longest = max(d_longest, len(d_tok[-1]))
        h_ten, d_ten = torch.zeros((bs,h_longest), dtype=torch.int32), torch.zeros((bs,d_longest), dtype=torch.int32)
                
        for i in range(bs):
            h_ten[i,0:len(h_tok[i])] = torch.tensor(h_tok[i], dtype=torch.int32)
            d_ten[i,0:len(d_tok[i])] = torch.tensor(d_tok[i], dtype=torch.int32)
        return h_ten.cuda(), d_ten.cuda()
            

In [48]:
# custom embedding module - we'll have to train the embeddings

# for embedding both the headline and description
class EmbedItemModule(nn.Module):
    def __init__(self, tokenizer, vocab, dim):
        super().__init__()
        self.embedding = nn.Embedding(len(vocab), dim)
    
    def forward(self, x):
        h_ten, d_ten = x
        emb_h = self.embedding(h_ten)
        emb_d = self.embedding(d_ten)
        return emb_h, emb_d

In [49]:
# for the baseline encoder, we just take the mean of the elements
class BaselineEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, x):
        emb_h, emb_d = x
        u, v = emb_h.mean(axis=1), emb_d.mean(axis=1)
        return torch.hstack([u,v])

In [50]:
class MLP(nn.Module):
    def __init__(self, d_in, d_hid, d_out):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(d_in, d_hid),
            nn.ReLU(),
            nn.Linear(d_hid, d_out),
            nn.Softmax(dim=1),
        )
    def forward(self, x):
        return self.model(x)

In [80]:
def make_baseline_model(embedding_dim, mlp_dim):
    return nn.Sequential(
        PreprocessingModule(tokenizer, vocab),
        EmbedItemModule(tokenizer, vocab, embedding_dim).cuda(),
        BaselineEncoder().cuda(),
        MLP(2*embedding_dim, mlp_dim, len(categories)).cuda(),
    )

embedding_dim = 100
model = make_baseline_model(embedding_dim, 200)

for batch, targ in train_loader:
    print(model(batch).shape)
    # print(targ)
    break

torch.Size([64, 42])


In [57]:
# now the training and evaluation loops
def evaluate(model, loader):
    model.eval()
    correct, total = 0., 0.
    bs = loader.batch_size
    for batch, targ in loader:
        # the tensors get moved to cuda at the end of the Preprocessing module
        # note to self: next time I should preprocess the dataset once in advance instead of doing it every time on the fly
        with torch.no_grad():
            logits = model(batch)
        preds = logits.detach().argmax(axis=1)
        correct += (preds == targ.cuda()).sum()
        total += bs
    return correct / total     

In [58]:
# import time
# start = time.time()
# evaluate(model, val_loader)
# end = time.time()
# print(end - start)

In [59]:
1/42

0.023809523809523808

In [60]:
len(ds_val)

41906

In [95]:
# training 
def train_epoch(model, loader, optimizer, loss_module):
    model.train()
    for batch, targ in loader:
        logits = model(batch)
        loss = loss_module(logits, targ.cuda())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
def train_model(model, train_loader, val_loader, optimizer, loss_module, epochs):
    val_accs = []
    for epoch in range(epochs):
        train_epoch(model, train_loader, optimizer, loss_module)
        val_acc = evaluate(model, val_loader)
        val_accs.append(val_acc)
        print(val_acc)
    return val_accs

def train_model_dev(model, train_loader, val_loader, optimizer, loss_module, epochs):
    val_accs = []
    for epoch in tqdm(range(epochs)):
        for i in range(1000):
            train_epoch(model, train_loader, optimizer, loss_module)
        val_acc = evaluate(model, val_loader)
        val_accs.append(val_acc)
        print(val_acc)
        

In [99]:
model = make_baseline_model(100,1000)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)
loss_module = nn.CrossEntropyLoss()

train_model_dev(model, micro_loader, micro_loader, optimizer, loss_module, 15)

  7%|▋         | 1/15 [00:20<04:53, 20.99s/it]

tensor(0.9062, device='cuda:0')


 13%|█▎        | 2/15 [00:43<04:41, 21.62s/it]

tensor(0.9219, device='cuda:0')


 20%|██        | 3/15 [01:03<04:11, 20.95s/it]

tensor(0.9375, device='cuda:0')


 27%|██▋       | 4/15 [01:25<03:55, 21.43s/it]

tensor(0.9375, device='cuda:0')


 33%|███▎      | 5/15 [01:47<03:36, 21.64s/it]

tensor(0.9688, device='cuda:0')


 40%|████      | 6/15 [02:07<03:11, 21.25s/it]

tensor(0.9688, device='cuda:0')


 47%|████▋     | 7/15 [02:28<02:49, 21.19s/it]

tensor(0.9688, device='cuda:0')


 53%|█████▎    | 8/15 [02:48<02:23, 20.56s/it]

tensor(0.9688, device='cuda:0')


 60%|██████    | 9/15 [03:08<02:02, 20.41s/it]

tensor(0.9688, device='cuda:0')


 67%|██████▋   | 10/15 [03:30<01:44, 21.00s/it]

tensor(1., device='cuda:0')


 73%|███████▎  | 11/15 [03:52<01:25, 21.30s/it]

tensor(1., device='cuda:0')


 80%|████████  | 12/15 [04:14<01:04, 21.54s/it]

tensor(1., device='cuda:0')


 87%|████████▋ | 13/15 [04:35<00:42, 21.33s/it]

tensor(1., device='cuda:0')


 93%|█████████▎| 14/15 [04:56<00:21, 21.26s/it]

tensor(1., device='cuda:0')


100%|██████████| 15/15 [05:17<00:00, 21.14s/it]

tensor(1., device='cuda:0')





### Conclusion 
I can overfit on one batch now, but it is way too slow. I'll try again, but this time making the dataloaders more efficient and see if it works better then.