In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import json
import torch
import numpy as np
from tqdm import tqdm
from torch import nn, optim
from torch.utils.data import DataLoader

from load_data import Tokenizer, GenderDataset, gender_data_collate_fn
from models.classifier_lstm import ClassifierLSTM

In [3]:
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

print(device)

cuda


In [4]:
classifier_embedding_size = 512
classifier_hidden_size = 512
classifier_num_layers = 2
classifier_is_bidirectional = True

classifier_lr = 0.001
classifier_num_epoch = 10
classifier_batch_size = 128
classifier_max_norm = 2

print_every = 200

In [5]:
with open(os.path.join(os.curdir, "data", "blog.json"), "r") as file:
    json_data = json.load(file)
docs = json_data['docs'][1:] # I don't want to see the first document

In [6]:
tokenizer = Tokenizer(docs)

Cutting documents into paragraphs of length 128...


100%|██████████| 19676/19676 [00:33<00:00, 579.47it/s]


Number of documents: 559126
Counting freqeuncies of words...


100%|██████████| 559126/559126 [00:16<00:00, 34176.40it/s]


Number of documents with lengths <= 128: 554016
Number of unique words before converting to <UNK>:  505954
Converting words with frequencies less than 10 to <UNK>...


554016it [00:11, 46524.85it/s]

Number of unique words after converting <UNK>:  59178
Known occurrences rate 98.69%





In [7]:
num_docs = len(docs)
num_train_docs = int(num_docs * 0.7)
num_val_docs = int(num_docs * 0.15)
num_test_docs = num_docs - num_train_docs - num_val_docs
print(num_train_docs, num_val_docs, num_test_docs)

13773 2951 2952


In [8]:
train_docs = docs[:num_train_docs]
val_docs = docs[num_train_docs:num_train_docs+num_val_docs]
test_docs = docs[num_train_docs+num_val_docs:]

In [9]:
train_dataset = GenderDataset(train_docs, tokenizer)
val_dataset = GenderDataset(val_docs, tokenizer)
test_dataset = GenderDataset(test_docs, tokenizer)

Cutting documents into paragraphs of length 128...


100%|██████████| 13773/13773 [00:24<00:00, 561.64it/s]


Number of documents: 405965
Counting freqeuncies of words...


100%|██████████| 405965/405965 [00:11<00:00, 35868.70it/s]


Number of documents with lengths <= 128: 402316
Cutting documents into paragraphs of length 128...


100%|██████████| 2951/2951 [00:05<00:00, 566.19it/s]


Number of documents: 87422
Counting freqeuncies of words...


100%|██████████| 87422/87422 [00:02<00:00, 35250.84it/s]


Number of documents with lengths <= 128: 86653
Cutting documents into paragraphs of length 128...


100%|██████████| 2952/2952 [00:03<00:00, 764.57it/s]


Number of documents: 65739
Counting freqeuncies of words...


100%|██████████| 65739/65739 [00:02<00:00, 32447.69it/s]


Number of documents with lengths <= 128: 65047


In [10]:
print(train_dataset[1], val_dataset[1], test_dataset[1])

([21627, 42637, 52748, 48438, 2086, 10627, 49101, 31020, 6536, 43359, 40539, 10627, 11637, 31002, 3856, 42637, 49443, 10627, 50505, 2086, 29356, 3856, 29356, 58450, 39587, 16406, 29356, 52375, 55529, 27932, 29356, 33744, 17357, 3856, 14829, 3856, 45206, 42637, 25002, 37089, 43603, 52748, 15624, 3241, 10627, 26430, 2086, 10627, 37089, 4229, 3856, 42637, 25827, 47185, 31287, 52748, 10627, 42341, 9134, 3241, 38470, 47731, 3856, 27932, 46076, 52748, 26462, 33744, 58183, 33744, 21761, 49127, 42637, 5759, 27932, 42130, 47185, 1834, 21761, 20820, 3856, 40539, 40817, 52375, 10627, 9098, 47221, 40539, 58406, 45187, 58177, 57879, 45961, 42334, 27034, 53332, 27932, 48862, 3856, 3407], 0) ([21627, 51063, 46504, 16364, 12695, 14692, 7751, 50794, 52375, 30563, 37179, 29217, 3484, 13030, 54486, 3241, 16331, 9673, 22875, 52375, 27932, 51063, 8836, 27034, 24757, 35033, 42637, 3241, 58521, 39168, 3856, 11225, 51063, 11676, 35401, 3637, 40539, 10627, 25385, 33123, 40714, 45613, 49106, 30563, 28629, 3802,

In [34]:
train_dataloader = DataLoader(
    train_dataset,
    batch_size=classifier_batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=gender_data_collate_fn
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=classifier_batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=gender_data_collate_fn
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=classifier_batch_size,
    shuffle=True,
    num_workers=0,
    collate_fn=gender_data_collate_fn
)

In [31]:
classifier_model = ClassifierLSTM(
    tokenizer.vocab_size(), 
    classifier_embedding_size, 
    classifier_hidden_size, 
    classifier_num_layers, 
    classifier_is_bidirectional
).to(device)

In [32]:
optimizer = optim.Adam(classifier_model.parameters(), lr = classifier_lr)

In [33]:
criterion = nn.CrossEntropyLoss()

In [39]:
def train(train_dataloader, val_dataloader, model, criterion, optimizer, num_epoch):

    for epoch in range(num_epoch):
        print(f"Epoch {epoch}, total {len(train_dataloader)} batches\n")
        model.train()
        optimizer.zero_grad()

        for batch, (src_ids, src_len, tgt) in enumerate(train_dataloader):

            src_ids = src_ids.to(device)
            tgt = tgt.to(device)

            logits = model(src_ids, src_len)
            loss = criterion(logits, tgt)
            if batch % print_every == 0:
                probs = nn.functional.softmax(logits)
                print(f"Epoch Step: {batch} Loss: {loss} Acc: {np.mean([probs.cpu().detach().numpy()[i][tgt[i]] for i in range(logits.size(0))])}")

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), classifier_max_norm)
            optimizer.step()
            optimizer.zero_grad()

        print("\nBegin Evaluation")
        model.eval()
        total_acc = 0
        with torch.no_grad():
            for batch, (src_ids, src_len, tgt) in enumerate(val_dataloader):
                src_ids = src_ids.to(device)
                tgt = tgt.to(device)
                logits = model(src_ids, src_len)
                probs = nn.functional.softmax(logits)
                total_acc += np.sum([probs.cpu().detach().numpy()[i][tgt[i]] for i in range(logits.size(0))])
        
        acc = total_acc / len(val_dataloader.dataset)
        print(f"Validation Accuracy: {acc}, model saved\n")

        torch.save(model.state_dict(), f'./save/classifier_model_{classifier_hidden_size}_{classifier_batch_size}_{classifier_num_layers}_{classifier_is_bidirectional}_epoch_{epoch}.file')


In [38]:
train(train_dataloader, val_dataloader, classifier_model, criterion, optimizer, classifier_num_epoch)

Epoch 0, total 3144 batches



  probs = nn.functional.softmax(logits)


Epoch Step: 0 Loss: 0.6955457329750061 Acc: 0.4992351233959198


KeyboardInterrupt: 