# Text Summarization

In [1]:
import torch
import numpy as np

### PATH VARIABLES

In [2]:
DATA_PATH = "./data/news_summary.csv"
MODEL_PATH = "./model_checkpoints/"

### TUNABLE VARIABLES

In [3]:
NUM_WORDS = 25000
VOCAB_SIZE = NUM_WORDS + 4
EMBEDDING_DIM = 100
HIDDEN_DIM = 10

MAX_TEXT_LEN = 300
MAX_SUM_LEN = 60

BATCH_SIZE = 4
NUM_TEST = 300

DEVICE = 'cuda'

### LOAD DATA

In [4]:
from utils import get_data
from data import Dataset

In [5]:
data, w2i, i2w = get_data(DATA_PATH, NUM_WORDS)

Length of the data: 4514
Length of the data after dropping nan: 4396


In [19]:
dataset_train = Dataset(data[:-5], w2i, MAX_TEXT_LEN, MAX_SUM_LEN, isTrain=True)
dataset_test = Dataset(data[5:], w2i, MAX_TEXT_LEN, MAX_SUM_LEN, isTrain=True)

In [20]:
## Collate functions
def collate_fn_train(data):
    text, summary = zip(*data)

    text_lens = [len(i) for i in text]
    summary_lens = [len(i) for i in summary]

    text = torch.nn.utils.rnn.pad_sequence(text, batch_first=True, padding_value=w2i['<pad>'])
    summary = torch.nn.utils.rnn.pad_sequence(summary, batch_first=True, padding_value=w2i['<pad>'])

    text_lens = torch.from_numpy(np.asarray(text_lens))
    summary_lens = torch.from_numpy(np.asarray(summary_lens))

    return (text, text_lens), (summary, summary_lens)


def collate_fn_test(texts):
    text_lens = [len(i) for i in texts]
    texts = torch.nn.utils.rnn.pad_sequence(texts, batch_first=True, padding_value=w2i['<pad>'])

    text_lens = torch.from_numpy(np.asarray(text_lens))
    return texts, text_lens

In [21]:
dataloader_train = torch.utils.data.DataLoader(dataset_train, BATCH_SIZE, num_workers=4, shuffle=True, collate_fn=collate_fn_train)
dataloader_test = torch.utils.data.DataLoader(dataset_test, BATCH_SIZE // 4, num_workers=4, shuffle=False, collate_fn=collate_fn_train)

### INIT MODEL

In [22]:
from model import Seq2Seq

In [23]:
model = Seq2Seq(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM).to(DEVICE)

In [24]:
model

Seq2Seq(
  (embedding): Embedding(
    (embedding): Embedding(25004, 100)
  )
  (encoder): Encoder(
    (encoder_network): LSTM(100, 10, num_layers=2, batch_first=True)
  )
  (decoder): Decoder(
    (decoder_network): LSTM(100, 10, num_layers=2, batch_first=True)
  )
  (project): Projection(
    (projection_layer): Linear(in_features=20, out_features=25004, bias=True)
  )
  (attention): Attention()
)

## Training Utils

In [25]:
import torch.nn as nn

In [26]:
def train(model, n_epochs, train_loader, valid_loader):
    teacher_force = 0.3
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
    criterion = nn.NLLLoss(reduction='none')
    for ep in range(n_epochs):
        train_batch(model, ep + 1, train_loader, optimizer, teacher_force, criterion)
        torch.cuda.empty_cache()
        valid_batch(model, valid_loader, criterion)

        # Save model every 10 epochs
        if (ep + 1) % 10 == 0:
            torch.save(model.state_dict(), MODEL_PATH + "m1_e{}.model".format(ep))
        
        # Reduce LR every 40 epochs
        if (ep + 1) % 10 == 0:
            for p in optimizer.param_groups:
                p['lr'] *= 0.5
        
        # Reduce teacher forcing by 10% every 20 epochs
        if (ep + 1) % 10 == 0:
            if teacher_force >= 50:
                teacher_force -= 0.1


def train_batch(model, n_epoch, dataloader, optimizer, teacher_force, criterion):
    model.train()
    total_loss = 0.
    for batch_num, ((x, xlens), (y, ylens)) in enumerate(dataloader):

        # setup tensors
        x = x.long().to(DEVICE)
        y = y.long().to(DEVICE)

        # clear previous gradients
        optimizer.zero_grad()

        # generate predictions
        # output: (BATCH_SIZE, time_steps, NUM_WORDS)
        output = model(x, xlens, y, teacher_force_prob=teacher_force)

        ### Calculate Loss
        # 1. y must be shifted by 1 for loss calc. since outputs should not not contain <sos>
        y_true = torch.cat([y[:, 1:], torch.ones((y.size(0), 1)).long().to(DEVICE) * w2i["<pad>"]], dim=-1)
        # 2. Ouput shape for loss calculation must be of the form (BATCH_SIZE, NUM_WORDS, *)
        # Refer pytorch docs for more details
        loss = criterion(output.permute(0, 2, 1), y_true)

        # 3. Mask the loss. Needed since we have padding which is not needed
        # Can avoid if using pack_padded sequence?
        num_tokens = 0
        for i, yl in enumerate(ylens):
            loss[i, yl-1:] *= 0 # yl-1 to remove <sos>
            num_tokens += yl - 1

        # 4. SUM the losses then divide by number of tokens and finally call backward
        loss = loss.sum() / num_tokens
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 2)

        # Adjust parameters
        optimizer.step()
        total_loss += loss.item()

    # Display training stats
    print("EP: {} Loss: {}".format(n_epoch, total_loss / len(dataloader)))


def valid_batch(model, dataloader, criterion):
    total_loss = 0.
    model.eval()
    for batch_num, ((x, xlens), (y, ylens)) in enumerate(dataloader):

        # setup tensors
        x = x.long().to(DEVICE)
        y = y.long().to(DEVICE)

        # generate predictions
        # output: (BATCH_SIZE, time_steps, NUM_WORDS)
        output = model(x, xlens, y, teacher_force_prob=0)

        ### Calculate Loss
        # 1. y must be shifted by 1 for loss calc. since outputs should not not contain <sos>
        y_true = torch.cat([y[:, 1:], torch.ones((y.size(0), 1)).long().to(DEVICE) * w2i["<pad>"]], dim=-1)
        # 2. Ouput shape for loss calculation must be of the form (BATCH_SIZE, NUM_WORDS, *)
        # Refer pytorch docs for more details
        loss = criterion(output.permute(0, 2, 1), y_true)

        # 3. Mask the loss. Needed since we have padding which is not needed
        # Can avoid if using pack_padded sequence?
        num_tokens = 0
        for i, yl in enumerate(ylens):
            loss[i, yl-1:] *= 0 # yl-1 to remove <sos>
            num_tokens += yl - 1

        # 4. SUM the losses then divide by number of tokens and finally call backward
        loss = loss.sum() / num_tokens

        #Add loss
        total_loss += loss.item()

    # Display test stats
    print("Test Loss: {}".format(total_loss / len(dataloader)))

### Train the Model

In [27]:
train(model, 200, dataloader_train, dataloader_test)

EP: 1 Loss: 10.102949142456055
Test Loss: 10.096566772460937
EP: 2 Loss: 10.093341827392578
Test Loss: 10.091037178039551
EP: 3 Loss: 10.065802574157715
Test Loss: 10.085530471801757
EP: 4 Loss: 10.06158447265625
Test Loss: 10.079562187194824
EP: 5 Loss: 10.056870937347412
Test Loss: 10.073042488098144
EP: 6 Loss: 10.04487943649292
Test Loss: 10.065895462036133
EP: 7 Loss: 10.025747776031494
Test Loss: 10.059509086608887
EP: 8 Loss: 10.010315895080566
Test Loss: 10.053603363037109
EP: 9 Loss: 9.99931287765503
Test Loss: 10.047212600708008
EP: 10 Loss: 9.999106407165527
Test Loss: 10.042146492004395
EP: 11 Loss: 9.98165512084961
Test Loss: 10.0408145904541
EP: 12 Loss: 9.98488712310791
Test Loss: 10.037449836730957
EP: 13 Loss: 9.961812496185303
Test Loss: 10.033622741699219
EP: 14 Loss: 9.952765941619873
Test Loss: 10.030361938476563
EP: 15 Loss: 9.945680141448975
Test Loss: 10.026434898376465
EP: 16 Loss: 9.952454090118408
Test Loss: 10.02233829498291
EP: 17 Loss: 9.944035530090332
Te

KeyboardInterrupt: 

## Eval

In [None]:
def eval(model, dataloader):
    model.eval()
    for (x, xlens), (y, ylens) in enumerate(dataloader):
        raise NotImplemented("TODO")