In [1]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [1]:
import csv
import gzip
from itertools import chain
import logging
import json
from pathlib import Path
from typing import Dict, Optional

from seqeval.metrics import classification_report, accuracy_score, f1_score
from seqeval.scheme import IOB2
import spacy
from spacy.lang.en import English
import numpy as np
import pickle
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint
import torch
from torch.optim import Adam
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

In [2]:
torch.manual_seed(420)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

In [3]:
def build_glove(fpath: str, save=True, save_path='../../data/glove.840B.300d.gz') -> Dict[str, np.array]:
    logger.info("Loading Glove embeddings...")
    glove = {}
    with open(fpath) as f:
        for line in f:
            values = line.split()
            word = ''.join(values[:-300])
            vector = np.array([float(v) for v in values[-300:]])
            glove[word] = vector
            
    logger.info("GloVe embeddings loaded.")
    if save:
        logger.info("Saving GloVe to disk.")
        with gzip.open(save_path, 'wb') as f:
            pickle.dump(glove, f)
        logger.info("Save complete.")
    return glove

def build_mapping(fpath: str, key: str, save_path: str):
    with open(fpath) as f:
        data = json.load(f)
    
#     labels = list(set([*i[key] if isinstance(i[key], list) else i[key] for i in data]))
    labels = []
    for sample in data:
        label = sample[key]
        if isinstance(label, list):
            labels.extend(label)
        else:
            labels.append(label)
    labels = list(set(labels))
    labels_to_idx = {label: idx for idx, label in enumerate(labels)}  
    
    if save_path:
        with open(save_path, 'w') as f:
            json.dump(labels_to_idx, f, ensure_ascii=False, indent=4)
    
    return labels_to_idx

    
def write_preds_to_csv(task: str, ids: list, preds: list, fname: str = "intent_preds.csv"):
    fpath = "../ADL21-HW1/data/intent/" + fname
    if task == 'tagging':
        header = ['id', 'tags']
        out = {}
        for _id, pred in zip(ids, preds):
            out[_id] = out.get(_id, list())
            out[_id].append(pred)
        out = {key: " ".join(val) for key, val in out.items()}
            
        
    elif task == 'intent':
        header = ['id', 'intent']
        
    with open(fpath, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        if task == 'tagging':
            writer.writerows(out.items())
        elif task == 'intent':
            writer.writerows(zip(ids, preds))
    logger.info(f"Intent predictions written to {fpath}")

In [229]:
nlp = spacy.load('en_core_web_md')

In [231]:
nlp.get_pipe('tagger').labels

('$',
 "''",
 ',',
 '-LRB-',
 '-RRB-',
 '.',
 ':',
 'ADD',
 'AFX',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'HYPH',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NFP',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 'XX',
 '``')

In [7]:
# glove = build_glove('../../data/glove.840B.300d.txt')

2021-04-14 21:35:34,953 - __main__ - INFO - Loading Glove embeddings...
2021-04-14 21:38:30,555 - __main__ - INFO - GloVe embeddings loaded.
2021-04-14 21:38:30,556 - __main__ - INFO - Saving GloVe to disk.
2021-04-14 21:42:19,358 - __main__ - INFO - Save complete.


In [9]:
# build_intent_mappings('../ADL21-HW1/data/intent/train.json', save=True)

# Intent Classification

## Dataset

In [4]:
class IntentDataset(Dataset):
    def __init__(self, data_path: str, train: bool, device: str, intent_mapping: Dict[str, int], glove: Optional[Dict[str, np.array]] = None, glove_path: str = "../../data/glove.840B.300d.pkl.gz", unk_token_strategy='ignore'):
        with open(data_path) as f: 
            self.data = json.load(f)
        self.intent_to_idx = intent_mapping
        self.device = device
        self.train = train
        self.glove = glove
        self.tokenizer = English().tokenizer
        self.unk_token_strategy = unk_token_strategy
    
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        sample = self.data[idx]
        _id = sample['id']
        text = sample['text']
        text = self.convert_to_vectors(text)
        length = len(text)
        out = {
            'id': _id,
            'text': text,
            'length': length
        }
        if self.train:
            intent = sample['intent']
            intent = self.intent_to_idx[intent]
            out['intent'] = intent
        return out
        
    def convert_to_vectors(self, text):
        vectors = []
        if self.unk_token_strategy == 'ignore':
            for idx, tok in enumerate(self.tokenizer(text)):
                try:
#                     vector = self.glove[tok.text].to(self.device)
                    vector = torch.from_numpy(self.glove[tok.text]).float()
                except KeyError:
                    continue
                else:
                    vectors.append(vector)
        return torch.stack(vectors)
            
        
class IntentDataModule(pl.LightningDataModule):
    def __init__(self, device: str, data_dir: str = "../ADL21-HW1/data/intent", intent_mapping: str = "../data/intents_to_idx.json", embedding_obj: Optional[Dict[str, np.array]] = None, embedding_dir: str = "../../data/glove.840B.300d.gz", batch_size: int = 32):
        super().__init__()
        self.device = device
        self.data_dir = Path(data_dir)
        self.batch_size = batch_size
        with open(intent_mapping) as f:
            self.intent_to_idx = json.load(f)
        if embedding_obj:
            self.emb = embedding_obj
        else:
            self.emb = self._load_glove(embedding_dir)
        
    def setup(self, stage: Optional[str] = None):
        if stage == "fit" or stage is None:
            self.intent_train = IntentDataset(
                device=self.device,
                data_path=self.data_dir.joinpath('train.json'), 
                train=True,
                intent_mapping=self.intent_to_idx, 
                glove=self.emb
            ) 
            self.intent_val = IntentDataset(
                device=self.device,
                data_path=self.data_dir.joinpath('eval.json'), 
                train=True,
                intent_mapping=self.intent_to_idx, 
                glove=self.emb
            ) 
        elif stage == "test" or stage is None:
            self.intent_test = IntentDataset(
                device=self.device,
                data_path=self.data_dir.joinpath('test.json'), 
                train=False,
                intent_mapping=self.intent_to_idx, 
                glove=self.emb
            ) 
        
    def train_dataloader(self):
        return DataLoader(self.intent_train, batch_size=self.batch_size, num_workers=8, pin_memory=True, collate_fn=self._collate_fn(False), shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.intent_val, batch_size=self.batch_size, num_workers=8, pin_memory=True, collate_fn=self._collate_fn(False))

    def test_dataloader(self):
        return DataLoader(self.intent_test, batch_size=self.batch_size, num_workers=8, pin_memory=True, collate_fn=self._collate_fn(True))
        
    @staticmethod
    def _collate_fn(is_test):
        def collate_fn(batch):
            out = {}
            _id = [b['id'] for b in batch]
            text = [b['text'] for b in batch]
            length = torch.LongTensor([b['length'] for b in batch])
            text = pad_sequence(text, batch_first=True)
            if not is_test:
                intent = torch.LongTensor([b['intent'] for b in batch])
                out['intent'] = intent

            out['id'] = _id
            out['text'] = text
            out['length'] = length
            out['text'] = text
            return out
        return collate_fn
        
        
    @staticmethod
    def _load_glove(fpath: str) -> Dict[str, torch.FloatTensor]:
        logger.info("Loading GloVe embeddings...")
        with gzip.open(fpath, 'rb') as f:
            emb = pickle.load(f)
        logger.info("Done!")
        return emb
        

## Model

In [5]:
class IntentClassifier(pl.LightningModule):
    def __init__(self, num_labels: int, hidden_size: int = 128, num_layers: int = 3, bidirectional: bool = True, lr: int = 1e-4, dropout=0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = 2 if bidirectional else 1
        self.lr = lr
        self.rnn = nn.GRU(
            input_size=300, 
            hidden_size=hidden_size, 
            num_layers=num_layers, 
            bidirectional=bidirectional, 
            batch_first=True,
            dropout=dropout,
        )
        self.hidden_to_labels = nn.Linear(self.hidden_size * self.bidirectional, num_labels)
        self.save_hyperparameters()
        self.test_preds = {
            'ids': [],
            'logits': []
        }
        
    def forward(self, inpt):
        samples = inpt['text']
        lengths = inpt['length'].to('cpu')
        batch_size = samples.shape[0]
        samples = pack_padded_sequence(samples, lengths, batch_first=True, enforce_sorted=False)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(samples, hidden)
        hidden = torch.cat([hidden[-1,...], hidden[-2,...]], dim=1)  # concat last hidden states of forwards and backwards
        logits = self.hidden_to_labels(hidden)
        return logits
        
    def _shared_step(self, batch):
        ids = batch['id']
        intent = batch['intent']
        logits = self(batch)
        loss = F.nll_loss(F.log_softmax(logits, dim=1), intent)
        return loss
        
    def training_step(self, batch, batch_idx):
        loss = self._shared_step(batch)
        self.log('training_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss = self._shared_step(batch)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
        
    def test_step(self, batch, batch_idx):
        ids = batch['id']
        logits = self(batch)
        self.test_preds['ids'].extend(ids)
        self.test_preds['logits'].extend(logits)
        
    
    def process_logits(self, logits, idx2int):
        preds = torch.stack(logits)
        preds = preds.argmax(dim=1).tolist()
        preds = [idx2int[p] for p in preds]
        return preds
            
    def init_hidden(self, batch_size):
        return torch.normal(mean=0, std=1, size=(self.bidirectional * self.num_layers, batch_size, self.hidden_size)).to('cuda')
        
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=self.lr)

# Model training

In [6]:
# glove = IntentDataModule._load_glove('../../data/glove.840B.300d.gz')

NameError: name 'IntentDataModule' is not defined

In [9]:
intent_dm = IntentDataModule(device=device, embedding_obj=glove)
intent_labels = intent_dm.intent_to_idx

In [285]:
intent_dm.prepare_data()
intent_dm.setup('fit')

In [286]:
sample = next(iter(intent_dm.intent_train))

In [287]:
sample

{'id': 'train-0',
 'text': tensor([[ 0.1873,  0.4060, -0.5117,  ...,  0.1649,  0.1876,  0.5387],
         [ 0.1206,  0.1426, -0.1558,  ..., -0.3866,  0.0566,  0.0155],
         [-0.1108,  0.3079, -0.5198,  ..., -0.0591,  0.4760,  0.0566],
         ...,
         [-0.2323,  0.4963,  0.3955,  ..., -0.3698, -0.2552,  0.2159],
         [-0.0702,  0.1527, -0.3309,  ..., -0.1373,  0.1575,  0.6155],
         [ 0.2123,  0.1944,  0.7883,  ...,  0.1774, -0.7119, -0.3592]]),
 'length': 15,
 'intent': 14}

In [63]:
model = IntentClassifier(num_labels=len(intent_labels), hidden_size=256, dropout=0.5)
# model = IntentClassifier.load_from_checkpoint("./lightning_logs/version_67/checkpoints/epoch=17-step=8441.ckpt")

In [64]:
trainer = pl.Trainer(
#     auto_lr_find=True,
    gpus=1,
    gradient_clip_val=1,
    weights_summary='full',
#     track_grad_norm=2,
    callbacks=[EarlyStopping(monitor='val_loss')],
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [65]:
trainer.fit(model, datamodule=intent_dm)


  | Name             | Type   | Params
--------------------------------------------
0 | rnn              | GRU    | 3.2 M 
1 | hidden_to_labels | Linear | 77.0 K
--------------------------------------------
3.3 M     Trainable params
0         Non-trainable params
3.3 M     Total params
13.198    Total estimated model params size (MB)


Epoch 0:  83%|████████▎ | 470/563 [00:14<00:02, 33.17it/s, loss=4.66, v_num=70, val_loss_epoch=4.990, training_loss_step=4.590]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/94 [00:00<?, ?it/s][A
Epoch 0:  85%|████████▍ | 476/563 [00:15<00:02, 31.49it/s, loss=4.66, v_num=70, val_loss_epoch=4.990, training_loss_step=4.590]
Epoch 0:  86%|████████▌ | 484/563 [00:15<00:02, 31.79it/s, loss=4.66, v_num=70, val_loss_epoch=4.990, training_loss_step=4.590]
Epoch 0:  88%|████████▊ | 496/563 [00:15<00:02, 32.36it/s, loss=4.66, v_num=70, val_loss_epoch=4.990, training_loss_step=4.590]
Epoch 0:  90%|█████████ | 508/563 [00:15<00:01, 32.90it/s, loss=4.66, v_num=70, val_loss_epoch=4.990, training_loss_step=4.590]
Validating:  43%|████▎     | 40/94 [00:01<00:00, 54.98it/s][A
Epoch 0:  92%|█████████▏| 520/563 [00:15<00:01, 33.32it/s, loss=4.66, v_num=70, val_loss_epoch=4.990, training_loss_step=4.590]
Epoch 0:  94%|█████████▍| 532/563 [00:15<00:00, 33.72it/s, loss=4.66, v_num=70, v

1

In [66]:
model = IntentClassifier.load_from_checkpoint('./lightning_logs/version_70/checkpoints/epoch=17-step=8441.ckpt')

In [67]:
trainer.test(model, datamodule=intent_dm)
test_preds = model.test_preds

Testing: 100%|██████████| 141/141 [00:02<00:00, 54.53it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------


In [110]:
# https://stackoverflow.com/a/50517921
TAG_MAP = [
    ".",        
    ",",        
    "-LRB-",    
    "-RRB-",    
    "``",       
    "\"\"",     
    "''",       
    ",",        
    "$",        
    "#",        
    "AFX",      
    "CC",       
    "CD",       
    "DT",       
    "EX",       
    "FW",       
    "HYPH",     
    "IN",       
    "JJ",       
    "JJR",      
    "JJS",      
    "LS",       
    "MD",       
    "NIL",      
    "NN",       
    "NNP",      
    "NNPS",     
    "NNS",   
    "PDT",   
    "POS",   
    "PRP",   
    "PRP$",  
    "RB",    
    "RBR",   
    "RBS",   
    "RP",    
    "SP",    
    "SYM",   
    "TO",    
    "UH",    
    "VB",    
    "VBD",  
    "VBG",  
    "VBN",  
    "VBP",  
    "VBZ",  
    "WDT",  
    "WP",   
    "WP$",  
    "WRB",  
    "ADD",  
    "NFP",   
    "GW",    
    "XX",    
    "BES",   
    "HVS",   
    "_SP",   
]
others = list(spacy.load('en_core_web_md').pipeline[1][1].labels)
combined = list(set(TAG_MAP + others))
with open('../data/pos_to_idx.json', 'w') as f:
    tags = {t: idx for idx, t in enumerate(combined)}
    json.dump(tags, f)

## Dataset Multitask

In [4]:
class IntentPosDataset(Dataset):
    def __init__(self, data_path: str, train: bool, intent_mapping: Dict[str, int], pos_map_path: str = "../data/pos_to_idx.json", glove: Optional[Dict[str, np.array]] = None, glove_path: str = "../../data/glove.840B.300d.pkl.gz", unk_token_strategy='average'):
        with open(data_path) as f: 
            self.data = json.load(f)
        self.intent_to_idx = intent_mapping
        self.train = train
        self.glove = glove
        self.nlp = spacy.load('en_core_web_md')
        with open(pos_map_path) as f:
            self.tag_to_idx = json.load(f)
        self.unk_token_strategy = unk_token_strategy
    
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        sample = self.data[idx]
        _id = sample['id']
        text = sample['text']
        doc = self.nlp(text)
        toks = [d.text for d in doc]
        text = self.convert_to_vectors(toks, _id)
        length = len(text)
        out = {
            'id': _id,
            'text': text,
            'length': length
        }
        if self.train:
            intent = sample['intent']
            intent = self.intent_to_idx[intent]
            tags = [d.tag_ for d in doc]
            tags = [self.tag_to_idx[t] for t in tags]
            
            out['tags'] = tags
            out['intent'] = intent
        return out
        
    def convert_to_vectors(self, text, _id):
        vectors = []
        missing_idx = []
        
        for idx, tok in enumerate(text):
            try:
                vector = torch.from_numpy(self.glove[tok]).float()
            except KeyError:
#                 avg = torch.mean(torch.stack(vectors), axis=0)
                missing_idx.append(idx)
                vectors.append(torch.zeros(300))
#                 vectors.append(avg)
                continue
            else:
                vectors.append(vector)
                
        if len(vectors) == len(missing_idx):
            return torch.stack(vectors)
        
        if self.unk_token_strategy == 'ignore':
            return torch.stack(vectors)
        
        elif self.unk_token_strategy == 'average':
            if missing_idx:
                vectors = self._average_tokens(vectors, missing_idx)
                
        vectors = torch.stack(vectors)
        if torch.isnan(vectors).sum() > 0:
            print('NaN in embeddings!')
            print(_id)
            raise Exception
                
        return vectors
    
    @staticmethod
    def _average_tokens(vectors: list, missing_idxs: list, window: int = 2):
        for m in missing_idxs:
            avg = vectors[max(m-window, 0): m] + vectors[m + 1: m+1+window]
            if not avg:
                avg = torch.stack(vectors)
            else:
                avg = torch.stack(avg)
            if avg.sum() == 0:
                vectors[m] = torch.zeros(300)
                continue
            avg = avg[avg.nonzero(as_tuple=True)].view(-1, avg.shape[1])
            avg = torch.mean(avg[avg.nonzero(as_tuple=True)].view(-1, avg.shape[1]), axis=0)
            vectors[m] = avg
        
        return vectors
            
        
class IntentPosDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = "../ADL21-HW1/data/intent", intent_mapping: str = "../data/intents_to_idx.json", embedding_obj: Optional[Dict[str, np.array]] = None, embedding_dir: str = "../../data/glove.840B.300d.gz", batch_size: int = 32):
        super().__init__()
        self.data_dir = Path(data_dir)
        self.batch_size = batch_size
        with open(intent_mapping) as f:
            self.intent_to_idx = json.load(f)
        if embedding_obj:
            self.emb = embedding_obj
        else:
            self.emb = self._load_glove(embedding_dir)
        
    def setup(self, stage: Optional[str] = None):
        if stage == "fit" or stage is None:
            self.intent_train = IntentPosDataset(
                data_path=self.data_dir.joinpath('train.json'), 
                train=True,
                intent_mapping=self.intent_to_idx, 
                glove=self.emb
            ) 
            self.intent_val = IntentPosDataset(
                data_path=self.data_dir.joinpath('eval.json'), 
                train=True,
                intent_mapping=self.intent_to_idx, 
                glove=self.emb
            ) 
            self.tag_to_idx = self.intent_train.tag_to_idx
        elif stage == "test" or stage is None:
            self.intent_test = IntentPosDataset(
                data_path=self.data_dir.joinpath('test.json'), 
                train=False,
                intent_mapping=self.intent_to_idx, 
                glove=self.emb
            ) 
        
    def train_dataloader(self):
        return DataLoader(self.intent_train, batch_size=self.batch_size, num_workers=8, pin_memory=True, collate_fn=self._collate_fn(False), shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.intent_val, batch_size=self.batch_size, num_workers=8, pin_memory=True, collate_fn=self._collate_fn(False))

    def test_dataloader(self):
        return DataLoader(self.intent_test, batch_size=self.batch_size, num_workers=8, pin_memory=True, collate_fn=self._collate_fn(True))
        
    @staticmethod
    def _collate_fn(is_test):
        def collate_fn(batch):
            out = {}
            _id = [b['id'] for b in batch]
            text = [b['text'] for b in batch]
            length = torch.LongTensor([b['length'] for b in batch])
            text = pad_sequence(text, batch_first=True)
            if not is_test:
                intent = torch.LongTensor([b['intent'] for b in batch])
                tags = [torch.LongTensor(b['tags']) for b in batch]
                tags = pad_sequence(tags, batch_first=True, padding_value=-1)
                out['intent'] = intent
                out['tags'] = tags

            out['id'] = _id
            out['text'] = text
            out['length'] = length
            out['text'] = text
            return out
        return collate_fn
        
        
    @staticmethod
    def _load_glove(fpath: str) -> Dict[str, torch.FloatTensor]:
        logger.info("Loading GloVe embeddings...")
        with gzip.open(fpath, 'rb') as f:
            emb = pickle.load(f)
        logger.info("Done!")
        return emb
        

## Model Multitask

In [22]:
class IntentPosClassifier(pl.LightningModule):
    def __init__(self, num_intent: int, num_tags: int, hidden_size: int = 512, num_layers: int = 3, bidirectional: bool = True, lr: int = 1e-4, dropout=0, loss_ratio=0.7, multitask=True):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.multitask = multitask
        self.loss_ratio = loss_ratio if multitask else 0.0
        self.bidirectional = 2 if bidirectional else 1
        self.lr = lr
        self.rnn = nn.GRU(
            input_size=300, 
            hidden_size=hidden_size, 
            num_layers=num_layers, 
            bidirectional=bidirectional, 
            batch_first=True,
            dropout=dropout,
        )
        self.hidden_to_labels = nn.Linear(self.hidden_size * self.bidirectional, num_intent)
        self.hidden_to_tags = nn.Linear(self.hidden_size * self.bidirectional, num_tags)
#         self.task_weight = nn.Linear(num_intent + num_tags, 2)
        self.dropout = nn.Dropout(dropout)
        self.save_hyperparameters()
        self.test_preds = {
            'ids': [],
            'logits': []
        }
        
    def forward(self, inpt):
        samples = inpt['text']
        tags = inpt.get('tags')
        lengths = inpt['length'].to('cpu')
        batch_size = samples.shape[0]
        
        samples = pack_padded_sequence(samples, lengths, batch_first=True, enforce_sorted=False)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(samples, hidden)
        hidden = self.dropout(hidden)
        hidden = torch.cat([hidden[-1,...], hidden[-2,...]], dim=1)  # concat last hidden states of forwards and backwards
        intent_logits = self.hidden_to_labels(hidden)
        if tags is not None: 
            out, out_len = pad_packed_sequence(out, batch_first=True)
            out = self.dropout(out)
            tag_logits = self.hidden_to_tags(out).permute(0, 2, 1)
            return intent_logits, tag_logits
            
        return intent_logits
        
    def _shared_step(self, batch):
        ids = batch['id']
        intent = batch['intent']
        tags = batch['tags']
        intent_logits, tag_logits = self(batch)
        intent_loss = F.cross_entropy(intent_logits, intent)
        tag_loss = F.cross_entropy(tag_logits, tags, ignore_index=-1)
        return intent_loss, tag_loss
        
    def training_step(self, batch, batch_idx):
        intent_loss, tag_loss = self._shared_step(batch)
        if self.multitask:
            intent_weight = self.loss_ratio
            tag_weight = 1.0 - self.loss_ratio
            loss = ((intent_loss * intent_weight) + (tag_loss * tag_weight)) 
        else:
            loss = intent_loss
        self.log('training_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        intent_loss, tag_loss = self._shared_step(batch)
        self.log('val_loss', intent_loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return intent_loss
        
    def test_step(self, batch, batch_idx):
        ids = batch['id']
        logits = self(batch)
        self.test_preds['ids'].extend(ids)
        self.test_preds['logits'].extend(logits)
        
    @staticmethod
    def process_logits(logits, idx2int):
        preds = torch.stack(logits)
        preds = preds.argmax(dim=1).tolist()
        preds = [idx2int[p] for p in preds]
        return preds
            
    def init_hidden(self, batch_size):
        return torch.normal(mean=0, std=1, size=(self.bidirectional * self.num_layers, batch_size, self.hidden_size)).to('cuda')
        
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=self.lr)

## Intent Multitask Training

In [6]:
# glove = IntentPosDataModule._load_glove('../../data/glove.840B.300d.pkl.gz')
intent_pos_dm = IntentPosDataModule(embedding_obj=glove, batch_size=128)
intent_pos_dm.prepare_data()
intent_pos_dm.setup('fit')
intent_labels = intent_pos_dm.intent_to_idx
tag_labels = intent_pos_dm.tag_to_idx

2021-04-10 13:30:21,533 - __main__ - INFO - Loading GloVe embeddings...
2021-04-10 13:31:02,897 - __main__ - INFO - Done!


In [29]:
multitask = True
model = IntentPosClassifier(num_intent=len(intent_labels), num_layers=2, num_tags=len(tag_labels), hidden_size=2048, loss_ratio=0.7, dropout=.25, multitask=multitask)

In [30]:
if multitask:
    logging_dir = Path('.').joinpath('intent_mt_lightning_logs')
    filename = 'intent_mt-{epoch:02d}-{training_loss:.2f}-{val_loss:.2f}',
else:
    logging_dir = Path('.').joinpath('intent_lightning_logs')
    filename = 'intent-{epoch:02d}-{training_loss:.2f}-{val_loss:.2f}',
print(str(logging_dir.resolve()))

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    filename='intent_mt-{epoch:02d}-{training_loss:.2f}-{val_loss:.2f}',
    save_top_k=3,
    mode='min',
)
trainer = pl.Trainer(
    
#     auto_lr_find=True,
    gpus=[0],
#     gradient_clip_val=1,
    weights_summary='full',
#     precision=16,
#     track_grad_norm=2,
    default_root_dir=str(logging_dir.resolve()),
    callbacks=[EarlyStopping(monitor='val_loss'), checkpoint_callback],
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


/home/dclian/adl/hw1/notebooks/intent_mt_lightning_logs


In [31]:
trainer.fit(model, datamodule=intent_pos_dm)


  | Name             | Type    | Params
---------------------------------------------
0 | rnn              | GRU     | 104 M 
1 | hidden_to_labels | Linear  | 614 K 
2 | hidden_to_tags   | Linear  | 233 K 
3 | dropout          | Dropout | 0     
---------------------------------------------
105 M     Trainable params
0         Non-trainable params
105 M     Total params
420.988   Total estimated model params size (MB)


Epoch 0:  83%|████████▎ | 469/563 [00:26<00:05, 17.43it/s, loss=1.49, v_num=23, val_loss_epoch=5.010, training_loss_step=1.560]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/94 [00:00<?, ?it/s][A
Epoch 0:  84%|████████▎ | 471/563 [00:28<00:05, 16.65it/s, loss=1.49, v_num=23, val_loss_epoch=5.010, training_loss_step=1.560]
Epoch 0:  84%|████████▍ | 475/563 [00:28<00:05, 16.72it/s, loss=1.49, v_num=23, val_loss_epoch=5.010, training_loss_step=1.560]
Epoch 0:  85%|████████▌ | 479/563 [00:28<00:05, 16.78it/s, loss=1.49, v_num=23, val_loss_epoch=5.010, training_loss_step=1.560]
Epoch 0:  86%|████████▌ | 483/563 [00:28<00:04, 16.84it/s, loss=1.49, v_num=23, val_loss_epoch=5.010, training_loss_step=1.560]
Epoch 0:  87%|████████▋ | 487/563 [00:28<00:04, 16.89it/s, loss=1.49, v_num=23, val_loss_epoch=5.010, training_loss_step=1.560]
Validating:  19%|█▉        | 18/94 [00:01<00:04, 17.52it/s][A
Epoch 0:  87%|████████▋ | 491/563 [00:29<00:04, 16.90it/s, loss=1.49, v_num=23, v

1

In [20]:
model = IntentPosClassifier.load_from_checkpoint(checkpoint_callback.best_model_path)
trainer.test(model, datamodule=intent_pos_dm)
test_preds = model.test_preds
idx_to_intent = {idx: intent for intent, idx in intent_labels.items()}
preds = model.process_logits(test_preds['logits'], idx_to_intent)
ids = test_preds['ids']

Testing: 100%|██████████| 141/141 [00:06<00:00, 22.78it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------


In [21]:
write_preds_to_csv(task='intent', ids=ids, preds=preds, fname="intent_mt_preds_v20.csv")

2021-04-10 12:43:35,691 - __main__ - INFO - Intent predictions written to ../ADL21-HW1/data/intent/intent_mt_preds_v20.csv


In [36]:
preds = preds.argmax(dim=1)

In [40]:
preds = preds.tolist()

In [46]:
idx_to_intent = {idx: intent for intent, idx in intent_labels.items()}
preds = [idx_to_intent[p] for p in preds]

In [32]:
a = torch.LongTensor([1, 2, 3])
b = torch.LongTensor([4, 5, 6, 9, 10])
c = torch.LongTensor([7])
# a = torch.ones(10, 10)
# b = torch.ones(25, 10)
# a = torch.ones(9, 10)

In [56]:
a.device

device(type='cpu')

In [228]:
d = {
    1: 'a',
    2: 'b',
    **{3: 'c', 4: 'd'}
}
d

{1: 'a', 2: 'b', 3: 'c', 4: 'd'}

In [91]:
unpacked = [list() for _ in range(len(packed.unsorted_indices))]
left = 0
right = 0
for batch in packed.batch_sizes:
    right = left + batch
    window = packed.data[left: right]        
    for idx, item in enumerate(window):
        unpacked[idx].append(item)
    left = right
unpacked = [torch.stack(u) for u in unpacked]
print(unpacked)
order = packed.unsorted_indices.tolist()
unpacked = [unpacked[idx] for idx in order]
print(unpacked)

[tensor([ 4,  5,  6,  9, 10]), tensor([1, 2, 3]), tensor([7])]
[tensor([1, 2, 3]), tensor([ 4,  5,  6,  9, 10]), tensor([7])]


In [89]:
permute = [packed.unsorted_indices.tolist().index(i) for i in range(len(packed.unsorted_indices))]
print(permute)

[1, 0, 2]


In [90]:
unpacked = [unpacked[idx] for idx in permute]
print(unpacked)

[tensor([1, 2, 3]), tensor([ 4,  5,  6,  9, 10]), tensor([7])]


In [149]:
gru = nn.GRU(300, 5, batch_first=True, bidirectional=True)

In [153]:
out, hidden = gru(packed)

In [151]:
print(out.shape)
print(hidden.shape)

torch.Size([3, 8, 10])
torch.Size([2, 3, 5])


In [212]:
torch.cat([hidden[0,...], hidden[1,...]]).shape

torch.Size([6, 5])

In [207]:
hidden.view(3, -1).shape

torch.Size([3, 10])

In [152]:
print(out[0,-1,:])
print(hidden[0,0,:])

tensor([-0.3463, -0.0763,  0.2259,  0.1647, -0.2322, -0.0119,  0.1743, -0.0364,
         0.2349, -0.0128], grad_fn=<SliceBackward>)
tensor([-0.3463, -0.0763,  0.2259,  0.1647, -0.2322], grad_fn=<SliceBackward>)


In [161]:
pad_packed_sequence(out, batch_first=True)[0].shape

torch.Size([3, 8, 10])

In [130]:
y = torch.Tensor([1, 3, 4])

In [None]:
F.cross_entropy(hidden)

In [111]:
hidden.view(3, -1)

tensor([[-0.1592,  0.1634, -0.7492, -0.6678,  0.6098],
        [ 0.0978,  0.2586, -0.1639, -0.9648,  0.6199],
        [ 0.0724,  0.3100, -0.1748, -0.9578,  0.2774]], grad_fn=<ViewBackward>)

In [87]:
glove = IntentDataset._load_glove('../../data/glove.840B.300d.gz')

2021-03-21 02:06:47,328 - __main__ - INFO - Loading GloVe embeddings...
2021-03-21 02:08:37,989 - __main__ - INFO - Done!


In [104]:
ds = IntentDataset("../ADL21-HW1/data/intent/train.json", "../data/intents_to_idx.json", glove_obj=glove )

In [100]:
l = [torch.randn(3) for _ in range(5)]

In [107]:
s = next(iter(ds))

In [58]:
torch.normal(mean=0, std=1, size=(5, 25)).dtype

torch.float32

In [99]:
glove['jump'].shape

torch.Size([300])

In [47]:
g = build_glove('../../data/glove.840B.300d.txt')

2021-03-18 00:23:09,648 - __main__ - INFO - Loading Glove embeddings...
2021-03-18 00:25:02,845 - __main__ - INFO - Glove embeddings loaded.


In [50]:
with gzip.open('../../data/glove.840B.300d.gz', 'wb') as f:
    pickle.dump(g, f)

In [17]:
ft = fasttext.load_model('../../data/crawl-300d-2M-subword.bin')



In [12]:
fasttext.util.reduce_model(ft, 100)

<fasttext.FastText._FastText at 0x7f6c50bd7430>

In [13]:
ft.save_model('../../data/crawl-100d-2M-subword.bin')

In [14]:
ft = fasttext.load_model('../../data/crawl-100d-2M-subword.bin')



In [18]:
with open('../ADL21-HW1/data/intent/test.json') as f:
    train = json.load(f)
print(len(train))

4500


# Slot Tagging

## Dataset

In [4]:
class TaggingDataset(Dataset):
    def __init__(self, data_path: str, train: bool, mapping: Dict[str, int], glove: Optional[Dict[str, np.array]] = None, glove_path: str = "../../data/glove.840B.300d.gz", unk_token_strategy='average'):
        with open(data_path) as f: 
            self.data = json.load(f)
        self.tag_to_idx = mapping
        self.train = train
        self.glove = glove
        self.unk_token_strategy = unk_token_strategy
    
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        sample = self.data[idx]
        tokens = sample['tokens']
        _id = sample['id']
        tokens = self.convert_to_vectors(tokens, _id)
        length = len(tokens)
        _id = [sample['id']] * length
        out = {
            'id': _id,
            'tokens': tokens,
            'length': length
        }
        if self.train:
            tags = sample['tags']
            tags = [self.tag_to_idx[t] for t in tags]
            out['tags'] = tags
            assert len(tags) == len(tokens)
        return out
        
    def convert_to_vectors(self, text, _id):
        vectors = []
        missing_idx = []
        
        for idx, tok in enumerate(text):
            try:
                vector = torch.from_numpy(self.glove[tok]).float()
            except KeyError:
#                 avg = torch.mean(torch.stack(vectors), axis=0)
                missing_idx.append(idx)
                vectors.append(torch.zeros(300))
#                 vectors.append(avg)
                continue
            else:
                vectors.append(vector)
                
        if len(vectors) == len(missing_idx):
            return torch.stack(vectors)
        
        if self.unk_token_strategy == 'ignore':
            return torch.stack(vectors)
        
        elif self.unk_token_strategy == 'average':
            if missing_idx:
                vectors = self._average_tokens(vectors, missing_idx)
                
        vectors = torch.stack(vectors)
        if torch.isnan(vectors).sum() > 0:
            print('NaN in embeddings!')
            print(_id)
            raise Exception
                
        return vectors
    
    @staticmethod
    def _average_tokens(vectors: list, missing_idxs: list, window: int = 2):
        for m in missing_idxs:
            avg = vectors[max(m-window, 0): m] + vectors[m + 1: m+1+window]
            if not avg:
                avg = torch.stack(vectors)
            else:
                avg = torch.stack(avg)
            if avg.sum() == 0:
                vectors[m] = torch.zeros(300)
                continue
            avg = avg[avg.nonzero(as_tuple=True)].view(-1, avg.shape[1])
            avg = torch.mean(avg[avg.nonzero(as_tuple=True)].view(-1, avg.shape[1]), axis=0)
            vectors[m] = avg
        
        return vectors
            
        
class TaggingDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = "../ADL21-HW1/data/slot/", mapping: str = "../data/tags_to_idx.json", embedding_obj: Optional[Dict[str, np.array]] = None, embedding_dir: str = "../../data/glove.840B.300d.gz", batch_size: int = 32, pin_memory: bool = True):
        super().__init__()
        self.data_dir = Path(data_dir)
        self.batch_size = batch_size
        self.pin_memory = pin_memory
        with open(mapping) as f:
            self.tag_to_idx = json.load(f)
        if embedding_obj:
            self.emb = embedding_obj
        else:
            self.emb = self._load_glove(embedding_dir)
        
    def setup(self, stage: Optional[str] = None):
        if stage == "fit" or stage is None:
            self.tag_train = TaggingDataset(
                data_path=self.data_dir.joinpath('train.json'), 
                train=True,
                mapping=self.tag_to_idx, 
                glove=self.emb
            ) 
            self.tag_val = TaggingDataset(
                data_path=self.data_dir.joinpath('eval.json'), 
                train=True,
                mapping=self.tag_to_idx, 
                glove=self.emb
            ) 
        elif stage == "test" or stage is None:
            self.tag_test = TaggingDataset(
                data_path=self.data_dir.joinpath('test.json'), 
                train=False,
                mapping=self.tag_to_idx, 
                glove=self.emb
            ) 
        
    def train_dataloader(self):
        return DataLoader(self.tag_train, batch_size=self.batch_size, num_workers=8, pin_memory=self.pin_memory, collate_fn=self._collate_fn(is_test=False), shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.tag_val, batch_size=self.batch_size, num_workers=8, pin_memory=self.pin_memory, collate_fn=self._collate_fn(is_test=False))

    def test_dataloader(self):
        return DataLoader(self.tag_test, batch_size=self.batch_size, num_workers=8, pin_memory=self.pin_memory, collate_fn=self._collate_fn(is_test=True))
        
    @staticmethod
    def _collate_fn(is_test):
        def collate_fn(batch):
            out = {}
            _id = [b['id'] for b in batch]
            tokens = [b['tokens'] for b in batch]
            length = torch.LongTensor([b['length'] for b in batch])
            assert all(l == len(t) for l, t in zip(length, tokens))
            tokens = pad_sequence(tokens, batch_first=True)
            if not is_test:
                tags = [torch.LongTensor(b['tags']) for b in batch]
                out['tags'] = pad_sequence(tags, batch_first=True, padding_value=-1)
#                 tag_lengths = [len(t) for t in tags]
#                 out['tag_lengths'] = tag_lengths

            out['id'] = _id
            out['tokens'] = tokens
            out['length'] = length
            return out
        return collate_fn
        
        
    @staticmethod
    def _load_glove(fpath: str) -> Dict[str, torch.FloatTensor]:
        logger.info("Loading GloVe embeddings...")
        with gzip.open(fpath, 'rb') as f:
            emb = pickle.load(f)
        logger.info("Done!")
        return emb

## Model

In [5]:
class TaggingClassifier(pl.LightningModule):
    def __init__(self, num_labels: int, hidden_size: int = 128, num_layers: int = 3, bidirectional: bool = True, lr: int = 1e-5, dropout=0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = 2 if bidirectional else 1
        self.lr = lr
        self.dropout_prob = dropout
        self.rnn = nn.GRU(
            input_size=300, 
            hidden_size=hidden_size, 
            num_layers=num_layers, 
            bidirectional=bidirectional, 
            batch_first=True,
            dropout=dropout,
        )
        self.hidden_to_labels = nn.Linear(self.hidden_size * self.bidirectional, num_labels)
        self.dropout = nn.Dropout(dropout)
        self.save_hyperparameters()
        self.test_preds = {
            'ids': [],
            'logits': []
        }
        self.val_preds = {
            'ids': [],
            'preds': []
        }
        
    def forward(self, inpt):
        samples = inpt['tokens'].to('cuda')
        if torch.isnan(samples).sum() > 0:
            print(f"NaN in samples!")
            raise Exception
        lengths = inpt['length'].to('cpu')
        batch_size = samples.shape[0]
        samples = pack_padded_sequence(samples, lengths, batch_first=True, enforce_sorted=False)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(samples, hidden)
        out, out_lens = pad_packed_sequence(out, batch_first=True)
#         out = out.view(-1, out.shape[-1])
        if torch.isnan(out).sum() > 0:
            print(f"NaN in out!")
            raise Exception
        logits = self.dropout(self.hidden_to_labels(out))
#         print(f'LENGTHS: {lengths}')
#         print(f'LOGITS: {logits.shape}')
        
#         assert lengths == logits.shape[1]
#         logits = logits.permute(0, 2, 1)  # must move classes to second dimension
        return logits
        
    def _shared_step(self, batch):
        ids = batch['id']
        tags = batch['tags']
        logits = self(batch)
        logits = logits.permute(0, 2, 1)
#         logger.info(f"TAGS: {tags.shape}")
#         logger.info(f"LOGITS: {logits.shape}")
#         logits = logits.reshape(-1, logits.shape[-1])
#         tags = tags.view(-1)
#         logger.info(f"TAGS: {tags.shape}")
#         logger.info(f"LOGITS: {logits.shape}")
        loss = F.cross_entropy(logits, tags, ignore_index=-1)
#         loss = F.nll_loss(F.log_softmax(logits, dim=1), tags, ignore_index=-1)
        return loss
        
    def training_step(self, batch, batch_idx):
        loss = self._shared_step(batch)
        self.log('training_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss = self._shared_step(batch)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
        
    def test_step(self, batch, batch_idx):
        ids = batch['id']
        logits = self(batch)
        self.test_preds['ids'].extend(ids)
        self.test_preds['logits'].extend(logits)
        
    def process_logits(self, logits, idx2int):
        preds = torch.stack(logits)
        preds = preds.argmax(dim=1).tolist()
        preds = [idx2int[p] for p in preds]
        return preds
            
    def init_hidden(self, batch_size):
#         return torch.normal(mean=0, std=1, size=(self.bidirectional * self.num_layers, batch_size, self.hidden_size)).to('cpu')
        return torch.normal(mean=0, std=1, size=(self.bidirectional * self.num_layers, batch_size, self.hidden_size)).to('cuda')
        
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=self.lr)

In [135]:
# build_mapping('../ADL21-HW1/data/slot/train.json', 'tags', '../data/tags_to_idx.json')

In [6]:
# glove = TaggingDataModule._load_glove('../../data/glove.840B.300d.gz')

2021-04-05 22:01:54,557 - __main__ - INFO - Loading GloVe embeddings...
2021-04-05 22:02:22,630 - __main__ - INFO - Done!


In [29]:
tag_dm = TaggingDataModule(embedding_obj=glove, batch_size=64)
tag_dm.prepare_data()
tag_dm.setup(stage='fit')
labels = tag_dm.tag_to_idx
model = TaggingClassifier(num_labels=len(labels), lr=1e-4, hidden_size=512, dropout=0, num_layers=8)

In [30]:
logging_dir = Path('.').joinpath('tagging_lightning_logs')
print(str(logging_dir.resolve()))

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
#     dirpath=logging_dir.resolve(),
    filename='tagging-{epoch:02d}-{training_loss:.2f}-{val_loss:.2f}',
    save_top_k=3,
    mode='min',
)
trainer = pl.Trainer(
    
#     auto_lr_find=True,
    gpus=1,
#     gradient_clip_val=1,
    weights_summary='full',
#     precision=16,
#     track_grad_norm=2,
    default_root_dir=str(logging_dir.resolve()),
    callbacks=[EarlyStopping(monitor='val_loss'), checkpoint_callback],
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


/home/dclian/adl/hw1/notebooks/tagging_lightning_logs


In [31]:
trainer.fit(model, tag_dm)


  | Name             | Type    | Params
---------------------------------------------
0 | rnn              | GRU     | 35.6 M
1 | hidden_to_labels | Linear  | 9.2 K 
2 | dropout          | Dropout | 0     
---------------------------------------------
35.6 M    Trainable params
0         Non-trainable params
35.6 M    Total params
142.332   Total estimated model params size (MB)


Epoch 0:  88%|████████▊ | 115/130 [00:05<00:00, 20.77it/s, loss=0.818, v_num=36, val_loss_epoch=2.200, training_loss_step=0.756]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/16 [00:00<?, ?it/s][A
Epoch 0:  91%|█████████ | 118/130 [00:06<00:00, 19.41it/s, loss=0.818, v_num=36, val_loss_epoch=2.200, training_loss_step=0.756]
Epoch 0:  95%|█████████▌| 124/130 [00:06<00:00, 20.06it/s, loss=0.818, v_num=36, val_loss_epoch=2.200, training_loss_step=0.756]
Epoch 0: 100%|██████████| 130/130 [00:06<00:00, 20.27it/s, loss=0.818, v_num=36, val_loss_epoch=0.766, training_loss_step=0.746, training_loss_epoch=0.913, val_loss_step=0.829]
Epoch 1:  88%|████████▊ | 114/130 [00:05<00:00, 20.44it/s, loss=0.504, v_num=36, val_loss_epoch=0.766, training_loss_step=0.510, training_loss_epoch=0.913, val_loss_step=0.829]
Validating: 0it [00:00, ?it/s][A
Validating:   0%|          | 0/16 [00:00<?, ?it/s][A
Epoch 1:  92%|█████████▏| 120/130 [00:06<00:00, 19.45it/s, loss=0.504, v_num=36, va

1

## Tagging Multitask Dataset

In [3]:
class TaggingPosDataset(Dataset):
    def __init__(self, data_path: str, train: bool, mapping: Dict[str, int], pos_map_path: str = "../data/pos_to_idx.json", glove: Optional[Dict[str, np.array]] = None, glove_path: str = "../../data/glove.840B.300d.gz", unk_token_strategy='average'):
        with open(data_path) as f: 
            self.data = json.load(f)
        self.tag_to_idx = mapping
        with open(pos_map_path) as f:
            self.pos_to_idx = json.load(f) 
            self.pos_to_idx['UNK'] = len(self.pos_to_idx)
        self.train = train
        self.glove = glove
        self.nlp = spacy.load('en_core_web_md')
        self.unk_token_strategy = unk_token_strategy
    
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        sample = self.data[idx]
        tokens = sample['tokens']
        tokens = [' ' if t == '' else t for t in tokens]
        _id = sample['id']
        try:
            doc = spacy.tokens.Doc(vocab=self.nlp.vocab, words=tokens)
        except Exception as e:
            print(e)
            print(f'ID: {_id}')
            print(tokens)
            raise
        tokens = self.convert_to_vectors(tokens, _id)
        length = len(tokens)
        _id = [sample['id']] * length
        out = {
            'id': _id,
            'tokens': tokens,
            'length': length
        }
        if self.train:
            UNK = self.pos_to_idx['UNK']
            tags = sample['tags']
            tags = [self.tag_to_idx[t] for t in tags]
            out['tags'] = tags
            pos = [d.tag_ for d in doc]
            out['pos'] = [self.pos_to_idx.get(p, UNK) for p in pos]
            assert len(pos) == len(tokens)
            assert len(tags) == len(tokens)
        return out
        
    def convert_to_vectors(self, text, _id):
        vectors = []
        missing_idx = []
        
        for idx, tok in enumerate(text):
            try:
                vector = torch.from_numpy(self.glove[tok]).float()
            except KeyError:
#                 avg = torch.mean(torch.stack(vectors), axis=0)
                missing_idx.append(idx)
                vectors.append(torch.zeros(300))
#                 vectors.append(avg)
                continue
            else:
                vectors.append(vector)
                
        if len(vectors) == len(missing_idx):
            return torch.stack(vectors)
        
        if self.unk_token_strategy == 'ignore':
            return torch.stack(vectors)
        
        elif self.unk_token_strategy == 'average':
            if missing_idx:
                vectors = self._average_tokens(vectors, missing_idx)
                
        vectors = torch.stack(vectors)
        if torch.isnan(vectors).sum() > 0:
            print('NaN in embeddings!')
            print(_id)
            raise Exception
                
        return vectors
    
    @staticmethod
    def _average_tokens(vectors: list, missing_idxs: list, window: int = 2):
        for m in missing_idxs:
            avg = vectors[max(m-window, 0): m] + vectors[m + 1: m+1+window]
            if not avg:
                avg = torch.stack(vectors)
            else:
                avg = torch.stack(avg)
            if avg.sum() == 0:
                vectors[m] = torch.zeros(300)
                continue
            avg = avg[avg.nonzero(as_tuple=True)].view(-1, avg.shape[1])
            avg = torch.mean(avg[avg.nonzero(as_tuple=True)].view(-1, avg.shape[1]), axis=0)
            vectors[m] = avg
        
        return vectors
            
        
class TaggingPosDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = "../ADL21-HW1/data/slot/", mapping: str = "../data/tags_to_idx.json", embedding_obj: Optional[Dict[str, np.array]] = None, embedding_dir: str = "../../data/glove.840B.300d.gz", batch_size: int = 32, pin_memory: bool = True):
        super().__init__()
        self.data_dir = Path(data_dir)
        self.batch_size = batch_size
        self.pin_memory = pin_memory
        with open(mapping) as f:
            self.tag_to_idx = json.load(f)
        if embedding_obj:
            self.emb = embedding_obj
        else:
            self.emb = self._load_glove(embedding_dir)
        
    def setup(self, stage: Optional[str] = None):
        if stage == "fit" or stage is None:
            self.tag_train = TaggingPosDataset(
                data_path=self.data_dir.joinpath('train.json'), 
                train=True,
                mapping=self.tag_to_idx, 
                glove=self.emb
            ) 
            self.tag_val = TaggingPosDataset(
                data_path=self.data_dir.joinpath('eval.json'), 
                train=True,
                mapping=self.tag_to_idx, 
                glove=self.emb
            ) 
            self.pos_to_idx = self.tag_train.pos_to_idx
        elif stage == "test" or stage is None:
            self.tag_test = TaggingPosDataset(
                data_path=self.data_dir.joinpath('test.json'), 
                train=False,
                mapping=self.tag_to_idx, 
                glove=self.emb
            ) 
        
    def train_dataloader(self):
        return DataLoader(self.tag_train, batch_size=self.batch_size, num_workers=8, pin_memory=self.pin_memory, collate_fn=self._collate_fn(is_test=False), shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.tag_val, batch_size=self.batch_size, num_workers=8, pin_memory=self.pin_memory, collate_fn=self._collate_fn(is_test=False))

    def test_dataloader(self):
        return DataLoader(self.tag_test, batch_size=self.batch_size, num_workers=8, pin_memory=self.pin_memory, collate_fn=self._collate_fn(is_test=True))
        
    @staticmethod
    def _collate_fn(is_test):
        def collate_fn(batch):
            out = {}
            _id = [b['id'] for b in batch]
            tokens = [b['tokens'] for b in batch]
            length = torch.LongTensor([b['length'] for b in batch])
            assert all(l == len(t) for l, t in zip(length, tokens))
            tokens = pad_sequence(tokens, batch_first=True)
            if not is_test:
                tags = [torch.LongTensor(b['tags']) for b in batch]
                pos = [torch.LongTensor(b['pos']) for b in batch]
                out['tags'] = pad_sequence(tags, batch_first=True, padding_value=-1)
                out['pos'] = pad_sequence(pos, batch_first=True, padding_value=-1)

            out['id'] = _id
            out['tokens'] = tokens
            out['length'] = length
            return out
        return collate_fn
        
    @staticmethod
    def _load_glove(fpath: str) -> Dict[str, torch.FloatTensor]:
        logger.info("Loading GloVe embeddings...")
        with gzip.open(fpath, 'rb') as f:
            emb = pickle.load(f)
        logger.info("Done!")
        return emb

## Tagging Multitask

In [4]:
class TaggingPosClassifier(pl.LightningModule):
    def __init__(self, num_labels: int, num_pos: int, hidden_size: int = 128, num_layers: int = 3, bidirectional: bool = True, lr: int = 1e-4, dropout=0, loss_ratio=0.7, multitask=True):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.multitask = multitask
        self.loss_ratio = loss_ratio if multitask else 0.0
        self.bidirectional = 2 if bidirectional else 1
        self.lr = lr
        self.dropout_prob = dropout
        self.rnn = nn.GRU(
            input_size=300, 
            hidden_size=hidden_size, 
            num_layers=num_layers, 
            bidirectional=bidirectional, 
            batch_first=True,
            dropout=dropout,
        )
        self.hidden_to_labels = nn.Linear(self.hidden_size * self.bidirectional, num_labels)
        self.hidden_to_pos = nn.Linear(self.hidden_size * self.bidirectional, num_pos)
        self.dropout = nn.Dropout(dropout)
        self.save_hyperparameters()
        self.test_preds = {
            'ids': [],
            'logits': [],
            'lengths': [],
        }
        self.val_preds = {
            'ids': [],
            'preds': [],
            'lengths': [],
        }
        
    def forward(self, inpt):
#         samples = inpt['tokens'].to('cuda')
        samples = inpt['tokens']
        print(samples.shape)
        pos = inpt.get('pos')
        lengths = inpt['length'].to('cpu')
        batch_size = samples.shape[0]
        
        samples = pack_padded_sequence(samples, lengths, batch_first=True, enforce_sorted=False)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(samples, hidden)
        out, out_lens = pad_packed_sequence(out, batch_first=True)
        out = self.dropout(out)
#         out = out.view(-1, out.shape[-1])
        if torch.isnan(out).sum() > 0:
            print(f"NaN in out!")
            raise Exception
        tag_logits = self.hidden_to_labels(out)
        if pos is not None:
            pos_logits = self.hidden_to_pos(out)
            return tag_logits, pos_logits
        return tag_logits
        
#         print(f'LENGTHS: {lengths}')
#         print(f'LOGITS: {logits.shape}')
        
#         assert lengths == logits.shape[1]
#         logits = logits.permute(0, 2, 1)  
        
    def _shared_step(self, batch):
        ids = batch['id']
        tags = batch['tags']
        pos = batch['pos']
        tag_logits, pos_logits = self(batch)
        tag_logits = tag_logits.permute(0, 2, 1)  # must move classes to second dimension
        pos_logits = pos_logits.permute(0, 2, 1)
#         logger.info(f'TAG shapes: {tags.shape} / {tag_logits.shape}')
#         logger.info(f'POS shapes: {pos.shape} / {pos_logits.shape}')
        tag_loss = F.cross_entropy(tag_logits, tags, ignore_index=-1)
        pos_loss = F.cross_entropy(pos_logits, pos, ignore_index=-1)
#         logger.info(f"TAGS: {tags.shape}")
#         logger.info(f"LOGITS: {logits.shape}")
#         logits = logits.reshape(-1, logits.shape[-1])
#         tags = tags.view(-1)
#         logger.info(f"TAGS: {tags.shape}")
#         logger.info(f"LOGITS: {logits.shape}")
#         loss = F.nll_loss(F.log_softmax(logits, dim=1), tags, ignore_index=-1)
        return tag_loss, pos_loss
        
    def training_step(self, batch, batch_idx):
        tag_loss, pos_loss = self._shared_step(batch)
        if self.multitask:
            tag_weight = self.loss_ratio
            pos_weight = 1.0 - self.loss_ratio
            loss = ((tag_loss * tag_weight) + (pos_loss * pos_weight))
        else:
            loss = tag_loss
        self.log('training_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        tag_loss, pos_loss = self._shared_step(batch)
        loss = tag_loss
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
        
    def test_step(self, batch, batch_idx):
        ids = batch['id']
        logits = self(batch)
        lengths = batch['length']
        self.test_preds['ids'].extend(ids)
        self.test_preds['logits'].extend(logits)
        self.test_preds['lengths'].extend(lengths)
        
    def process_logits(self, logits, idx2int):
        preds = torch.stack(logits)
        preds = preds.argmax(dim=1).tolist()
        preds = [idx2int[p] for p in preds]
        return preds
            
    def init_hidden(self, batch_size):
#         return torch.normal(mean=0, std=1, size=(self.bidirectional * self.num_layers, batch_size, self.hidden_size)).to('cpu')
        return torch.normal(mean=0, std=1, size=(self.bidirectional * self.num_layers, batch_size, self.hidden_size)).to('cuda')
        
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=self.lr)

In [6]:
glove = TaggingPosDataModule._load_glove('../../data/glove.840B.300d.gz')

2021-04-14 21:51:29,147 - __main__ - INFO - Loading GloVe embeddings...
2021-04-14 21:52:20,476 - __main__ - INFO - Done!


In [7]:
tag_pos_dm = TaggingPosDataModule(embedding_obj=glove)
tag_pos_dm.prepare_data()
tag_pos_dm.setup('fit')
tag_labels = tag_pos_dm.tag_to_idx
pos_labels = tag_pos_dm.pos_to_idx

In [8]:
multitask = True
model = TaggingPosClassifier(num_labels=len(tag_labels), num_pos=len(pos_labels), lr=1e-4, hidden_size=1024, dropout=0, num_layers=2, loss_ratio=0.75, multitask=multitask)

In [None]:
if multitask:
    logging_dir = Path('.').joinpath('tagging_mt_lightning_logs')
    filename = 'tagging_mt-{epoch:02d}-{training_loss_epoch:.2f}-{val_loss_epoch:.2f}'
else:
    logging_dir = Path('.').joinpath('tagging_lightning_logs')
    filename = 'tagging-{epoch:02d}-{training_loss_epoch:.2f}-{val_loss_epoch:.2f}'
print(str(logging_dir.resolve()))

checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    filename=filename,
    save_top_k=3,
    mode='min',
)
trainer = pl.Trainer(
    
#     auto_lr_find=True,
    gpus=None,
#     gradient_clip_val=1,
    weights_summary='full',
#     precision=16,
#     track_grad_norm=2,
    default_root_dir=str(logging_dir.resolve()),
#     checkpoint_callback=False
    callbacks=[EarlyStopping(monitor='val_loss_epoch'), checkpoint_callback],
)

In [None]:
trainer.fit(model, datamodule=tag_pos_dm)

# Testing

'/home/dclian/adl/hw1/notebooks/tagging_mt_lightning_logs/lightning_logs/version_0/checkpoints/tagging_mt-epoch=13-training_loss=0.03-val_loss=0.09.ckpt'

In [241]:
# best_tag_model = TaggingClassifier.load_from_checkpoint('./tagging_lightning_logs/lightning_logs/version_19/checkpoints/epoch=38-step=4445.ckpt')
print(checkpoint_callback.best_model_path)
best_tag_model = TaggingPosClassifier.load_from_checkpoint(checkpoint_callback.best_model_path)
# best_tag_model = TaggingClassifier.load_from_checkpoint('./tagging_lightning_logs/lightning_logs/version_23/checkpoints/tagging-epoch=26-training_loss=0.17-val_loss=0.29.ckpt')
best_tag_model.freeze()
best_tag_model.to('cuda')

# tag_dm = TaggingDataModule(embedding_obj=glove, batch_size=64)
# trainer = pl.Trainer(
    
# #     auto_lr_find=True,
#     gpus=1,
# #     gradient_clip_val=1,
#     weights_summary='full',
# #     track_grad_norm=2,
# #     default_root_dir=str(logging_dir.resolve()),
# #     callbacks=[EarlyStopping(monitor='val_loss')],
#     checkpoint_callback=False
    
# )
# trainer.test(best_tag_model, datamodule=tag_dm)
# test_preds = best_tag_model.test_preds

/home/dclian/adl/hw1/notebooks/tagging_mt_lightning_logs/lightning_logs/version_6/checkpoints/tagging_mt-epoch=28-training_loss_epoch=0.05-val_loss_epoch=0.11.ckpt


TaggingPosClassifier(
  (rnn): GRU(300, 1024, num_layers=2, batch_first=True, bidirectional=True)
  (hidden_to_labels): Linear(in_features=2048, out_features=9, bias=True)
  (hidden_to_pos): Linear(in_features=2048, out_features=58, bias=True)
  (dropout): Dropout(p=0, inplace=False)
)

In [285]:
tag_model = TaggingPosClassifier.load_from_checkpoint('./tagging_mt_lightning_logs/lightning_logs/version_6/checkpoints/tagging_mt-epoch=28-training_loss_epoch=0.05-val_loss_epoch=0.11.ckpt')
tag_model.eval()
tag_model.to('cuda')

TaggingPosClassifier(
  (rnn): GRU(300, 1024, num_layers=2, batch_first=True, bidirectional=True)
  (hidden_to_labels): Linear(in_features=2048, out_features=9, bias=True)
  (hidden_to_pos): Linear(in_features=2048, out_features=58, bias=True)
  (dropout): Dropout(p=0, inplace=False)
)

In [286]:
idx2label = {idx: label for label, idx in tag_labels.items()}
tag_pos_dm = TaggingPosDataModule(embedding_obj=glove, batch_size=32)
tag_pos_dm.prepare_data()
tag_pos_dm.setup(stage='fit')
val_dl = tag_pos_dm.val_dataloader()
labels = tag_pos_dm.tag_to_idx
preds = []
for batch in val_dl:
    for key, val in batch.items():
        if isinstance(val, torch.Tensor):
            batch[key] = val.to('cuda')
    logits = tag_model(batch)
    batch['logits'] = logits
    preds.append(batch)

torch.Size([32, 18, 300])
torch.Size([32, 15, 300])
torch.Size([32, 24, 300])
torch.Size([32, 22, 300])
torch.Size([32, 18, 300])
torch.Size([32, 17, 300])
torch.Size([32, 16, 300])
torch.Size([32, 33, 300])
torch.Size([32, 22, 300])
torch.Size([32, 17, 300])
torch.Size([32, 33, 300])
torch.Size([32, 22, 300])
torch.Size([32, 19, 300])
torch.Size([32, 16, 300])
torch.Size([32, 21, 300])
torch.Size([32, 13, 300])
torch.Size([32, 22, 300])
torch.Size([32, 18, 300])
torch.Size([32, 16, 300])
torch.Size([32, 27, 300])
torch.Size([32, 26, 300])
torch.Size([32, 28, 300])
torch.Size([32, 24, 300])
torch.Size([32, 30, 300])
torch.Size([32, 13, 300])
torch.Size([32, 33, 300])
torch.Size([32, 26, 300])
torch.Size([32, 18, 300])
torch.Size([32, 16, 300])
torch.Size([32, 21, 300])
torch.Size([32, 19, 300])
torch.Size([8, 13, 300])


In [294]:
print(len(preds))

32


In [None]:
processed = []
for batch in preds:
    length = batch['length']
#     tags = batch['tags']
    _id = batch['id']
    logits = batch['logits'][0]  # 0 index is tags, 1 is PoS
    tags = batch['tags']
    num_samples = len(length)
    
    for sample in range(num_samples):
        s_length = length[sample]
#         s_tags = tags[sample][:s_length]
        s_id = _id[sample][0]
        s_logits = logits[sample][:s_length]
        print(s_logits.shape)
        softmaxed = F.log_softmax(s_logits, dim=1).argmax(dim=1).tolist()
#         softmaxed = s_logits.argmax(dim=1).tolist()
        print(softmaxed)
        gold = [idx2label[i] for i in tags[sample][:s_length].tolist()]
        to_tags = [idx2label[i] for i in softmaxed]
        
        processed.append({
            'id': s_id,
#             'tags': s_tags,
            'length': s_length,
            'softmax': softmaxed,
            'preds': to_tags,
            'gold': gold
        })
        

In [341]:
idx = 4
preds = [p['preds'] for p in processed]
golds = [p['gold'] for p in processed]
# print(classification_report([processed[idx]['gold']], [processed[idx]['preds']], scheme=IOB2, mode='strict'))
print(classification_report(golds, preds, scheme=IOB2, mode='strict'))
# f1_score([processed[idx]['gold']], [processed[idx]['preds']])

              precision    recall  f1-score   support

        date       0.81      0.76      0.78       206
  first_name       0.94      0.93      0.94       102
   last_name       0.85      0.77      0.81        78
      people       0.72      0.69      0.70       238
        time       0.81      0.80      0.80       218

   micro avg       0.80      0.77      0.79       842
   macro avg       0.82      0.79      0.81       842
weighted avg       0.80      0.77      0.79       842



In [337]:
processed[4]['preds']

['O', 'O', 'B-date', 'I-date', 'I-date']

In [334]:
y_true = [['O', 'O', 'O']]
y_pred = [['O', 'O', 'O']]
print(classification_report(y_true, y_pred))

ValueError: max() arg is an empty sequence

In [213]:
# predictions = best_tag_model.process_logits(test_preds['logits'], idx2label)

In [304]:
tag_pos_dm = TaggingPosDataModule(embedding_obj=glove, batch_size=64)
tag_pos_dm.prepare_data()
# tag_dm.setup(stage='fit')
# val_dl = tag_dm.val_dataloader()
tag_pos_dm.setup(stage='test')
test_dl = tag_pos_dm.test_dataloader()
labels = tag_pos_dm.tag_to_idx
test_preds = []
for batch in test_dl:
    for key, val in batch.items():
        if isinstance(val, torch.Tensor):
            batch[key] = val.to('cuda')
    logits = best_tag_model(batch)
    batch['logits'] = logits
    test_preds.append(batch)

In [282]:
preds[0]

{'id': [['test-0',
   'test-0',
   'test-0',
   'test-0',
   'test-0',
   'test-0',
   'test-0',
   'test-0'],
  ['test-1',
   'test-1',
   'test-1',
   'test-1',
   'test-1',
   'test-1',
   'test-1',
   'test-1',
   'test-1',
   'test-1',
   'test-1',
   'test-1'],
  ['test-2', 'test-2', 'test-2', 'test-2', 'test-2', 'test-2'],
  ['test-3', 'test-3', 'test-3', 'test-3', 'test-3'],
  ['test-4', 'test-4', 'test-4', 'test-4'],
  ['test-5',
   'test-5',
   'test-5',
   'test-5',
   'test-5',
   'test-5',
   'test-5',
   'test-5',
   'test-5'],
  ['test-6', 'test-6', 'test-6', 'test-6'],
  ['test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7',
   'test-7'],
  ['test-8', 'test-8', 'test-8', 'test-8', 'test-8', 'test-8', 'test-8'],
  ['test-9',
   'test-9',
   'test-9',
   'test-9',
   'test-9',
   'test-9',
   'test-9',
   'test-9'],
  ['test-10',
   'test-10',
   't

In [163]:
batch.keys()

dict_keys(['id', 'tokens', 'length', 'logits'])

In [310]:
processed_test = []
for batch in test_preds:
    length = batch['length']
#     tags = batch['tags']
    _id = batch['id']
    logits = batch['logits']
    print(logits.shape)
    num_samples = len(length)
    print(num_samples)
    
    for sample in range(num_samples):
        s_length = length[sample]
#         s_tags = tags[sample][:s_length]
        s_id = _id[sample][0]
        s_logits = logits[sample][:s_length]
        print(s_logits.shape)
        softmaxed = F.log_softmax(s_logits, dim=1).argmax(dim=1).tolist()
#         softmaxed = s_logits.argmax(dim=1).tolist()
        to_tags = [idx2label[i] for i in softmaxed]       
        
        processed_test.append({
            'id': s_id,
#             'tags': s_tags,
            'length': s_length,
            'softmax': softmaxed,
            'preds': to_tags
        })
        

torch.Size([64, 23, 9])
64
torch.Size([8, 9])
torch.Size([12, 9])
torch.Size([6, 9])
torch.Size([5, 9])
torch.Size([4, 9])
torch.Size([9, 9])
torch.Size([4, 9])
torch.Size([15, 9])
torch.Size([7, 9])
torch.Size([8, 9])
torch.Size([12, 9])
torch.Size([10, 9])
torch.Size([4, 9])
torch.Size([4, 9])
torch.Size([21, 9])
torch.Size([2, 9])
torch.Size([3, 9])
torch.Size([2, 9])
torch.Size([22, 9])
torch.Size([11, 9])
torch.Size([4, 9])
torch.Size([8, 9])
torch.Size([11, 9])
torch.Size([8, 9])
torch.Size([8, 9])
torch.Size([6, 9])
torch.Size([1, 9])
torch.Size([6, 9])
torch.Size([5, 9])
torch.Size([12, 9])
torch.Size([8, 9])
torch.Size([7, 9])
torch.Size([22, 9])
torch.Size([6, 9])
torch.Size([4, 9])
torch.Size([3, 9])
torch.Size([21, 9])
torch.Size([5, 9])
torch.Size([11, 9])
torch.Size([17, 9])
torch.Size([7, 9])
torch.Size([9, 9])
torch.Size([11, 9])
torch.Size([6, 9])
torch.Size([4, 9])
torch.Size([18, 9])
torch.Size([9, 9])
torch.Size([1, 9])
torch.Size([14, 9])
torch.Size([1, 9])
torch.S

In [227]:
VERSION_NO = 'version_6'
if multitask:
    with open(f'../preds/slot_mt_preds_{VERSION_NO}.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'tags'])
        for p in processed:
            _id = p['id']
            tags = " ".join(p['preds'])
            writer.writerow([_id, tags])
else:
    with open(f'../preds/slot_preds_{VERSION_NO}.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'tags'])
        for p in processed:
            _id = p['id']
            tags = " ".join(p['preds'])
            writer.writerow([_id, tags])

In [217]:
processed[20]

{'id': 'test-20',
 'length': tensor(4, device='cuda:0'),
 'softmax': [4, 6, 1, 1],
 'preds': ['O', 'B-date', 'I-date', 'I-date']}

In [242]:
trainer.test(model, datamodule=tag_pos_dm)

Testing: 100%|██████████| 59/59 [00:02<00:00, 27.92it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{}
--------------------------------------------------------------------------------


[{}]

In [245]:
model.test_preds['logits'][0].shape

torch.Size([23, 9])

In [31]:
with open('../ADL21-HW1/data/slot/eval.json') as f:
    ev = json.load(f)

In [32]:
ev[0]

{'tokens': ['i', 'prefer', 'a', 'table', 'outdoors'],
 'tags': ['O', 'O', 'O', 'O', 'O'],
 'id': 'eval-0'}

In [305]:
write_preds_to_csv(task='tagging', ids=list(chain.from_iterable(test_preds['ids'])), preds=predictions, fpath='../ADL21-HW1/data/slot/slot_preds.csv')

2021-04-01 12:56:35,385 - __main__ - INFO - Intent predictions written to ../ADL21-HW1/data/slot/slot_preds.csv


{'I-people': 0,
 'I-date': 1,
 'B-last_name': 2,
 'B-time': 3,
 'O': 4,
 'B-people': 5,
 'B-date': 6,
 'B-first_name': 7,
 'I-time': 8}

In [307]:
with open('../ADL21-HW1/data/slot/test.json') as f:
    test = json.load(f)
with open('../ADL21-HW1/data/slot/slot_preds.csv') as f:
    reader = csv.reader(f)
    header = next(reader)
    preds = [row[1] for row in reader]

In [308]:
idxs = []
for idx, (pred, t) in enumerate(zip(preds, test)):
    if len(t['tokens']) != len(pred.split()):
        idxs.append(idx)

In [309]:
idxs

[]

In [310]:
i = 1
print(test[i]['tokens'], len(test[i]['tokens']))
print(preds[i], len(preds[i].split()))

['can', 'i', 'book', 'a', 'outside', 'table', 'for', '3', 'days', 'for', '11:30', 'am'] 12
B-first_name B-first_name B-first_name B-first_name B-first_name B-first_name B-first_name B-first_name B-first_name B-first_name B-first_name B-first_name 12


In [297]:
for i in range(10):
    print(test[i]['tokens'])
    print(preds[i])

['i', 'want', 'it', 'for', '4', 'people', 'at', '10:30am']
O O O O B-people I-people O O
['can', 'i', 'book', 'a', 'outside', 'table', 'for', '3', 'days', 'for', '11:30', 'am']
O O O O O O O O O O O O
['i', 'require', 'a', 'table', 'for', '13']
O O O O O O
['do', 'you', 'have', 'disabled', 'access']
O O O O B-time
['last_name', 'first_name', 'joseph', 'vitello']
I-time O B-time O
['people', 'how', 'about', '2', 'of', 'us', 'and', 'seated', 'together']
O O O O O O O O O
['people', 'for', 'three', 'people']
O O O O
['i', 'am', 'calling', 'to', 'make', 'a', 'cancellation', 'i', 'booked', 'it', 'under', 'the', 'name', 'junko', 'takeshita']
O O O B-time O O O O O O O O O O O
['i', 'am', 'booked', 'for', 'thursday', 'aug', '30th']
O O O O O O O
['i', 'need', 'seating', 'on', 'sun', '19', 'aug', '2018']
O O O O O O O O


In [12]:
s1 = "I like cats."
s2 = "What's for dinner tonight?"
s3 = "What have you been up to lately?"

In [13]:
nlp = English()
tokenizer = nlp.tokenizer
tokens = []
lengths = []
# for s in nlp.pipe([s1, s2, s3], disable=['tagger', 'parser', 'ner', 'textcat']):
for s in [s1, s2, s3]:
    toks = list(tokenizer(s))
    print(toks, len(toks))
    toks = [torch.from_numpy(glove.get(t.text)).float() for t in toks]
    lengths.append(len(toks))
    tokens.append(torch.stack(toks))

[I, like, cats, .] 4
[What, 's, for, dinner, tonight, ?] 6
[What, have, you, been, up, to, lately, ?] 8


In [14]:
tokens[0].shape

torch.Size([4, 300])

In [15]:
padded = pad_sequence(tokens, batch_first=True)
padded.shape

torch.Size([3, 8, 300])

In [16]:
packed = pack_padded_sequence(padded, batch_first=True, lengths=lengths, enforce_sorted=False)
packed

PackedSequence(data=tensor([[-0.2063,  0.3672, -0.0719,  ...,  0.1427,  0.5006,  0.0380],
        [-0.2063,  0.3672, -0.0719,  ...,  0.1427,  0.5006,  0.0380],
        [ 0.1941,  0.2260, -0.4376,  ...,  0.0920,  0.3863,  0.1174],
        ...,
        [-0.0869,  0.1916,  0.1091,  ..., -0.0152,  0.1111,  0.2065],
        [-0.1847, -0.0507, -0.2266,  ..., -0.4082, -0.2069, -0.1493],
        [-0.0869,  0.1916,  0.1091,  ..., -0.0152,  0.1111,  0.2065]]), batch_sizes=tensor([3, 3, 3, 3, 2, 2, 1, 1]), sorted_indices=tensor([2, 1, 0]), unsorted_indices=tensor([2, 1, 0]))

In [21]:
gru = nn.GRU(300, 20, batch_first=True, num_layers=3, bidirectional=True)
to_labels = nn.Linear(40, 5)
v = torch.randn(3, 10, 5)
y = torch.randint(high=5, size=(3, 10))
out, hidden = gru(packed)
print(hidden.shape)
out, out_lens = pad_packed_sequence(out, batch_first=True)
print(out.shape)
out = to_labels(out)
out = out.permute(0, 2, 1)
print(out.shape)
print(y.shape)

torch.Size([6, 3, 20])
torch.Size([3, 8, 40])
torch.Size([3, 5, 8])
torch.Size([3, 10])


In [126]:
repad = pad_packed_sequence(out, batch_first=True)

In [130]:
repad[0][0,:4,:]

tensor([[ 1.5605e-01, -2.6099e-01, -3.0838e-02,  1.2341e-01, -4.4563e-02,
          3.2513e-02, -6.6367e-02, -4.4984e-02, -1.7640e-01,  1.4038e-01,
          1.2488e-01, -1.9052e-02,  9.4344e-02,  8.7463e-02,  6.5304e-02,
         -6.2826e-02,  5.3467e-02,  2.0741e-01,  5.1490e-02,  6.2634e-02,
         -2.4891e-01,  7.0459e-02,  1.0758e-01,  1.4629e-01, -5.3113e-02,
          1.9575e-03, -4.0004e-01, -7.4937e-02, -1.1238e-01, -8.9180e-02,
         -2.9598e-01, -1.8928e-01, -2.0572e-01, -1.9954e-01, -1.1982e-01,
         -2.6132e-01,  2.0621e-01,  3.0803e-01, -9.3074e-02, -9.0306e-02],
        [ 2.2690e-01, -3.9751e-01, -2.5597e-02,  1.4637e-01, -1.6102e-02,
          3.1110e-02, -9.2124e-02, -1.1040e-01, -2.9708e-01,  2.5622e-01,
          1.6872e-01,  2.1500e-02,  2.1497e-01,  1.7024e-01,  1.6807e-01,
         -6.2749e-02,  7.9430e-02,  3.6981e-01,  1.8130e-02,  7.4128e-02,
         -1.6896e-01,  6.5061e-02,  1.6951e-01,  1.5283e-01, -2.8648e-02,
         -3.7681e-02, -3.2653e-01, -2

In [108]:
window = 2
missing = 0
l = [1, 2, 3, 4, 5, 6, 7]
print(f"MISSING: {l[missing]}")
l[max(missing-window, 0):missing] + l[missing + 1: missing+1+window]

MISSING: 1


[2, 3]

In [134]:
l = torch.stack([torch.rand(10) if np.random.choice([1, 0]) else torch.zeros(10) for i in range(5)])
l.shape

torch.Size([5, 10])

In [135]:
l[l.nonzero(as_tuple=True)].view(-1, l.shape[1]).shape

torch.Size([1, 10])

In [31]:
torch.mean(torch.rand(2, 10), dim=0).shape

torch.Size([10])

In [139]:
l[1:3] + l[3: 5]

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.0579, 0.1714, 0.8727, 0.9940, 0.8213, 0.2101, 0.7483, 0.0440, 0.3493,
         0.7820]])

In [39]:
def add_one(l):
    l.append(1)
    return l

In [40]:
add_one([1, 2, 3])

[1, 2, 3, 1]

In [32]:
y = torch.FloatTensor([
    [10, 100, 1, 3],
    [100, 5, 101, 4],
    [1, 2, 3, 100],
    [10, 12, 100, 1],
    [100, 1, 2, 3],
    [200, 300, 1, 500]
])
y = y.unsqueeze(0)
y = y.repeat(2, 1, 1)

In [33]:
y.shape

torch.Size([2, 6, 4])

In [34]:
y_hat = torch.LongTensor([
    1, 2, 3, 2, 0, -1
])
y_hat = y_hat.unsqueeze(0)
y_hat = y_hat.repeat(2, 1)

In [35]:
F.cross_entropy(y.view(-1, 4), y_hat.view(-1), ignore_index=-1)

tensor(0.0627)

In [139]:
y.view(-1, 4).shape

torch.Size([6, 4])

In [142]:
y.repeat(2, 1, 1).shape

torch.Size([2, 6, 4])

In [146]:
y_hat

tensor([[ 1,  2,  3,  2,  0, -1],
        [ 1,  2,  3,  2,  0, -1]])

In [45]:
if any().repeat(1):
    print('true')

true


In [104]:
torch.isnan(torch.tensor([1, float('nan'), 2])).sum() 

tensor(1)

In [315]:
A = np.array([
    [2, 1],
    [0, 3]
])
M = np.array([
    [3, 4],
    [2, 0]
])
M_inv = np.linalg.inv(M)

In [314]:
np.linalg.eigvals(A)

array([2., 3.])

In [321]:
np.linalg.eigvals(M_inv @ A @ M)

array([2., 3.])

In [322]:
M_inv @ A @ M

array([[ 3.  ,  0.  ],
       [-0.25,  2.  ]])

In [120]:
with open('../ADL21-HW1/data/slot/train.json') as f:
    train = json.load(f)

In [135]:
tokens = train[5448]