In [1]:
import gzip
import logging
import json
from pathlib import Path
from typing import Dict, Optional

import spacy
import pickle
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torch
from torch.optim import Adam
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
torch.cuda.is_available()

True

In [3]:
torch.manual_seed(420)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

In [4]:
def build_glove(fpath: str) -> Dict[str, torch.FloatTensor]:
    logger.info("Loading Glove embeddings...")
    glove = {}
    with open(fpath) as f:
        for line in f:
            values = line.split()
            word = ''.join(values[:-300])
            vector = torch.FloatTensor([float(v) for v in values[-300:]])
            glove[word] = vector
            
    logger.info("Glove embeddings loaded.")
    return glove

def build_intent_mappings(fpath: str, save=False):
    with open(fpath) as f:
        data = json.load(f)
    
    intents = list(set([i["intent"] for i in data]))
    intents_to_idx = {intent: idx for idx, intent in enumerate(intents)}
    
    if save:
        with open('../data/intents_to_idx.json', 'w') as f:
            json.dump(intents_to_idx, f, ensure_ascii=False, indent=4)
    
    return intents_to_idx

In [68]:
# build_intent_mappings('../ADL21-HW1/data/intent/train.json', save=True)

{'no': 0,
 'schedule_meeting': 1,
 'goodbye': 2,
 'interest_rate': 3,
 'report_lost_card': 4,
 'rewards_balance': 5,
 'what_can_i_ask_you': 6,
 'plug_type': 7,
 'credit_score': 8,
 'uber': 9,
 'min_payment': 10,
 'rollover_401k': 11,
 'damaged_card': 12,
 'share_location': 13,
 'book_flight': 14,
 'insurance': 15,
 'how_busy': 16,
 'pay_bill': 17,
 'calendar': 18,
 'direct_deposit': 19,
 'carry_on': 20,
 'income': 21,
 'jump_start': 22,
 'calories': 23,
 'measurement_conversion': 24,
 'cancel_reservation': 25,
 'confirm_reservation': 26,
 'tire_pressure': 27,
 'traffic': 28,
 'meeting_schedule': 29,
 'pto_used': 30,
 'smart_home': 31,
 'lost_luggage': 32,
 'weather': 33,
 'time': 34,
 'pto_request_status': 35,
 'timezone': 36,
 'calendar_update': 37,
 'thank_you': 38,
 'whisper_mode': 39,
 'mpg': 40,
 'shopping_list': 41,
 'balance': 42,
 'user_name': 43,
 'bill_due': 44,
 'do_you_have_pets': 45,
 'what_are_your_hobbies': 46,
 'food_last': 47,
 'who_made_you': 48,
 'change_accent': 49,

# Datasets

In [12]:
class IntentDataset(Dataset):
    def __init__(self, data_path: str, train: bool, intent_mapping: Dict[str, int], glove: Optional[Dict[str, torch.Float]] = None, glove_path: str = "../../data/glove.840B.300d.gz", unk_token_strategy='ignore'):
        with open(data_path) as f: 
            self.data = json.load(f)
        self.intent_to_idx = intent_mapping
        self.train = train
        self.glove = glove
        self.unk_token_strategy = unk_token_strategy
    
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, idx):
        sample = self.data[idx]
        _id = sample['id']
        text = sample['text']
        text = self.convert_to_vectors(text)
        out = {
            'id': _id,
            'text': text
        }
        if self.train:
            intent = sample['intent']
            intent = self.intent_to_idx[intent]
            out['intent'] = intent
        return out
        
    def convert_to_vectors(self, text):
        vectors = []
        if self.unk_token_strategy == 'ignore':
            for idx, tok in enumerate(text.split()):
                try:
                    vector = self.glove[tok]
                except KeyError:
                    continue
                else:
                    vectors.append(vector)
        return torch.stack(vectors)
            
        
class IntentDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = "../ADL21-HW1/data/intent", intent_mapping: str = "../data/intents_to_idx.json", embedding_obj=None, embedding_dir: str = "../../data/glove.840B.300d.gz", batch_size: int = 32):
        super().__init__()
        self.data_dir = Path(data_dir)
        self.batch_size = batch_size
        with open(intent_mapping) as f:
            self.intent_to_idx = json.load(f)
        if embedding_obj:
            self.emb = embedding_obj
        else:
            self.emb = self._load_glove(embedding_dir)
        
    def setup(self, stage: Optional[str] = None):
        if stage == "fit" or stage is None:
            self.intent_train = IntentDataset(
                data_path=self.data_dir.joinpath('train.json'), 
                train=True,
                intent_mapping=self.intent_to_idx, 
                glove=self.emb
            ) 
            self.intent_val = IntentDataset(
                data_path=self.data_dir.joinpath('eval.json'), 
                train=True,
                intent_mapping=self.intent_to_idx, 
                glove=self.emb
            ) 
        elif stage == "test" or stage is None:
            self.intent_test = IntentDataset(
                data_path=self.data_dir.joinpath('test.json'), 
                train=False,
                intent_mapping=self.intent_to_idx, 
                glove=self.emb
            ) 
        
    def train_dataloader(self):
        return DataLoader(self.intent_train, batch_size=self.batch_size, num_workers=8, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.intent_val, batch_size=self.batch_size, num_workers=8, pin_memory=True)

    def test_dataloader(self):
        return DataLoader(self.intent_test, batch_size=self.batch_size, num_workers=8, pin_memory=True)
        
    @staticmethod
    def _load_glove(fpath: str) -> Dict[str, torch.FloatTensor]:
        logger.info("Loading GloVe embeddings...")
        with gzip.open(fpath, 'rb') as f:
            emb = pickle.load(f)
        logger.info("Done!")
        return emb
        

# Model

In [6]:
class IntentClassifier(pl.LightningModule):
    def __init__(self, num_labels, hidden_size=128, num_layers=3, bidirectional=True):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = 2 if bidirectional else 1
        self.lr = None
        self.rnn = nn.GRU(
            input_size=300, 
            hidden_size=hidden_size, 
            num_layers=num_layers, 
            bidirectional=bidirectional, 
            batch_first=True
        )
        self.hidden_to_labels = nn.Linear(hidden_size * num_layers * bidirectional, num_labels)
        self.save_hyperparameters()
        
    def forward(self, input):
        batch_size = input.shape[0]
        hidden = torch.normal(mean=0, std=1, size=(batch_size, self.bidirectional * num_layers, self.hidden_size))
        out, hidden = self.rnn(input, hidden)
        logits = self.hidden_to_labels(hidden.view(batch_size, -1))
        return logits
        
    def _shared_step(self, batch):
        ids = batch['id']
        text = batch['text']
        intent = batch['intent']
        logits = self(text)
        loss = F.cross_entropy(logits, intent)
        return loss
        
    def training_step(self, batch, batch_idx):
        loss = self._shared_step(batch)
        self.log('training_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss = self._shared_step(batch)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
        
    def test_step(self, batch, batch_idx):
        ids = batch['id']
        text = batch['text']
        intent = batch['intent']
        logits = self(text)
        
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=(self.lr))

In [13]:
glove = IntentDataModule._load_glove('../../data/glove.840B.300d.gz')

2021-03-21 23:38:01,072 - __main__ - INFO - Loading GloVe embeddings...
2021-03-21 23:40:25,208 - __main__ - INFO - Done!


In [15]:
intent_dm = IntentDataModule(embedding_obj=glove)

In [8]:
intent_labels = intent_dm.intent_to_idx
print(intent_labels)

{'no': 0, 'schedule_meeting': 1, 'goodbye': 2, 'interest_rate': 3, 'report_lost_card': 4, 'rewards_balance': 5, 'what_can_i_ask_you': 6, 'plug_type': 7, 'credit_score': 8, 'uber': 9, 'min_payment': 10, 'rollover_401k': 11, 'damaged_card': 12, 'share_location': 13, 'book_flight': 14, 'insurance': 15, 'how_busy': 16, 'pay_bill': 17, 'calendar': 18, 'direct_deposit': 19, 'carry_on': 20, 'income': 21, 'jump_start': 22, 'calories': 23, 'measurement_conversion': 24, 'cancel_reservation': 25, 'confirm_reservation': 26, 'tire_pressure': 27, 'traffic': 28, 'meeting_schedule': 29, 'pto_used': 30, 'smart_home': 31, 'lost_luggage': 32, 'weather': 33, 'time': 34, 'pto_request_status': 35, 'timezone': 36, 'calendar_update': 37, 'thank_you': 38, 'whisper_mode': 39, 'mpg': 40, 'shopping_list': 41, 'balance': 42, 'user_name': 43, 'bill_due': 44, 'do_you_have_pets': 45, 'what_are_your_hobbies': 46, 'food_last': 47, 'who_made_you': 48, 'change_accent': 49, 'change_ai_name': 50, 'transfer': 51, 'taxes': 5

In [16]:
model = IntentClassifier(num_labels=len(intent_labels))

In [17]:
trainer = pl.Trainer(
    auto_lr_find=True,
    gpus=1,
    callbacks=[EarlyStopping(monitor='val_loss')])

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [18]:
trainer.fit(model, datamodule=intent_dm)

TypeError: '<=' not supported between instances of 'float' and 'NoneType'

In [87]:
glove = IntentDataset._load_glove('../../data/glove.840B.300d.gz')

2021-03-21 02:06:47,328 - __main__ - INFO - Loading GloVe embeddings...
2021-03-21 02:08:37,989 - __main__ - INFO - Done!


In [104]:
ds = IntentDataset("../ADL21-HW1/data/intent/train.json", "../data/intents_to_idx.json", glove_obj=glove )

In [100]:
l = [torch.randn(3) for _ in range(5)]

In [107]:
s = next(iter(ds))

In [122]:
torch.normal(mean=0, std=1, size=(5, 25))

tensor([[-0.0324, -0.7132, -1.3052,  0.7677, -0.0789,  1.6746, -0.0386, -1.2344,
          0.1433, -0.4265, -0.0532, -0.6045,  0.1797, -0.7614, -0.0978,  1.2656,
          0.7109,  1.1740, -0.2431,  0.1102, -0.9642,  1.2626,  0.8538,  0.8802,
         -0.2327],
        [-1.8271,  0.1353, -0.4879, -0.1387, -0.9153,  0.8262,  1.2216, -0.7418,
         -0.2891,  0.3275,  1.4131, -1.1525,  1.4058,  0.2956,  0.0996, -0.6835,
          0.3763, -0.0179, -1.4679, -1.7368,  0.2121, -0.0423, -1.5612,  0.0631,
         -1.1917],
        [ 1.6430, -2.3248,  0.3844,  1.2961,  1.0659,  1.5517, -0.0344,  0.2015,
          0.6523, -0.7296, -0.7293, -0.3694,  0.0523, -1.3394, -0.8691,  0.2236,
         -0.8475,  0.2824, -0.6079, -0.2165, -0.8867,  0.1901, -0.8858,  0.3700,
          1.1296],
        [-0.3087,  0.1910,  0.6390,  0.5137, -1.3382,  0.4214,  0.4018, -0.7678,
         -0.6880,  1.6784,  1.0070,  0.2627, -0.4721, -1.1159,  0.6519,  1.9695,
          0.9160,  0.2786,  0.0575, -0.3378, -0.7683

In [99]:
glove['jump'].shape

torch.Size([300])

In [47]:
g = build_glove('../../data/glove.840B.300d.txt')

2021-03-18 00:23:09,648 - __main__ - INFO - Loading Glove embeddings...
2021-03-18 00:25:02,845 - __main__ - INFO - Glove embeddings loaded.


In [50]:
with gzip.open('../../data/glove.840B.300d.gz', 'wb') as f:
    pickle.dump(g, f)

In [17]:
ft = fasttext.load_model('../../data/crawl-300d-2M-subword.bin')



In [12]:
fasttext.util.reduce_model(ft, 100)

<fasttext.FastText._FastText at 0x7f6c50bd7430>

In [13]:
ft.save_model('../../data/crawl-100d-2M-subword.bin')

In [14]:
ft = fasttext.load_model('../../data/crawl-100d-2M-subword.bin')



In [18]:
words = ft.get_words()
words[:50]

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xce in position 57: unexpected end of data