In [58]:
import gzip
import logging
import json
from typing import Dict

import spacy
import pickle
import pytorch_lightning as pl
import torch
from torch.utils.data import Dataset, DataLoader

In [22]:
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
ch = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

In [62]:
def build_glove(fpath: str) -> Dict[str, torch.FloatTensor]:
    logger.info("Loading Glove embeddings...")
    glove = {}
    with open(fpath) as f:
        for line in f:
            values = line.split()
            word = ''.join(values[:-300])
            vector = torch.FloatTensor([float(v) for v in values[-300:]])
            glove[word] = vector
            
    logger.info("Glove embeddings loaded.")
    return glove

def build_intent_mappings(fpath: str, save=False):
    with open(fpath) as f:
        data = json.load(f)
    
    intents = list(set([i["intent"] for i in data]))
    intents_to_idx = {intent: idx for idx, intent in enumerate(intents)}
    
    if save:
        with open('../data/intents_to_idx.json', 'w') as f:
            json.dump(intents_to_idx, f, ensure_ascii=False, indent=4)
    
    return intents_to_idx

In [63]:
build_intent_mappings('../ADL21-HW1/data/intent/train.json')

{'no': 0,
 'schedule_meeting': 1,
 'goodbye': 2,
 'interest_rate': 3,
 'report_lost_card': 4,
 'rewards_balance': 5,
 'what_can_i_ask_you': 6,
 'plug_type': 7,
 'credit_score': 8,
 'uber': 9,
 'min_payment': 10,
 'rollover_401k': 11,
 'damaged_card': 12,
 'share_location': 13,
 'book_flight': 14,
 'insurance': 15,
 'how_busy': 16,
 'pay_bill': 17,
 'calendar': 18,
 'direct_deposit': 19,
 'carry_on': 20,
 'income': 21,
 'jump_start': 22,
 'calories': 23,
 'measurement_conversion': 24,
 'cancel_reservation': 25,
 'confirm_reservation': 26,
 'tire_pressure': 27,
 'traffic': 28,
 'meeting_schedule': 29,
 'pto_used': 30,
 'smart_home': 31,
 'lost_luggage': 32,
 'weather': 33,
 'time': 34,
 'pto_request_status': 35,
 'timezone': 36,
 'calendar_update': 37,
 'thank_you': 38,
 'whisper_mode': 39,
 'mpg': 40,
 'shopping_list': 41,
 'balance': 42,
 'user_name': 43,
 'bill_due': 44,
 'do_you_have_pets': 45,
 'what_are_your_hobbies': 46,
 'food_last': 47,
 'who_made_you': 48,
 'change_accent': 49,

In [None]:
class IntentDataset(Dataset):
    def __init__(self, data_path: str, intent_mapping_path: str, glove_path: str = "../../data/glove.840B.300d.gz", unk_token_strategy='ignore'):
        (self.data, 
         self.intent_to_idx, 
         self.idx_to_intent) = self._load_intent_data(data_path, intent_mapping_path)
        self.glove = self._load_glove(glove_path)
        self.unk_token_strategy = unk_token_strategy
    
    def __len__(self):
        return len(self.data)
        
        
    def _load_intent_data(self, fpath: str, intent_mapping_path: str):
        with open(fpath) as f:
            data = json.load(f)
            
        with open(intent_mapping_path) as f:
            intent_to_idx = json.load(f)
            idx_to_intent = {idx: intent for intent, idx in intent_to_idx.items()}
        
        return data, intent_to_idx, idx_to_intent
            
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        text = sample['text']
        intent = sample['intent']
        _id = sample['id']
        
        text = self.convert_to_vectors(text)
        intent = self.intent_to_idx(intent)
        
        return {
            'id': _id
            'text': text
            'intent': intent
        }
        
    
    def convert_to_vectors(self, text):
        vectors = []
        if self.unk_token_strategy == 'ignore':
            for idx, tok in enumerate(text.split()):
                vector = self.glove.get(tok)
                if not vector:
                    continue
                else:
                    vectors.append(vector)
        return torch.Tensor(vectors)
            
    
        
        
    def _load_glove(self, fpath: str) -> Dict[str, torch.FloatTensor]:
        with gzip.open(fpath, 'rb') as f:
            emb = pickle.load(f)
        logger.info("Loaded GloVe embeddings.")
        return emb
        
class IntentDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str, batch_size: int = 64):
        super().__init__()
        

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [47]:
g = build_glove('../../data/glove.840B.300d.txt')

2021-03-18 00:23:09,648 - __main__ - INFO - Loading Glove embeddings...
2021-03-18 00:25:02,845 - __main__ - INFO - Glove embeddings loaded.


In [50]:
with gzip.open('../../data/glove.840B.300d.gz', 'wb') as f:
    pickle.dump(g, f)

In [17]:
ft = fasttext.load_model('../../data/crawl-300d-2M-subword.bin')



In [12]:
fasttext.util.reduce_model(ft, 100)

<fasttext.FastText._FastText at 0x7f6c50bd7430>

In [13]:
ft.save_model('../../data/crawl-100d-2M-subword.bin')

In [14]:
ft = fasttext.load_model('../../data/crawl-100d-2M-subword.bin')



In [18]:
words = ft.get_words()
words[:50]

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xce in position 57: unexpected end of data

In [None]:
class IntentClassifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        
        