In [189]:
import json
from collections import Counter

import torch
import numpy as np

from spacy.lang.en import English
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

from torch.utils.data import Dataset, DataLoader, random_split
from torch import nn
from tqdm.notebook import tqdm

In [None]:
# Load the label vocabularies (the `entities2id` mapping is read from this dict below).
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open('data/labels.json', 'r', encoding='utf-8') as f:
    labels = json.load(f)

In [309]:
class SkillDataset(torch.utils.data.Dataset):
    """
    Dataset of dialogue contexts loaded from a JSON file.

    Each item pairs the features of the previous utterances (raw text,
    per-utterance MIDAS distribution, per-utterance entity-label counts)
    with the MIDAS label and the entity label to predict.

    params:
    path2data: path to a JSON file holding a list of samples
    entities2id: mapping from entity label to its column index
    tokenizer: optional callable turning a string into tokens that expose
        `.lower_` (e.g. a spaCy tokenizer); only needed for tokenization
    n_midas: number of MIDAS classes in every midas vector
    """

    def __init__(self, path2data: str, entities2id: dict,
                 tokenizer=None, n_midas: int = 13):
        self.data = self.__load_data(path2data)
        self.softmax = nn.Softmax(dim=1)
        self.entities2id = entities2id
        # previously never assigned although __tokenize reads it
        self.tokenizer = tokenizer
        self.n_midas = n_midas

    def __len__(self):
        """ number of samples loaded from the data file """
        return len(self.data)

    def __getitem__(self, index):
        """
        shape one sample into model inputs and targets

        returns ((x_text, x_midas, x_entities), [y_midas, y_entity]) where
        x_text is the list of previous utterances as stored in the sample,
        x_midas is the tensor produced by __norm_midas and
        x_entities is the tensor produced by __ohencode
        """
        sample = self.data[index]

        x_text = sample['previous_text']
        x_midas = self.__norm_midas(sample['midas_vectors'])
        x_entities = [[ent['label'] for ent in ut] for ut in sample['previous_entities']]
        x_entities = self.__ohencode(x_entities)

        # read the targets from the sample fetched above instead of re-indexing self.data
        target = sample['predict']
        y_midas = target['midas']
        y_entity = target['entity']['label']

        return (x_text, x_midas, x_entities), [y_midas, y_entity]

    def __load_data(self, path: str) -> list:
        """ loads data from a json file (read as UTF-8) """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        return data

    def __norm_midas(self, midas_vectors: list) -> torch.Tensor:
        """
        takes midas vectors of all sentences in each utterance,
        averages them per utterance and then applies softmax

        returns a float tensor of shape (len(midas_vectors), n_midas)
        """
        vecs = np.zeros((len(midas_vectors), self.n_midas))

        for i, vec in enumerate(midas_vectors):
            # calc mean probability per each midas label over the utterance's sentences
            vecs[i] = np.mean(np.array(vec), axis=0)

        # NOTE(review): softmax is applied on top of averaged probabilities,
        # which compresses the distribution — confirm this is intended
        return self.softmax(torch.Tensor(vecs))

    def __tokenize(self, texts) -> list:
        """
        transform list of strings into a list of list of lowercased tokens
        using the tokenizer given to the constructor

        raises ValueError when no tokenizer was provided
        """
        if self.tokenizer is None:
            raise ValueError('tokenizer is not set; pass `tokenizer=` to SkillDataset')
        return [[token.lower_ for token in self.tokenizer(ut)] for ut in texts]

    def __ohencode(self, entities) -> torch.Tensor:
        """
        encode entity labels per utterance as a (len(entities), len(entities2id)) tensor

        despite the name this is a count encoding rather than a strict one-hot:
        a label occurring several times in an utterance increments its column
        each time. A label missing from entities2id raises KeyError.
        """
        ohe_vec = np.zeros((len(entities), len(self.entities2id)))

        for i, ut in enumerate(entities):
            for ent in ut:
                ohe_vec[i][self.entities2id[ent]] += 1

        return torch.Tensor(ohe_vec)

In [310]:
# Build the dataset from the dumped samples, reusing the entity -> id mapping stored in `labels`.
dataset = SkillDataset(
    path2data='data/dataset.json',
    entities2id=labels['entities2id'],
)
len(dataset)

10565

In [314]:
# Fetch the sample once: every `dataset[36]` access re-runs the whole __getitem__ pipeline.
sample_inputs, sample_targets = dataset[36]

# Printed in order: x_text, x_midas, x_entities, y labels.
for part in (*sample_inputs, sample_targets):
    print(part, '\n')

["It's something else, that's for sure.", "In 1934 I see that North Dakota's governor declared martial law and seceded from the US.  I'm learning all kinds of new things today.", 'Wow! Well, Texas elected their first female Governor, the second in the nation, before some states even ratified the 19th amendment. '] 

tensor([[0.0695, 0.0704, 0.0703, 0.0697, 0.0696, 0.0699, 0.0699, 0.0695, 0.1556,
         0.0695, 0.0700, 0.0767, 0.0696],
        [0.0690, 0.0694, 0.0694, 0.0694, 0.0691, 0.0691, 0.0691, 0.0690, 0.0748,
         0.0690, 0.0691, 0.1646, 0.0690],
        [0.0860, 0.0713, 0.0821, 0.0738, 0.0716, 0.0708, 0.0709, 0.0709, 0.0748,
         0.0708, 0.0711, 0.1150, 0.0709]]) 

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 2., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 3., 1., 0., 0., 1., 0., 0.,

In [315]:
# 80/20 train/test partition; the seeded generator keeps the split reproducible across runs.
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    lengths=[train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

In [316]:
# Sizes of the two partitions as a (train, test) tuple.
tuple(map(len, (train_dataset, test_dataset)))

(8452, 2113)

In [322]:
class Collator():
    """
    Bundles the preprocessing components meant to turn a batch into model input.

    params:
    tokenizer: expected to be a spaCy tokenizer; stored as-is (None by default)
    vectorizer: expected to be a pretrained sklearn TfidfVectorizer; stored as-is (None by default)
    """
    def __init__(self, tokenizer=None, vectorizer=None):
        self.tokenizer, self.vectorizer = tokenizer, vectorizer

    def collate_fn(self, batch) -> tuple:
        """ currently a pass-through: the batch is handed back unchanged """
        return batch

collator = Collator()

In [323]:
# Batches of two samples in a fixed order; batch assembly is delegated to the collator.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=collator.collate_fn,
)

In [324]:
# Grab only the first batch for inspection. Unlike a `for ... break` loop, this raises
# StopIteration right here if the loader is empty instead of leaving `batch` undefined.
batch = next(iter(train_loader))

In [325]:
# Display the batch fetched from train_loader in the previous cell.
batch

[((['yeah i like to watch her play. Polo shirts were originally invented for tennis by famous player rene "the crocodile" lacoste. i love their colgne for men',
    'Intresting, the longest match played in a polo shirt was 22 hours',
    'yeah if i recall it went over three days or so'],
   tensor([[0.0698, 0.0701, 0.0699, 0.0698, 0.0699, 0.0698, 0.0699, 0.0698, 0.1363,
            0.0698, 0.0700, 0.0949, 0.0699],
           [0.0695, 0.0697, 0.0698, 0.0695, 0.0695, 0.0694, 0.0695, 0.0694, 0.0819,
            0.0694, 0.0694, 0.1537, 0.0694],
           [0.0684, 0.0684, 0.0686, 0.0683, 0.0684, 0.0683, 0.0683, 0.0683, 0.0699,
            0.0684, 0.0684, 0.1779, 0.0683]]),
   tensor([[2., 2., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
           [0., 0., 0., 2., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
            0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
           [0., 1., 0., 0., 0., 1., 0., 0., 0., 