In [19]:
import re
import io
import sys
import spacy
import numpy as np
import pandas as pd

from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric
from gensim.corpora import WikiCorpus
from gensim.models import FastText

from millenlp.helpers import nlp_utils

import torch
from torch import nn
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence


from tqdm import tqdm
tqdm.pandas()

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 2000)

## Base FastText Wiki Data

In [2]:
%% bash
wget https://dumps.wikimedia.org/eswiki/latest/eswiki-latest-pages-articles.xml.bz2

UsageError: Cell magic `%%` not found.


In [11]:
def make_corpus(in_f, out_f):

    """Convert Wikipedia xml dump file to text corpus"""

    output = open(out_f, 'w')
    wiki = WikiCorpus(in_f)

    i = 0
    for text in wiki.get_texts():
        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        i = i + 1
        if (i % 10000 == 0):
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
    
make_corpus('eswiki-latest-pages-articles.xml.bz2', 'gensim_wiki_corpora')

Processed 10000 articles
Processed 20000 articles
Processed 30000 articles
Processed 40000 articles
Processed 50000 articles
Processed 60000 articles
Processed 70000 articles
Processed 80000 articles
Processed 90000 articles
Processed 100000 articles
Processed 110000 articles
Processed 120000 articles
Processed 130000 articles
Processed 140000 articles
Processed 150000 articles
Processed 160000 articles
Processed 170000 articles
Processed 180000 articles
Processed 190000 articles
Processed 200000 articles
Processed 210000 articles
Processed 220000 articles
Processed 230000 articles
Processed 240000 articles
Processed 250000 articles
Processed 260000 articles
Processed 270000 articles
Processed 280000 articles
Processed 290000 articles
Processed 300000 articles
Processed 310000 articles
Processed 320000 articles
Processed 330000 articles
Processed 340000 articles
Processed 350000 articles
Processed 360000 articles
Processed 370000 articles
Processed 380000 articles
Processed 390000 arti

In [8]:
def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1
file_len('gensim_wiki_corpora')

1325626

In [37]:
def get_batch(n, file, length):
    with open(file) as f:
        batch = []
        for i, l in enumerate(f):
            if l:
                batch += [preprocess_string(l.lower(), [strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric])]
                if (i+1)%n==0 or i == length - 1:
                    yield batch
                    batch = []

In [16]:
model = FastText(size=300, workers=4)
data_length = file_len('gensim_wiki_corpora')
batch_len = 10000
print("Total number of articules {}".format(data_length))
for i, batch in enumerate(get_batch(batch_len, 'gensim_wiki_corpora', data_length)):
    print("Training batch {} for the {} articles".format(i+1, (i+1)*batch_len))
    if i:
        model.build_vocab(batch, update=True)
    else:
        model.build_vocab(batch)
        
    model.train(batch, total_examples=len(batch), epochs=model.epochs)

Total number of articules 1325626
Training batch 1 for the 10000 articles
Training batch 2 for the 20000 articles
Training batch 3 for the 30000 articles
Training batch 4 for the 40000 articles
Training batch 5 for the 50000 articles
Training batch 6 for the 60000 articles
Training batch 7 for the 70000 articles
Training batch 8 for the 80000 articles
Training batch 9 for the 90000 articles
Training batch 10 for the 100000 articles
Training batch 11 for the 110000 articles
Training batch 12 for the 120000 articles
Training batch 13 for the 130000 articles
Training batch 14 for the 140000 articles
Training batch 15 for the 150000 articles
Training batch 16 for the 160000 articles
Training batch 17 for the 170000 articles
Training batch 18 for the 180000 articles
Training batch 19 for the 190000 articles
Training batch 20 for the 200000 articles
Training batch 21 for the 210000 articles
Training batch 22 for the 220000 articles
Training batch 23 for the 230000 articles
Training batch 24 

In [21]:
model.save("model/basefft")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [19]:
len(model.wv.vocab)

517213

## Subred Model Complemented

In [2]:
model = FastText.load("model/basefft")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [45]:
data_length = file_len('subred.txt')
batch_len = 10000
print("Total number of articules {}".format(data_length))
for i, batch in enumerate(get_batch(batch_len, 'subred.txt', data_length)):
    print("Training batch {} for the {} articles".format(i+1, (i+1)*batch_len))
    model.build_vocab(batch, update=True)
    model.train(batch, total_examples=len(batch), epochs=model.epochs)

Total number of articules 50004
Training batch 1 for the 10000 articles
Training batch 2 for the 20000 articles
Training batch 3 for the 30000 articles
Training batch 4 for the 40000 articles
Training batch 5 for the 50000 articles
Training batch 6 for the 60000 articles


In [46]:
len(model.wv.vocab)

517389

In [47]:
model.save("subred_model/subredfft")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Model Comparative

### Data

In [20]:
class Embedding(object):
    """Embedding transform"""
    
    def __init__(self, model_loc, D_in):
        """
        Args:
            model_loc (string): Path to the embedding FastText
            D_in (intenger): 
        """
        self.D_in = D_in
        self.embedding_layer = FastText.load(model_loc)  
        self.size = self.embedding_layer.wv.vector_size
        
    def __call__(self, sample):
        tensor = torch.zeros(self.D_in, self.size)
        preprocessed_text = preprocess_string(sample['text'].lower(), [strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric])
        preprocessed_text = [word for word in preprocessed_text if word in self.embedding_layer.wv]
        embedding = torch.tensor(self.embedding_layer.wv[preprocessed_text])
        tensor[:embedding.shape[0], :] = embedding[:embedding.shape[0] if embedding.shape[0] <= self.D_in else self.D_in]
        sample['tensor'] = tensor
        return sample


In [32]:
class Flatten(object):
    """Flatten transform"""
    
    def __init__(self):
        pass
    
    def __call__(self, sample):
        sample['tensor'] = sample['tensor'].view(-1)
        return sample

In [33]:
class IterationsDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, dataset_loc, indices, transform=None):
        """
        Args:
            dataset_loc (string): Path to the xlsx file with texts and annotations.
            indices (list): Indices used to take the iteration and label columns from the DataFrame [Iterations, Labels]
        """
        self.indices = indices
        self.dataset = pd.read_excel(dataset_loc)[indices]
        self.transform = transform
        self.labels =  self.dataset[self.indices[1]].unique().tolist()

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        
        sample = {'text': self.dataset.loc[idx, self.indices[0]], 'label': torch.tensor(self.labels.index(self.dataset.loc[idx, self.indices[1]]))}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample

In [34]:
print('MEAN: {:.2f}'.format(dataset.dataset['PREGUNTAS'].apply(lambda x : len(x.split())).mean()))
print('STD: {:.2f}'.format(dataset.dataset['PREGUNTAS'].apply(lambda x : len(x.split())).std()))

MEAN: 11.26
STD: 9.55


### Linear Model

In [35]:
class FastextTwoLayerNet(nn.Module):
    def __init__(self, D_in, H, D_out, size):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(FastextTwoLayerNet, self).__init__()
        self.D_in = D_in
        self.size = size
        self.linear1 = torch.nn.Linear(D_in * self.size, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        
    
    def forward(self, input_tensor):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h_relu = self.linear1(input_tensor).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

In [37]:
D_in, H, D_out, size = 30, 160, 4, 300

# Dataset object with transforms include
dataset = IterationsDataset('interacciones.xlsx', 
                            ['PREGUNTAS', 'INTENCION CORRECTA'], 
                            transform=transforms.Compose([
                                Embedding("subred_model/subredfft", 30),
                                Flatten()
                            ]))

# Generator to group the batches
dataloader = DataLoader(dataset, batch_size=30,
                        shuffle=True, num_workers=4)

model = FastextTwoLayerNet(D_in, H, D_out, size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for i_batch, batch in enumerate(dataloader):
    # Forward pass: Compute predicted y by passing x to the model
    preds = model.forward(batch['tensor'])

    # Compute and print loss
    training_loss = criterion(preds, batch['label'])
    print(i_batch, training_loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    training_loss.backward()
    optimizer.step()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


0 1.394147515296936
1 1.40174400806427
2 1.3419820070266724
3 1.4012501239776611
4 1.3670706748962402
5 1.3664566278457642
6 1.3524558544158936
7 1.3217461109161377
8 1.3086198568344116
9 1.3446156978607178
10 1.3024235963821411
11 1.3895207643508911
12 1.3186475038528442
13 1.3512918949127197
14 1.4281095266342163
15 1.364151120185852
16 1.382371187210083
17 1.351148009300232
18 1.405999779701233
19 1.3498706817626953
20 1.3687556982040405
21 1.3293641805648804
22 1.3499890565872192
23 1.3910714387893677
24 1.3418687582015991
25 1.397359848022461
26 1.421008586883545
27 1.4066191911697388
28 1.2973408699035645
29 1.3781263828277588
30 1.3546721935272217
31 1.3246434926986694
32 1.3462889194488525
33 1.343207836151123
34 1.3532249927520752
35 1.2992777824401855
36 1.3598978519439697
37 1.381897211074829
38 1.3332953453063965
39 1.353472352027893
40 1.3514389991760254
41 1.3649260997772217
42 1.333554744720459
43 1.4128493070602417
44 1.3817658424377441
45 1.3092044591903687
46 1.300883

In [38]:
def predict(text):
    iteration = dataset.transform({'text' : text})
    softmax = nn.Softmax(0)
    pred = softmax(model(iteration['tensor']))
    _, index = torch.max(pred, 0)
    probas = {label : pred[i].item() for i, label in enumerate(dataset.labels)}
    return dataset.labels[index], probas
    
predict('Necesito una cita de medicina general')

('asignacion',
 {'asignacion': 0.29861029982566833,
  'confirmacion': 0.21985498070716858,
  'cancelacion': 0.217831090092659,
  'agente': 0.26370370388031006})

### GRU MODEL

In [44]:
class GRULayerNet(nn.Module):
    def __init__(self, H, D_out, size):
        """
        Args:
            H : Number of hidden states
        """   

        super(GRULayerNet, self).__init__()
        self.D_in = D_in
        self.size = size
        self.gru =  nn.GRU(self.size, H, 1, batch_first=True, bidirectional=True)
        
    
    def forward(self, input_tensor):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        return self.gru(input_tensor)

In [41]:
D_in, H, D_out, size = 30, 160, 4, 300

# Dataset object with transforms include
dataset = IterationsDataset('interacciones.xlsx', 
                            ['PREGUNTAS', 'INTENCION CORRECTA'], 
                            transform=transforms.Compose([
                                Embedding("subred_model/subredfft", 30)
                            ]))

dataloader = DataLoader(dataset, batch_size=30,
                        shuffle=True, num_workers=4)


# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

# for i_batch, batch in enumerate(dataloader):

#     # Forward pass: Compute predicted y by passing x to the model
#     preds = model.forward(batch['tensor'])

#     # Compute and print loss
#     training_loss = criterion(preds, batch['label'])
#     print(i_batch, training_loss.item())

#     # Zero gradients, perform a backward pass, and update the weights.
#     optimizer.zero_grad()
#     training_loss.backward()
#     optimizer.step()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [46]:
model = GRULayerNet(H, D_out, size)
model(dataset.transform({'text':'Necesito una cita de medicina general'}))

AttributeError: 'dict' object has no attribute 'size'

In [8]:
def predict_dataset(row):
    prediction, probas = predict(row['PREGUNTAS'])
    return [prediction] + [probas[key] for key in probas]

data = dataset.dataset.copy()
data[['prediction', 'asignacion', 'confirmacion', 'cancelacion', 'agente']] = data.progress_apply(lambda x : pd.Series(predict_dataset(x)), axis=1)
data

100%|██████████| 3324/3324 [00:03<00:00, 974.03it/s] 


Unnamed: 0,PREGUNTAS,INTENCION CORRECTA,prediction,asignacion,confirmacion,cancelacion,agente
0,querer una cita de medicina general,asignacion,asignacion,0.298111,0.213094,0.267097,0.221698
1,necesitar una cita de cirugia de columna,asignacion,asignacion,0.312397,0.222303,0.265625,0.199675
2,hago una visita de cirugia plastica,asignacion,asignacion,0.294363,0.206079,0.257347,0.242211
3,querer una agenda de gastroenterologia,asignacion,asignacion,0.306917,0.212601,0.243516,0.236966
4,quiero una cita de citologia,asignacion,asignacion,0.281717,0.224195,0.252792,0.241295
5,podria una cita de hematologia pediatrica,asignacion,asignacion,0.299720,0.216631,0.248480,0.235169
6,podria una visita de neurologia adulto,asignacion,asignacion,0.303975,0.216126,0.244342,0.235558
7,podria una agenda de nefrologia pediatrica,asignacion,asignacion,0.296813,0.216674,0.240207,0.246307
8,podria una agenda de cirugia plastica,asignacion,asignacion,0.305958,0.198928,0.246923,0.248191
9,hago una agenda de neurologia adulto,asignacion,asignacion,0.323775,0.223161,0.238533,0.214531


In [11]:
data[data['prediction'] != data['INTENCION CORRECTA']]

Unnamed: 0,PREGUNTAS,INTENCION CORRECTA,prediction,asignacion,confirmacion,cancelacion,agente
15,necesitamos una visita con el doctor de medicina general,asignacion,agente,0.251112,0.207524,0.263126,0.278238
16,puede una cita con el medico de nutricion,asignacion,cancelacion,0.260151,0.225896,0.282042,0.231910
18,querer una consulta con el doctor de citologia,asignacion,agente,0.261003,0.203226,0.257407,0.278364
19,podria una agenda con el medica de reumatologia pediatrica,asignacion,agente,0.259575,0.218815,0.249187,0.272423
20,querer una visita con el doctor de nutricion,asignacion,cancelacion,0.256472,0.201462,0.280770,0.261296
21,quiero una cita con el especialista de neurologia,asignacion,agente,0.263377,0.207194,0.260490,0.268939
23,necesitamos una visita con el cirujano de citologia,asignacion,cancelacion,0.239650,0.218760,0.272565,0.269025
24,podria una visita con el cirujano de cardiologia pediatrica,asignacion,cancelacion,0.243224,0.204534,0.290703,0.261539
25,necesitamos una visita con el cirujana de cardiovascular,asignacion,cancelacion,0.250820,0.214868,0.276792,0.257519
26,queremos una consulta con el doctora de cirugia bariatrica,asignacion,agente,0.270041,0.210873,0.247353,0.271733


In [10]:
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
print(input)
target = torch.empty(3, dtype=torch.long).random_(5)
print(target)
output = loss(input, target)
print(output)

tensor([[ 1.4601e-01,  5.7151e-01, -7.0706e-01, -6.6299e-01,  4.3710e-01],
        [-2.7718e+00, -5.4043e-04,  6.6015e-01,  1.1862e+00,  3.9423e-01],
        [-5.3009e-01,  6.1172e-01, -1.4582e-03, -7.4532e-01, -8.5128e-03]],
       requires_grad=True)
tensor([2, 2, 2])
tensor(1.7958, grad_fn=<NllLossBackward>)


In [11]:
model.wv.vector_size

300

In [32]:
weights = torch.FloatTensor(model.wv.vectors)
embedding = nn.Embedding.from_pretrained(weights)
# Get embeddings for index 1
input = torch.LongTensor([1])
embedding(input)

tensor([[-1.1705,  5.0482,  2.4649,  0.7217, -2.4352,  1.1867, -1.1970, -1.9468,
          1.2807, -1.4742, -0.7545, -0.5262,  4.0642,  0.2096,  0.2314,  0.1138,
         -1.7481,  0.6004,  1.7987,  1.7062, -2.0083, -0.5639,  1.9007, -1.0101,
         -1.2910, -1.0263,  1.1214, -2.2167,  4.6265,  1.2347, -0.7115, -0.4033,
          1.7796, -1.2604,  1.9289, -0.3498, -1.1462,  1.9832, -1.2624, -2.5941,
          2.8553,  0.3324, -1.6055,  2.5291, -1.4236, -2.4208, -3.3017, -0.8893,
         -0.0917, -3.5893, -1.1435,  4.4766, -0.5909,  1.2724,  3.2217, -0.5666,
         -2.2370, -1.5040, -1.5517, -0.3843, -1.6675, -1.1773, -3.9576, -1.1701,
          0.3911,  0.5919, -0.3836, -1.1477,  0.8444,  2.5014,  2.4875, -2.7076,
          0.6172,  2.2056, -0.9598, -1.8545, -0.9006, -0.6951,  2.9650,  0.2486,
          0.7362,  0.1373, -3.1565, -1.1593, -1.7125, -1.0612, -1.4765, -0.2727,
         -0.7277, -2.9582, -1.2254,  0.6468,  0.7515, -4.0886, -1.1736,  1.3893,
         -1.1203,  3.4329,  

In [30]:
weights.shape()

TypeError: 'torch.Size' object is not callable

In [31]:
model.wv.vectors.shape

(517389, 300)

In [34]:
model.wv.get_vector("hola como estas").shape

(300,)