In [1]:
import re
import io
import sys
import spacy
import numpy as np
import pandas as pd

from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric
from gensim.corpora import WikiCorpus
from gensim.models import FastText

from millenlp.helpers import nlp_utils

import torch
from torch import nn
from torchvision import transforms, utils
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence


from tqdm import tqdm
tqdm.pandas()

pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', 2000)

## Base FastText Wiki Data

In [2]:
%% bash
wget https://dumps.wikimedia.org/eswiki/latest/eswiki-latest-pages-articles.xml.bz2

UsageError: Cell magic `%%` not found.


In [11]:
def make_corpus(in_f, out_f):

    """Convert Wikipedia xml dump file to text corpus"""

    output = open(out_f, 'w')
    wiki = WikiCorpus(in_f)

    i = 0
    for text in wiki.get_texts():
        output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
        i = i + 1
        if (i % 10000 == 0):
            print('Processed ' + str(i) + ' articles')
    output.close()
    print('Processing complete!')
    
make_corpus('eswiki-latest-pages-articles.xml.bz2', 'gensim_wiki_corpora')

Processed 10000 articles
Processed 20000 articles
Processed 30000 articles
Processed 40000 articles
Processed 50000 articles
Processed 60000 articles
Processed 70000 articles
Processed 80000 articles
Processed 90000 articles
Processed 100000 articles
Processed 110000 articles
Processed 120000 articles
Processed 130000 articles
Processed 140000 articles
Processed 150000 articles
Processed 160000 articles
Processed 170000 articles
Processed 180000 articles
Processed 190000 articles
Processed 200000 articles
Processed 210000 articles
Processed 220000 articles
Processed 230000 articles
Processed 240000 articles
Processed 250000 articles
Processed 260000 articles
Processed 270000 articles
Processed 280000 articles
Processed 290000 articles
Processed 300000 articles
Processed 310000 articles
Processed 320000 articles
Processed 330000 articles
Processed 340000 articles
Processed 350000 articles
Processed 360000 articles
Processed 370000 articles
Processed 380000 articles
Processed 390000 arti

In [8]:
def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1
file_len('gensim_wiki_corpora')

1325626

In [37]:
def get_batch(n, file, length):
    with open(file) as f:
        batch = []
        for i, l in enumerate(f):
            if l:
                batch += [preprocess_string(l.lower(), [strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric])]
                if (i+1)%n==0 or i == length - 1:
                    yield batch
                    batch = []

In [16]:
model = FastText(size=300, workers=4)
data_length = file_len('gensim_wiki_corpora')
batch_len = 10000
print("Total number of articules {}".format(data_length))
for i, batch in enumerate(get_batch(batch_len, 'gensim_wiki_corpora', data_length)):
    print("Training batch {} for the {} articles".format(i+1, (i+1)*batch_len))
    if i:
        model.build_vocab(batch, update=True)
    else:
        model.build_vocab(batch)
        
    model.train(batch, total_examples=len(batch), epochs=model.epochs)

Total number of articules 1325626
Training batch 1 for the 10000 articles
Training batch 2 for the 20000 articles
Training batch 3 for the 30000 articles
Training batch 4 for the 40000 articles
Training batch 5 for the 50000 articles
Training batch 6 for the 60000 articles
Training batch 7 for the 70000 articles
Training batch 8 for the 80000 articles
Training batch 9 for the 90000 articles
Training batch 10 for the 100000 articles
Training batch 11 for the 110000 articles
Training batch 12 for the 120000 articles
Training batch 13 for the 130000 articles
Training batch 14 for the 140000 articles
Training batch 15 for the 150000 articles
Training batch 16 for the 160000 articles
Training batch 17 for the 170000 articles
Training batch 18 for the 180000 articles
Training batch 19 for the 190000 articles
Training batch 20 for the 200000 articles
Training batch 21 for the 210000 articles
Training batch 22 for the 220000 articles
Training batch 23 for the 230000 articles
Training batch 24 

In [21]:
model.save("model/basefft")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [19]:
len(model.wv.vocab)

517213

## Subred Model Complemented

In [2]:
model = FastText.load("model/basefft")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [45]:
data_length = file_len('subred.txt')
batch_len = 10000
print("Total number of articules {}".format(data_length))
for i, batch in enumerate(get_batch(batch_len, 'subred.txt', data_length)):
    print("Training batch {} for the {} articles".format(i+1, (i+1)*batch_len))
    model.build_vocab(batch, update=True)
    model.train(batch, total_examples=len(batch), epochs=model.epochs)

Total number of articules 50004
Training batch 1 for the 10000 articles
Training batch 2 for the 20000 articles
Training batch 3 for the 30000 articles
Training batch 4 for the 40000 articles
Training batch 5 for the 50000 articles
Training batch 6 for the 60000 articles


In [46]:
len(model.wv.vocab)

517389

In [47]:
model.save("subred_model/subredfft")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Model Comparative

### Data

In [2]:
class Embedding(object):
    """Embedding transform"""
    
    def __init__(self, model_loc, D_in):
        """
        Args:
            model_loc (string): Path to the embedding FastText
            D_in (intenger): 
        """
        self.D_in = D_in
        self.embedding_layer = FastText.load(model_loc)  
        self.size = self.embedding_layer.wv.vector_size
        
    def __call__(self, sample):
        tensor = torch.zeros(self.D_in, self.size)
        preprocessed_text = preprocess_string(sample['text'].lower(), [strip_tags, strip_punctuation, strip_multiple_whitespaces, strip_numeric])
        preprocessed_text = [word for word in preprocessed_text if word in self.embedding_layer.wv]
        embedding = torch.tensor(self.embedding_layer.wv[preprocessed_text])
        tensor[:embedding.shape[0], :] = embedding[:embedding.shape[0] if embedding.shape[0] <= self.D_in else self.D_in]
        sample['tensor'] = tensor
        return sample


In [3]:
class Flatten(object):
    """Flatten transform"""
    
    def __init__(self):
        pass
    
    def __call__(self, sample):
        sample['tensor'] = sample['tensor'].view(-1)
        return sample

In [4]:
class IterationsDataset(Dataset):
    """Face Landmarks dataset."""

    def __init__(self, dataset_loc, indices, transform=None):
        """
        Args:
            dataset_loc (string): Path to the xlsx file with texts and annotations.
            indices (list): Indices used to take the iteration and label columns from the DataFrame [Iterations, Labels]
        """
        self.indices = indices
        self.dataset = pd.read_excel(dataset_loc)[indices]
        self.transform = transform
        self.labels =  self.dataset[self.indices[1]].unique().tolist()

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        
        sample = {'text': self.dataset.loc[idx, self.indices[0]], 'label': torch.tensor(self.labels.index(self.dataset.loc[idx, self.indices[1]]))}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample

In [7]:
print('MEAN: {:.2f}'.format(pd.read_excel('interacciones.xlsx')['PREGUNTAS'].apply(lambda x : len(x.split())).mean()))
print('STD: {:.2f}'.format(pd.read_excel('interacciones.xlsx')['PREGUNTAS'].apply(lambda x : len(x.split())).std()))

MEAN: 11.26
STD: 9.55


### Linear Model

In [8]:
class FeedForward(nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(FeedForward, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        
    
    def forward(self, input_tensor):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h_relu = self.linear1(input_tensor).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

In [9]:
D_in, H, D_out, size = 30, 160, 4, 300

# Dataset object with transforms include
dataset_ff = IterationsDataset('interacciones.xlsx', 
                            ['PREGUNTAS', 'INTENCION CORRECTA'], 
                            transform=transforms.Compose([
                                Embedding("subred_model/subredfft", 30),
                                Flatten()
                            ]))

# Generator to group the batches
dataloader_ff = DataLoader(dataset_ff, batch_size=30,
                        shuffle=True, num_workers=4)

model_ff = FeedForward(D_in*size, H, D_out)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_ff.parameters(), lr=1e-4)

for i_batch, batch in enumerate(dataloader_ff):
    # Forward pass: Compute predicted y by passing x to the model
    preds = model_ff(batch['tensor'])

    # Compute and print loss
    training_loss = criterion(preds, batch['label'])
    print(i_batch, training_loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    training_loss.backward()
    optimizer.step()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


0 1.4042294025421143
1 1.4058568477630615
2 1.449819564819336
3 1.3982871770858765
4 1.3552578687667847
5 1.406897783279419
6 1.3978780508041382
7 1.3766382932662964
8 1.3684566020965576
9 1.337425947189331
10 1.348105788230896
11 1.4772683382034302
12 1.386216640472412
13 1.393713116645813
14 1.4159977436065674
15 1.3843861818313599
16 1.382529616355896
17 1.4075191020965576
18 1.3519096374511719
19 1.4062838554382324
20 1.3856126070022583
21 1.3987648487091064
22 1.4003297090530396
23 1.436450719833374
24 1.4330965280532837
25 1.440019130706787
26 1.3902561664581299
27 1.338220477104187
28 1.410431981086731
29 1.3397382497787476
30 1.398729681968689
31 1.331810712814331
32 1.3771013021469116
33 1.3843419551849365
34 1.3617514371871948
35 1.3462318181991577
36 1.3882147073745728
37 1.3633754253387451
38 1.4217829704284668
39 1.3489985466003418
40 1.3843903541564941
41 1.3268400430679321
42 1.3580445051193237
43 1.3138052225112915
44 1.371559739112854
45 1.4058220386505127
46 1.3193024

In [10]:
def predict(text):
    iteration = dataset_ff.transform({'text' : text})
    softmax = nn.Softmax(0)
    pred = softmax(model_ff(iteration['tensor']))
    _, index = torch.max(pred, 0)
    probas = {label : pred[i].item() for i, label in enumerate(dataset_ff.labels)}
    return dataset_ff.labels[index], probas
    
predict('Necesito una cita de medicina general')

('asignacion',
 {'asignacion': 0.32192131876945496,
  'confirmacion': 0.2451360821723938,
  'cancelacion': 0.24301505088806152,
  'agente': 0.18992750346660614})

In [36]:
def predict_dataset(row):
    prediction, probas = predict(row['PREGUNTAS'])
    return [prediction] + [probas[key] for key in probas]

data = dataset_ff.dataset.copy()
data[['prediction', 'asignacion', 'confirmacion', 'cancelacion', 'agente']] = data.progress_apply(lambda x : pd.Series(predict_dataset(x)), axis=1)
data

100%|██████████| 3324/3324 [00:03<00:00, 846.23it/s] 


Unnamed: 0,PREGUNTAS,INTENCION CORRECTA,prediction,asignacion,confirmacion,cancelacion,agente
0,querer una cita de medicina general,asignacion,cancelacion,0.276776,0.213397,0.306140,0.203686
1,necesitar una cita de cirugia de columna,asignacion,cancelacion,0.282482,0.211779,0.298101,0.207638
2,hago una visita de cirugia plastica,asignacion,cancelacion,0.271764,0.219643,0.297868,0.210725
3,querer una agenda de gastroenterologia,asignacion,cancelacion,0.249616,0.227401,0.329370,0.193613
4,quiero una cita de citologia,asignacion,cancelacion,0.271841,0.224352,0.295451,0.208357
5,podria una cita de hematologia pediatrica,asignacion,cancelacion,0.293130,0.204134,0.297776,0.204960
6,podria una visita de neurologia adulto,asignacion,asignacion,0.292333,0.220518,0.289017,0.198132
7,podria una agenda de nefrologia pediatrica,asignacion,cancelacion,0.287619,0.215654,0.303848,0.192880
8,podria una agenda de cirugia plastica,asignacion,cancelacion,0.290129,0.218587,0.297768,0.193516
9,hago una agenda de neurologia adulto,asignacion,cancelacion,0.262271,0.240946,0.311938,0.184845


### GRU MODEL

In [12]:
class GRULayerNet(nn.Module):
    def __init__(self, H, D_out, size):
        """
        Args:
            H : Number of hidden states
        """   

        super(GRULayerNet, self).__init__()
        self.D_in = D_in
        self.size = size
        self.gru =  nn.GRU(self.size, H, 1, batch_first=True, bidirectional=True)
        self.feedforward = FeedForward(H, int(H/2), D_out)
    
    def forward(self, input_tensor):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        _, laststate = self.gru(input_tensor)
        output = self.feedforward(laststate[1,:,:].view(-1,H))
        return output

In [16]:
D_in, H, D_out, size = 30, 160, 4, 300

# Dataset object with transforms include
dataset_GRU = IterationsDataset('interacciones.xlsx', 
                            ['PREGUNTAS', 'INTENCION CORRECTA'], 
                            transform=transforms.Compose([
                                Embedding("subred_model/subredfft", 30)
                            ]))

dataloader_GRU = DataLoader(dataset_GRU, batch_size=40,
                        shuffle=True, num_workers=4)


model_GRU = GRULayerNet(H, D_out, size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model_GRU.parameters(), lr=1e-4)
softmax = nn.Softmax(1)

for i_batch, batch in enumerate(dataloader_GRU):

    # Forward pass: Compute predicted y by passing x to the model
    preds = model_GRU.forward(batch['tensor'])

    # Compute cross entropy loss    
    loss = criterion(preds, batch['label']) 
    
    # Compute training loss
    _, training_preds = torch.max(softmax(preds), 1)
    equality = training_preds ==  batch['label']
    training_loss = 
    
    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(i_batch, loss.item())
    print()
    print('\t', batch['text'][0])
    print('\t', batch['label'][0].item())
    print('\t', preds[1, :].tolist())
    print('\t', equality)
    print()
    
    
#     softmax = nn.Softmax(1)
#     pred = softmax(model_GRU(iteration['tensor'].view(1,iteration['tensor'].size()[0],-1)))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


0 1.381702184677124

	 Cancelar cita
	 2
	 [0.113007552921772, -0.13699156045913696, 0.12125793099403381, -0.07387614995241165]
	 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0,
        1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0], dtype=torch.uint8)

1 1.3998628854751587

	 para el domingo veintinueve de julio debo asignar en el fontibon el cirujana Perez Rodriguez Diego Andres
	 0
	 [-0.0001614987850189209, -0.005154334008693695, 0.2415342628955841, -0.005832705646753311]
	 tensor([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0], dtype=torch.uint8)

2 1.350871205329895

	 Necesito una cita de otorrinolaringologia
	 0
	 [-0.0713353082537651, -0.019042644649744034, -0.00884576141834259, -0.0829278826713562]
	 tensor([0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1,
        0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1], dtype=torch.uint8)

3 1.33426

27 1.3847508430480957

	 consulta con procedimiento policiales en medicina general
	 0
	 [-0.09470529109239578, -0.1311555951833725, 0.06643153727054596, -0.0677560567855835]
	 tensor([1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0], dtype=torch.uint8)

28 1.3893648386001587

	 mi hija identificado con la pasaporte 338 076470 tiene una visita con el medico Pena del pablo sexto de bosa pero hay que no puedo ir porque tiene gripa
	 2
	 [-0.03053916245698929, -0.08510582894086838, 0.05003749206662178, -0.07204196602106094]
	 tensor([1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.uint8)

29 1.366105318069458

	 cita infectologia
	 0
	 [-0.07518868893384933, 0.03855255991220474, 0.0344342403113842, -0.1346050202846527]
	 tensor([1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 

52 1.3578035831451416

	 my general
	 3
	 [-0.018533233553171158, -0.01973113790154457, 0.09652441740036011, -0.0665353536605835]
	 tensor([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
        1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0], dtype=torch.uint8)

53 1.372925043106079

	 para reumatologia pues padezco artritis reumatoide a la especialidad reumatologia
	 0
	 [0.002643957734107971, -0.09185982495546341, 0.01509464904665947, -0.053225740790367126]
	 tensor([1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
        1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], dtype=torch.uint8)

54 1.3708473443984985

	 estoy necesitando una consulta por especializada en dermatologia para control
	 0
	 [0.012589674443006516, 0.0006104260683059692, 0.1338508129119873, 0.040727242827415466]
	 tensor([0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0], dtype=torch.uin

79 1.3714954853057861

	 en una cita medica
	 3
	 [-0.006156262010335922, -0.05167310684919357, -0.010052586905658245, -0.10698797553777695]
	 tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1], dtype=torch.uint8)

80 1.3774681091308594

	 en el hospital el tunal el 27 marzo yo tengo una cupo podrias certificar la hora, mi cedula de extranjeria es la numero 1172311068
	 1
	 [-0.0005152411758899689, 0.00855632871389389, 0.104893758893013, -0.06340007483959198]
	 tensor([0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0], dtype=torch.uint8)

81 1.359761118888855

	 hace algunos dias me dieron una visita el catorce de septiembre en el simon bolivar con el medica Perez Rodriguez Diego Andres, pero resulta que no alcanco a llegar a esta hora
	 2
	 [-0.024693744257092476, -0.10093675553798676, 0.013364359736442566, -0.07693197578191

In [77]:
def error(x, class_):
    x = np.array(x)
    error = np.log(np.sum(np.exp(x))) - x[class_]
    return error
    
error([-0.013613969087600708, 0.10951884090900421, -0.13679464161396027, 0.11933265626430511], 0)

1.4248724570904603

In [58]:
def predict(text):
    iteration = dataset_GRU.transform({'text' : text})
    softmax = nn.Softmax(1)
    print(model_GRU(iteration['tensor'].view(1,iteration['tensor'].size()[0],-1)))
    pred = softmax(model_GRU(iteration['tensor'].view(1,iteration['tensor'].size()[0],-1)))
    print(pred)
    _, index = torch.max(pred, 1)
    probas = {label : pred[0,i].item() for i, label in enumerate(dataset_GRU.labels)}
    return dataset_GRU.labels[index], probas
    
predict('Necesito una cita de medicina general')

tensor([[-0.0460,  0.0141,  0.1327, -0.0780]], grad_fn=<AddmmBackward>)
tensor([[0.2366, 0.2513, 0.2829, 0.2292]], grad_fn=<SoftmaxBackward>)


('cancelacion',
 {'asignacion': 0.2366206794977188,
  'confirmacion': 0.2512859106063843,
  'cancelacion': 0.282930850982666,
  'agente': 0.22916249930858612})

In [15]:
def predict_dataset(row):
    prediction, probas = predict(row['PREGUNTAS'])
    return [prediction] + [probas[key] for key in probas]

data = dataset.dataset.copy()
data[['prediction', 'asignacion', 'confirmacion', 'cancelacion', 'agente']] = data.progress_apply(lambda x : pd.Series(predict_dataset(x)), axis=1)
data

100%|██████████| 3324/3324 [00:24<00:00, 136.13it/s]


Unnamed: 0,PREGUNTAS,INTENCION CORRECTA,prediction,asignacion,confirmacion,cancelacion,agente
0,querer una cita de medicina general,asignacion,agente,1.0,1.0,1.0,1.0
1,necesitar una cita de cirugia de columna,asignacion,agente,1.0,1.0,1.0,1.0
2,hago una visita de cirugia plastica,asignacion,agente,1.0,1.0,1.0,1.0
3,querer una agenda de gastroenterologia,asignacion,agente,1.0,1.0,1.0,1.0
4,quiero una cita de citologia,asignacion,agente,1.0,1.0,1.0,1.0
5,podria una cita de hematologia pediatrica,asignacion,agente,1.0,1.0,1.0,1.0
6,podria una visita de neurologia adulto,asignacion,agente,1.0,1.0,1.0,1.0
7,podria una agenda de nefrologia pediatrica,asignacion,agente,1.0,1.0,1.0,1.0
8,podria una agenda de cirugia plastica,asignacion,agente,1.0,1.0,1.0,1.0
9,hago una agenda de neurologia adulto,asignacion,agente,1.0,1.0,1.0,1.0


In [11]:
data[data['prediction'] != data['INTENCION CORRECTA']]

Unnamed: 0,PREGUNTAS,INTENCION CORRECTA,prediction,asignacion,confirmacion,cancelacion,agente
15,necesitamos una visita con el doctor de medicina general,asignacion,agente,0.251112,0.207524,0.263126,0.278238
16,puede una cita con el medico de nutricion,asignacion,cancelacion,0.260151,0.225896,0.282042,0.231910
18,querer una consulta con el doctor de citologia,asignacion,agente,0.261003,0.203226,0.257407,0.278364
19,podria una agenda con el medica de reumatologia pediatrica,asignacion,agente,0.259575,0.218815,0.249187,0.272423
20,querer una visita con el doctor de nutricion,asignacion,cancelacion,0.256472,0.201462,0.280770,0.261296
21,quiero una cita con el especialista de neurologia,asignacion,agente,0.263377,0.207194,0.260490,0.268939
23,necesitamos una visita con el cirujano de citologia,asignacion,cancelacion,0.239650,0.218760,0.272565,0.269025
24,podria una visita con el cirujano de cardiologia pediatrica,asignacion,cancelacion,0.243224,0.204534,0.290703,0.261539
25,necesitamos una visita con el cirujana de cardiovascular,asignacion,cancelacion,0.250820,0.214868,0.276792,0.257519
26,queremos una consulta con el doctora de cirugia bariatrica,asignacion,agente,0.270041,0.210873,0.247353,0.271733


In [10]:
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
print(input)
target = torch.empty(3, dtype=torch.long).random_(5)
print(target)
output = loss(input, target)
print(output)

tensor([[ 1.4601e-01,  5.7151e-01, -7.0706e-01, -6.6299e-01,  4.3710e-01],
        [-2.7718e+00, -5.4043e-04,  6.6015e-01,  1.1862e+00,  3.9423e-01],
        [-5.3009e-01,  6.1172e-01, -1.4582e-03, -7.4532e-01, -8.5128e-03]],
       requires_grad=True)
tensor([2, 2, 2])
tensor(1.7958, grad_fn=<NllLossBackward>)


In [11]:
model.wv.vector_size

300

In [32]:
weights = torch.FloatTensor(model.wv.vectors)
embedding = nn.Embedding.from_pretrained(weights)
# Get embeddings for index 1
input = torch.LongTensor([1])
embedding(input)

tensor([[-1.1705,  5.0482,  2.4649,  0.7217, -2.4352,  1.1867, -1.1970, -1.9468,
          1.2807, -1.4742, -0.7545, -0.5262,  4.0642,  0.2096,  0.2314,  0.1138,
         -1.7481,  0.6004,  1.7987,  1.7062, -2.0083, -0.5639,  1.9007, -1.0101,
         -1.2910, -1.0263,  1.1214, -2.2167,  4.6265,  1.2347, -0.7115, -0.4033,
          1.7796, -1.2604,  1.9289, -0.3498, -1.1462,  1.9832, -1.2624, -2.5941,
          2.8553,  0.3324, -1.6055,  2.5291, -1.4236, -2.4208, -3.3017, -0.8893,
         -0.0917, -3.5893, -1.1435,  4.4766, -0.5909,  1.2724,  3.2217, -0.5666,
         -2.2370, -1.5040, -1.5517, -0.3843, -1.6675, -1.1773, -3.9576, -1.1701,
          0.3911,  0.5919, -0.3836, -1.1477,  0.8444,  2.5014,  2.4875, -2.7076,
          0.6172,  2.2056, -0.9598, -1.8545, -0.9006, -0.6951,  2.9650,  0.2486,
          0.7362,  0.1373, -3.1565, -1.1593, -1.7125, -1.0612, -1.4765, -0.2727,
         -0.7277, -2.9582, -1.2254,  0.6468,  0.7515, -4.0886, -1.1736,  1.3893,
         -1.1203,  3.4329,  

In [30]:
weights.shape()

TypeError: 'torch.Size' object is not callable

In [31]:
model.wv.vectors.shape

(517389, 300)

In [34]:
model.wv.get_vector("hola como estas").shape

(300,)