Environnement
=============

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("../input"))

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field,ReversibleField,TabularDataset,Iterator, BucketIterator



['SICK_dev_logistic.txt', 'SICK_test.txt', 'SICK_train_logistic.txt']


Lecture des données
===================
Pour de plus amples informations, on recommande la consultation de :
https://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/ 
et de:
https://torchtext.readthedocs.io/en/latest/

In [2]:
def tokenize(text):
    return text.split()

def proc_float(value):
    return float(value)

def proc_int(value):
    return int(value)

TEXT      = Field(sequential=True, tokenize=tokenize) #might alternatively specify cuda data types to get the dataset to live permanently on the GPU
FLOAT     = Field(sequential=False, use_vocab=False,dtype=torch.float,preprocessing=proc_float) 
INTEGER   = Field(sequential=False, use_vocab=False,preprocessing=proc_int)

df         = TabularDataset("../input/SICK_train_logistic.txt","tsv",skip_header=True,\
                            fields=[('idx',INTEGER),('sentA',TEXT),('sentB',TEXT),('Relatedness',FLOAT)])
df_train,df_dev  = df.split(split_ratio=0.8)
#TEXT.build_vocab(df_train)
#glove
TEXT.build_vocab(df_train,vectors='glove.6B.100d')

#Prints out the first few lines of the dataset
for elt in df_dev[:10]: #prints out the ten first examples
    print(elt.idx,' '.join(elt.sentA),'||',' '.join(elt.sentB),elt.Relatedness)
    
df_test = TabularDataset("../input/SICK_test.txt","tsv",skip_header=True,fields=[('idx',INTEGER),('sentA',TEXT),('sentB',TEXT)])
for elt in df_test[-10:]: #prints out the ten first examples
    print(elt.idx,' '.join(elt.sentA),'||',' '.join(elt.sentB))
print(len(df_test))

.vector_cache/glove.6B.zip: 862MB [06:28, 2.22MB/s]                          
100%|█████████▉| 399747/400000 [00:29<00:00, 13719.74it/s]

8108 A topless woman is being covered in mud || A topless woman is being smeared with a brown substance and a blurry crowd is in the background 0.76
6160 A race car driver is standing up and pointing his hand at the sky || The person in the blue jacket is wearing a colorful helmet 0.36
1670 A boat is sailing peacefully over the water || The guitar is being played by a man 0.2
415 Two men with bikes are on the side of a snowy road || Two men with cars are on the side of a snowy road 0.7
7511 There are no dogs fighting for a frisbee in a lake || Two brown dogs are playing with a frisbee in the water 0.78
3777 A woman is dancing and a man is playing the keyboard || The woman is dancing and a man is playing the keyboard 0.9800000000000001
3574 Paper is being cut with scissors || The piece of paper is being cut 0.8800000000000001
7213 A brown dog is running breathlessly across the yard with a toy in its mouth || A brown dog is running across the yard with a toy in its mouth 0.9
4340 A man i

Classification
==============

In [3]:
class ParaphraseClassifier(nn.Module):
    
    def __init__(self,hidden_dim,embedding_dim):

        super(ParaphraseClassifier, self).__init__()
        
        self.hidden_dim    = hidden_dim
        self.embedding_dim = embedding_dim
        self.embedding     = nn.Embedding(len(TEXT.vocab), embedding_dim)
        self.embedding.weight.data.copy_(TEXT.vocab.vectors)
        self.embedding.weight.requires_grad=False
        self.lstm          = nn.LSTM(embedding_dim, hidden_dim, num_layers=1,bidirectional=False)
        self.Wadd          = nn.Linear(hidden_dim,hidden_dim)   
        self.Wtimes        = nn.Linear(hidden_dim,hidden_dim)
        self.Wout          = nn.Linear(hidden_dim,1)


    def forward(self,xinputA,xinputB):
        """
        Args:
            xinputA is a sequence of word indexes
            xinputB is a sequence of word indexes
        The forward method also works for batched input.       
        """
        ##details for dimensionalities
        #embeddings
        #  input : batch_size x seq_length
        #  output: batch-size x seq_length x embedding_dimension
        #lstm
        #  input : seq_length x batch_size x embedding_size
        #  output: seq_length x batch_size x hidden_size  (for the sequence)
        #  output: batch_size x hidden_size (for the last hidden/cell state)
                
        xembeddedA                       = self.embedding(xinputA)                                                #catches embedding vectors
        lstm_outA, (hiddenA,cellA)       = self.lstm(xembeddedA.view(len(xinputA), -1, self.embedding_dim), None) #-1 is a wildcard (here we let pytorch guess batch size)

        xembeddedB                       = self.embedding(xinputB)                                                #catches embedding vectors
        lstm_outB, (hiddenB,cellB)       = self.lstm(xembeddedB.view(len(xinputB), -1, self.embedding_dim), None)

        #hiddenA = hiddenA.view(-1,self.hidden_dim * 2)
        #hiddenB = hiddenB.view(-1,self.hidden_dim * 2)       
        #merge sentence representations
        hiddenT = hiddenA * hiddenB
        hiddenD = torch.abs(hiddenA - hiddenB)
        hidden  = torch.tanh(self.Wtimes(hiddenT) + self.Wadd(hiddenD))
        return torch.sigmoid(self.Wout(hidden))

    def run_train(self,train_set,dev_set,epochs,learning_rate=0.001):

        loss_func  = nn.BCELoss() 
        optimizer  = optim.Adam(self.parameters(), lr=learning_rate)
                
        train_iterator,dev_iterator = BucketIterator.splits((train_set,dev_set),batch_sizes=(64,1),device=-1,sort_key=lambda x: len(x.sentA),sort_within_batch=False,repeat=False)
       
        bestloss=1
        for e in range(epochs):
            for batch in train_iterator: 
                xvecA,xvecB,yRelness = batch.sentA,batch.sentB,batch.Relatedness
                self.zero_grad()
                prob            = self.forward(xvecA,xvecB).squeeze()
                loss            = loss_func(prob,yRelness)
                loss.backward()
                optimizer.step()
            for batchdev in dev_iterator: 
                xvecA,xvecB,yRelness = batchdev.sentA,batchdev.sentB,batchdev.Relatedness
                pred= self.forward(xvecA,xvecB).squeeze().view(1)
                lossnow = loss_func(pred,yRelness)
                #print(pred.shape,yRelness.shape,"dev")
                if (lossnow < bestloss) :
                    bestloss=lossnow
                    torch.save(self.state_dict(),"params.pth")
        self.load_state_dict(torch.load("params.pth"))
        
    def run_test(self,test_set) : 
        self.eval()
        self.load_state_dict(torch.load("params.pth"))
        test_iterator = Iterator(test_set, batch_size=1, device=-1, sort=False, sort_within_batch=False, repeat=False)
        pred=[]
        idx=[]
        for batch in test_iterator: 
                xvecA,xvecB = batch.sentA,batch.sentB
                pred.append((self.forward(xvecA,xvecB).squeeze().item()/2)*10)
                idx.append(batch.idx.item())
        submission = pd.DataFrame({"pairID" : idx ,"Relatedness" : pred})
        #enregistre les preds
        filename = 'DugrainCharlotte_SICK.csv'
        submission.to_csv(filename,index=False)
        return pred,submission

pc = ParaphraseClassifier(150,100)
pc.run_train(df_train,df_dev,25)

100%|█████████▉| 399747/400000 [00:40<00:00, 13719.74it/s]

In [4]:
pred,submission=pc.run_test(df_test)

In [5]:
submission.head()

Unnamed: 0,pairID,Relatedness
0,3698,4.151008
1,5731,4.08758
2,6055,4.428403
3,2880,4.686611
4,7536,4.726976
