importer données rotten tomatoes

In [1]:
import numpy as np
import torch
import torchvision
import unicodedata
import string
import re
import random
import pickle as pkl
from torch import nn
from torch.autograd import Variable


#load RottenTomatoes sentiment analysis dataset
def load_tsv(filename):
    phID, stcID, ph, sentiment = [],[],[],[]
    with open(filename) as f: 
        l = f.readline()
        if (len(l.split("\t")) == 4):
            '''TRAIN dataset'''
            for line in f:
                l = line.split("\t")
                if (int(l[3][0]) < 2):
                    phID.append(l[0]), stcID.append(l[1]), ph.append(l[2]), sentiment.append(0)
                else:
                    phID.append(l[0]), stcID.append(l[1]), ph.append(l[2]), sentiment.append(1)
        else:
            for line in f:
                l = line.split("\t")
                phID.append(l[0]), stcID.append(l[1]), ph.append(l[2])
                
    return np.array(phID).astype(int), np.array(stcID).astype(int), np.array(ph), np.array(sentiment).astype(int)

#make validation dataset
def split_train_test(data, percentage=80):
    size = len(data)
    indexes = np.arange(0,size)
    np.random.shuffle(indexes)
    
    train_indexes = indexes[0:int(size*percentage/100)]
    test_indexes = indexes[int(size*percentage/100):-1]
    
    return train_indexes, test_indexes
    
phraseID, sentenceID, sentences, sentiment = load_tsv("SKIM-RNN/train.tsv")
train_indexes, test_indexes = split_train_test(phraseID)

Module de preprocessing

In [23]:
'''Trim, Store, Count, Index words from dataset'''

SOS_token = 0
EOS_token = 1

class Preprocesser():
    
    '''main functions'''
    def __init__(self,corpus):
        '''corpus : np_array(string)'''
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS
        self.corpus = corpus
        self.size = len(corpus)
    
    # Lowercase, trim, and remove non-letter characters
    # (no stop words in skim rnn ?)
    def normalize(self):
        new_corpus = np.array([])
        steps,i = np.arange(0,self.size,self.size/10),0
        for s in self.corpus:
            uni_s = s.tostring().decode('unicode-escape')
            uni_s = self.unicodeToAscii(uni_s.lower().strip())
            uni_s = re.sub(r"([.!?])", r" \1", uni_s)
            uni_s = re.sub(r"[^a-zA-Z.!?]+", r" ", uni_s)
            new_corpus = np.append(new_corpus, uni_s)
            if (i in steps):
                print ("...")
            i+=1
        self.corpus = new_corpus


    def addSentences(self):
        for sentence in self.corpus:
            self.addSentence(sentence)
        
    '''called within the module:'''
    def unicodeToAscii(self,s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )

        
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
            
            
    def save(self,filename):
        pkl.dump(self.corpus,open(filename+"_corpus.pkl",'wb'))
        pkl.dump(self.word2count,open(filename+"_w2c.pkl",'wb'))
        pkl.dump(self.word2index,open(filename+"_w2i.pkl",'wb'))
        pkl.dump(self.index2word,open(filename+"_i2w.pkl", 'wb'))
    
    def load(self, filename):
        self.corpus = pkl.load(open(filename+"_corpus.pkl", 'rb'))
        self.word2count = pkl.load(open(filename+"_w2c.pkl", 'rb'))
        self.word2index = pkl.load(open(filename+"_w2i.pkl", 'rb'))
        self.index2word = pkl.load(open(filename+"_i2w.pkl", 'rb'))
        self.size = len(self.corpus)
        self.n_words = len(self.word2count.keys())
        
 

In [24]:
#preprocesser = Preprocesser(sentences)
#preprocesser.normalize()
#preprocesser.addSentences()

#preprocesser.save("preprocessing_IMDB")


preprocesser = Preprocesser({})
preprocesser.load("preprocessing_IMDB")

Modules RNN

In [145]:
'''RNN module : embedding layer -> gru layer -> linear layer (for output classification) -> softmax'''

class RNN(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(input_size, hidden_size)
        
        self.h20 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax()

    def forward(self, input, hidden):
        #embedded = self.embedding(input).view(1 , 1, -1)
        #output = embedded
        for i in range(self.n_layers):
            output, hidden = self.gru(input.view(1, 1, -1), hidden)
            output = self.h20(output)
            output = self.softmax(output.view(1,-1))
            #lstm - > (input, (hidden,c <- ???) , cf pytorch doc, 'initial cell state')
        return output, hidden
    
    def initHidden(self):
        return Variable(torch.rand(1, 1, self.hidden_size))
    
    
    
class SkimRNN(nn.Module):
    
    def __init__(self, input_size, d, dprime, output_size, k  ):
        super(SkimRNN, self).__init__()
        self.softmax_p = nn.LogSoftmax()
        self.linear_p = nn.Linear(input_size+d, k)
        self.mainRNN = RNN(input_size, d, output_size)
        self.smallRNN = RNN(input_size, dprime, output_size)
        self.d = d
        self.dprime = dprime
        
    def forward(self, input, hidden):
        cat = torch.cat( (input.view(1, 1, -1), hidden), 2)
        linp = self.linear_p(cat)
        p = self.softmax_p(self.linear_p(cat).view(1,-1))
        q = p.exp().multinomial()
        print q.data[0,0]
        if q.data[0,0]==0:
            #main RNN
            output, hidden = self.mainRNN(input, hidden)
        else:
            #small RNN
            output, h = self.smallRNN(input, hidden)
            hidden = torch.cat( (h, hidden[self.dprime+1:-1]), 2)
        return output, p, hidden
        
    def initHidden(self):
        return Variable(torch.rand(1, 1, self.d))
    
        
    
    

Fonctions pour la gestion des inputs 

In [105]:
def binarize(data, n_words, neg_value=0):
    """
    encode target en one-hot, si neg_value 
    vaut zéro, ou en -1/1, si neg_value vaut
    -1 par exemple
    """
    y_onehot = torch.FloatTensor(n_words)
    y_onehot.zero_().add_(neg_value)
    return y_onehot.scatter_(0, data, 1)

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def variableFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    result = torch.LongTensor(indexes).view(-1, 1)
    return result


def makeInputTarget(lang, sentence, target, n_classes=2):
    input_variable = variableFromSentence(lang, sentence)
    if target >= n_classes:
        print 'target not in range (0, #classes - 1)'
        return -1
    target_variable = Variable(torch.LongTensor([target]))
    return (input_variable, target_variable)


Fonctions pour l'apprentissage

In [186]:
def gumbel():
    return -torch.log(-torch.log(torch.FloatTensor(2).uniform_()))

def r(logp, g, temperature):
    num = [torch.exp( (pi + gi)/temperature) for (pi,gi) in zip(logp,g)]
    denum = torch.sum(torch.exp( (logp[0].data + g)/temperature))
    return [n.data/denum for n in num]


def train(preprocesser, input,target,rnn, optimizer, criterion, temperature):
    hidden = rnn.initHidden()

    rnn.zero_grad()
    input_length = input.size()[0]
    logp = []
    rp = []
    loss = 0
    
    for t in range(input_length):
        input = binarize(input[t], preprocesser.n_words)
        
        
        output, p, hidden = rnn(Variable(input), hidden)
        logp.append(p)
    
        g = gumbel()    
        rp.append(r(logp,g,temperature))
    
        c = criterion(output.view(1,-1), target)
        print c.data
        print np.prod(rp[t])
        loss += criterion(output.view(1,-1), target).data * np.prod(rp[t])
    loss += (gamma/input_length) * np.sum(logp[:,1])
    loss.backward()

    optimizer.step()
    return output ,loss.data[0]





Hyperparamètres + apprentissage

In [187]:
from torch import optim
rnn =  SkimRNN(preprocesser.n_words, 128, 5, 2, 2)
criterion = nn.NLLLoss()
optimizer = optim.Adam(rnn.parameters(), lr=1e-2)
epoch,mod = 0,10
losses = []
accs = []

for i in train_indexes[:10]:
    
    s,t =preprocesser.corpus[i], sentiment[i]
    input,target =  makeInputTarget(preprocesser, s, t)
    output, l = train(preprocesser,input,target,rnn,optimizer,criterion,1)
    
    losses.append(l)



0

 0.5917
[torch.FloatTensor of size 1]


 0.2754  0.2879
[torch.FloatTensor of size 1x2]



TypeError: scatter_ received an invalid combination of arguments - got (int, float, int), but expected one of:
 * (int dim, torch.LongTensor index, float value)
      didn't match because some of the arguments have invalid types: ([32;1mint[0m, [31;1mfloat[0m, [32;1mint[0m)
 * (int dim, torch.LongTensor index, torch.FloatTensor src)
      didn't match because some of the arguments have invalid types: ([32;1mint[0m, [31;1mfloat[0m, [31;1mint[0m)


# TODO

- embedded ? à supprimer ou à faire dans skim rnn plutôt que dans les RNN
- debug petit RNN (hidden state vs GRU)
- mieux gérer one hot (mot courant)
- preprocessing fait façon tuto pytorch, cf GlOVe

