In [1]:
import torch
from torch.utils.serialization import load_lua
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
import re


def code2char(code, vocab):
    vocab_map = dict(zip(vocab.values(), vocab.keys()))
    return "".join(vocab_map[c] for c in code)

def char2code(text, vocab):
    data = torch.ByteTensor(len(text))
    for i, c in enumerate(text):
        data[i] = vocab[c]
    return data


class CharDataset(Dataset):
    """
        Charge un fichier data_file tenseur 1D d'entiers et le decoupe en sequences de longueur seq_length. Vocab_file est un dictionnaire
        de characteres vers entier.
    """
    def __init__(self, data_file, vocab_file, seq_length_entry, seq_length_exit):
        self.data_file = data_file
        self.vocab_file = vocab_file
        self.seq_length_entry = seq_length_entry
        self.seq_length_exit = seq_length_exit
        self.data = torch.load(data_file)
        self.vocab = torch.load(vocab_file)
        self.vocab_map = dict(zip(self.vocab.values(), self.vocab.keys()))
        self.nb_samples = len(self.data) - self.seq_length_exit - self.seq_length_entry +1
        #print('cutting off end of data so that the batches/sequences divide evenly')
        #self.data = self.data[:(self.seq_length_exit * self.nb_samples + 1)]
        self.vocab_size = len(self.vocab)

    def __getitem__(self, index):
        #index = min(index, len(self.data) - self.seq_length_exit - self.seq_length_entry )
        start, end = index, index +  self.seq_length_entry
        return self.data[start:end], self.data[end:(end + self.seq_length_exit)]

    def __len__(self):
        return self.nb_samples



def make_files(text_file, out_tensorfile=None, out_vocabfile=None, vocab_file=None):
    """ Permet de creer a partir d'un fichier texte le tenseur 1D encode en entier et le mapping
    des caracteres vers leur code.
    * text_file : le fichier texte brut
    * out_tensorfile : si specifie, sauve le tenseur 1D de sortie
    * out_vocabfile : si specifie, sauve le mapping entre caractere et code
    * vocab_file : si specifie, charge le mapping entre caractere et code
    """
    #with open(text_file) as f:
    #    text = re.sub(r'[^a-zA-Z0-9_\s]', ' ', f.read().lower())
    #    text = re.sub(r'\s+', ' ', text)

    text = re.split('\W+', text_file)

    if vocab_file is not None:
        vocab = torch.load(vocab_file)
    else:
        chars = set(text)
        chars.add(' ')
        chars.add('.')
        chars.add(',')
        vocab = dict(zip(sorted(chars), range(len(chars))))
    data = char2code(text, vocab)
    if out_vocabfile is not None:
        torch.save(vocab, out_vocabfile)
    if out_tensorfile is not None:
        torch.save(data, out_tensorfile)
    return data, vocab


In [2]:
import torch.nn as nn
import torch.nn.functional as F
import torch
from torchvision import datasets, transforms
import torch.autograd
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import random
import time
import math
import numpy.random as rdn


class Neural_net(nn.Module):
    def __init__(self, num_layers, size_list, f):
        super(Neural_net, self).__init__()

        self.num_layers = num_layers

        self.linear = nn.ModuleList([nn.Linear(size_list[i], size_list[i + 1]) for i in range(num_layers)])

        self.f = f

    def forward(self, x):
        for layer in range(self.num_layers):
            x = self.f(self.linear[layer](x))

        return x;


class Encoder(nn.Module):
    def __init__(self, size_h, size_dict, f):
        super(Encoder, self).__init__()
        self.w1 = nn.Linear(size_dict, size_h)
        self.w2 = nn.Linear(size_h, size_h)
        self.f = f

    def forward(self, h, x_i):
        result = self.f(self.w1(x_i) + self.w2(h))
        return result;


class Decoder(nn.Module):
    def __init__(self, size_h, size_dict):
        super(Decoder, self).__init__()
        self.w1 = nn.Linear(size_h, size_dict)

    def forward(self, h):
        result = F.softmax(self.w1(h))
        return result;


class LSTM(nn.Module):
    def __init__(self, size_x, size_y, size_dict, size_h, f):
        super(LSTM, self).__init__()
        self.size_x = size_x
        self.size_y = size_y
        self.size_h = size_h
        self.size_dict = size_dict
        self.f = f

        self.e1 = Encoder(self.size_h, self.size_dict, self.f)
        self.e2 = Encoder(self.size_h, self.size_dict, self.f)
        self.d = Decoder(self.size_h, self.size_dict)

        self.mod_x_1 = nn.Linear(self.size_dict, self.size_h)
        self.mod_x_2 = nn.Linear(self.size_dict, self.size_h)
        self.mod_x_3 = nn.Linear(self.size_dict, self.size_h)
        self.mod_x_4 = nn.Linear(self.size_dict, self.size_h)

        self.mod_h_1 = nn.Linear(self.size_h, self.size_h)
        self.mod_h_2 = nn.Linear(self.size_h, self.size_h)
        self.mod_h_3 = nn.Linear(self.size_h, self.size_h)
        self.mod_h_4 = nn.Linear(self.size_h, self.size_h)

    def forward(self, h, c, mat_xv, mat_yv):
        out = []
        for i in range(self.size_x):
            f = F.sigmoid(self.mod_x_1.forward(mat_xv[:, i]) + self.mod_h_1.forward(h))
            a = F.sigmoid(self.mod_x_2.forward(mat_xv[:, i]) + self.mod_h_2.forward(h))
            t = F.tanh(self.mod_x_3.forward(mat_xv[:, i]) + self.mod_h_3.forward(h))

            c = c.mul(f) + a.mul(t)
            h = F.sigmoid(self.mod_x_2.forward(mat_xv[:, i]) + self.mod_h_2.forward(h)).mul(F.tanh(c))

        for j in range(self.size_y):
            out.append(self.d.forward(h.view(-1)))
            h = self.e2.forward(h.view(-1), mat_yv[:, j])
        return out;

    def forward2(self, h, c, mat_xv):
        out2 = []
        for i in range(self.size_x):
            f = F.sigmoid(self.mod_x_1.forward(mat_xv[:, i]) + self.mod_h_1.forward(h))
            a = F.sigmoid(self.mod_x_2.forward(mat_xv[:, i]) + self.mod_h_2.forward(h))
            t = F.tanh(self.mod_x_3.forward(mat_xv[:, i]) + self.mod_h_3.forward(h))

            c = c.mul(f) + a.mul(t)
            h = F.sigmoid(self.mod_x_2.forward(mat_xv[:, i]) + self.mod_h_2.forward(h)).mul(F.tanh(c))

        for j in range(self.size_y):
            proba = self.d.forward(h.view(-1))
            pred = torch.multinomial(proba, 1, replacement=True)[0]
            out2.append(pred.data.numpy()[0])
            # tirage par rapport à
            h = self.e2.forward(h.view(-1), proba)
        return out2;




In [8]:
import torch.autograd
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import random
import time
import numpy.random as rdn
import re

# PARAMETERS
T = 4  # taille d'entrée
size_h = 55  # taille de l'espace latent
batch_size = 1
n = 1  # taille de sortie

text_file = "/Users/melkigabriel/Desktop/textforrnn.txt.rtf"

with open(text_file) as f:
    text_test = re.sub(r'[^a-zA-Z0-9_\s]', ' ', f.read().lower())
    text_test = re.sub(r'\s+', ' ', text_test)
text_test=text_test[254:]   
bobo = make_files(text_test, "mon_tenseur.pt", "mon_vocab.pt")

V = len(bobo[1].keys())  # taille du dico

cdset = CharDataset("mon_tenseur.pt", "mon_vocab.pt", T, n)

dataload = DataLoader(cdset, batch_size=batch_size, shuffle=False)

loss = nn.NLLLoss()  # on rentre un logSoftMax

model = LSTM(size_x=T, size_y=n, size_dict=V, size_h=size_h, f=F.tanh)
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)


In [10]:
#APPRENTISSAGE
a = time.time()
i=0

err =Variable(torch.ones(1))
while(time.time() - a < 3*60):
    optimizer.zero_grad()
    err = 0
    for x_train, y_train in dataload:
        mat_x_train = torch.FloatTensor(V, T).fill_(0)
        for j in range(T):
            mat_x_train[x_train[0][j] - 1, j] = 1
        mat_x_train = Variable(mat_x_train)

        y_train = y_train.type(torch.ByteTensor)

        mat_y_train = torch.FloatTensor(V, n).fill_(0)
        for j in range(n):
            mat_y_train[y_train[0][j] - 1, j] = 1
        mat_y_train = Variable(mat_y_train)

        #y_train = Variable(y_train[0])

        h = Variable(torch.zeros(size_h, x_train.size()[0]).view(-1))
        c = Variable(torch.zeros(size_h, x_train.size()[0]).view(-1))
    
        out = model.forward(h,c, mat_x_train, mat_y_train)

        for j in range(n):
            err += loss.forward(torch.log(out[j].view(1, V)), Variable(y_train[0])[j].type(torch.LongTensor) )
    if(i%(int(dataload.dataset.data.size()[0]/100))==0):
        print("=======================")
        print(err)
        print(code2char(x_train.view(-1),cdset.vocab))
        print(code2char(y_train.view(-1),cdset.vocab))
        print([code2char(torch.max(out[i].data,-1)[1],cdset.vocab) for i in range(n)])            
    i +=1
    err.backward()
    
    optimizer.step()

Variable containing:
 764.2690
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']
Variable containing:
 196.4538
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']
Variable containing:
 44.2542
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']
Variable containing:
 12.3192
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']
Variable containing:
 4.5499
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']
Variable containing:
 2.1619
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']
Variable containing:
 1.2099
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']
Variable containing:
 0.7508
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']
Variable containing:
 0.5176
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']
Variable containing:
 0.3875
[torch.FloatTensor of size 1]

informationsincerelygabrielmelki

['']


In [11]:
##ACCURACY ON TRAIN
acc = 0
k = 0
for x_train, y_train in dataload:
    mat_x_train = torch.FloatTensor(V, T).fill_(0)
    for j in range(T):
        mat_x_train[x_train[0][j] - 1, j] = 1
    mat_x_train = Variable(mat_x_train)

    mat_y_train = torch.FloatTensor(V, n).fill_(0)
    for j in range(n):
        mat_y_train[y_train[0][j] - 1, j] = 1
    mat_y_train = Variable(mat_y_train)
    h = Variable(torch.zeros(size_h, x_train.size()[0]).view(-1))
    c = Variable(torch.zeros(size_h, x_train.size()[0]).view(-1))
    out2 = model.forward2(h, c, mat_x_train)
    out = model.forward(h, c, mat_x_train,mat_y_train)
    acc += 1 * (out2[0] == y_train[0, 0])

    k += 1
    if(k%(int(dataload.dataset.data.size()[0]/10))==0):
        print(code2char(x_train.view(-1), cdset.vocab))
        print(code2char(y_train.view(-1), cdset.vocab))
        print(code2char(out2, cdset.vocab))
        print(code2char(torch.max(out[0].data, -1)[1], cdset.vocab))
        print(acc, k)
        print('=============')
print(acc / k)


operationsinhochi
minh
minh
minh
43 43
fieldwhichrequiresan
aptitude
aptitude
aptitude
86 86
alsoshownmyskills
in
in
in
129 129
therightfeaturesand
ratio
ratio
ratio
172 172
tripinisraelfocused
on
on
on
215 215
axaheadquartersraising25k
currently
currently
currently
258 258
ofthe250accounts
to
to
to
301 301
workinginfirm
within
within
within
344 344
overviewofmanycultures
and
and
and
386 387
0.997663551402
