M2 project : Convolutions and character embeddings
======================

The project aims to predict the language a character sequence comes from. This is done with surnames and involves about a dozen languages.

Data download & description
---------------------

In [None]:
from urllib.request import urlretrieve

# Fetch the training and validation splits into the working directory.
urlretrieve('http://www.linguist.univ-paris-diderot.fr/~bcrabbe/datasets/name2lang.train','name2lang.train')
urlretrieve('http://www.linguist.univ-paris-diderot.fr/~bcrabbe/datasets/name2lang.valid','name2lang.valid')

# Print the beginning of the validation set (lines 0 to 300 inclusive).
# The context manager closes the file even if printing is interrupted.
with open('name2lang.valid') as istream:
    for idx, line in enumerate(istream):
        print(line.strip())
        if idx >= 300:
            break


Barros, Portuguese
Campos, Portuguese
D'cruz, Portuguese
Henriques, Portuguese
Machado, Portuguese
Silva, Portuguese
Torres, Portuguese
Ahearn, Irish
Aonghus, Irish
Brady, Irish
Cearbhall, Irish
Flann, Irish
Kavanagh, Irish
Maguire, Irish
Mcmahon, Irish
Mcneil, Irish
Monahan, Irish
Muirchertach, Irish
Mullen, Irish
O'Connell, Irish
O'Grady, Irish
O'Hara, Irish
O'Mahony, Irish
Rory, Irish
Shannon, Irish
Sioda, Irish
Tadhgan, Irish
Abel, Spanish
Agramunt, Spanish
Aldana, Spanish
Alfaro, Spanish
Aquino, Spanish
Arena, Spanish
Blanco, Spanish
Bustos, Spanish
Cardona, Spanish
Castellano, Spanish
Del olmo, Spanish
Etxeberria, Spanish
Garrastazu, Spanish
Hierro, Spanish
Loyola, Spanish
Maradona, Spanish
Mas, Spanish
Nieves, Spanish
Ortega, Spanish
Pelaez, Spanish
Robles, Spanish
Roldan, Spanish
Suero, Spanish
Tomas, Spanish
Torres, Spanish
Tos, Spanish
Ubina, Spanish
Urena, Spanish
Valdez, Spanish
Varela, Spanish
Vasquez, Spanish
Villa, Spanish
Villaverde, Spanish
Zavala, Spanish
Pham, Vietna

First exercise : data preprocessing (3pts)
---
The first exercise consists in creating encodings from integers to strings and from strings to integers.

In [None]:
def vocabulary(filename, char_vocab, pad_token='<pad>'):
    """
    Build the index <-> symbol encodings for one column of the dataset.

    Every non-blank line of the file is expected to look like
    ``name, language``. Fields are stripped of surrounding whitespace so
    that the symbols match those returned by ``read_dataset``.

    Args:
      filename (str)    : the name of the file
      char_vocab (bool) : if True, extract the characters of the names,
                          otherwise extract the language codes
      pad_token(str)    : the value of the pad symbol

    Returns:
      (idx2sym, sym2idx): ``idx2sym`` is the list of symbols in sorted order
      followed by the pad token; ``sym2idx`` is the inverse mapping, so
      ``idx2sym[sym2idx[s]] == s`` holds for every symbol, pad included.
    """
    field = 0 if char_vocab else 1
    with open(filename, "r") as stream:
        # Blank lines carry no record; skipping them avoids an IndexError
        # when the language column is requested.
        entries = [line.split(",")[field].strip()
                   for line in stream if not line.isspace()]

    if char_vocab:
        symbols = {char for entry in entries for char in entry}
    else:
        symbols = set(entries)
    # The pad token is appended explicitly below; never list it twice.
    symbols.discard(pad_token)

    # Sorting makes the encoding reproducible from one run to the next.
    idx2sym = sorted(symbols)
    sym2idx = {symbol: idx for idx, symbol in enumerate(idx2sym)}

    # Register the pad index BEFORE appending so that it points at the slot
    # the pad token actually occupies (the last one).
    sym2idx[pad_token] = len(idx2sym)
    idx2sym.append(pad_token)
    return idx2sym, sym2idx

In [None]:
voc_i2x, voc_x2i = vocabulary("name2lang.train", True)
print(voc_i2x)
print('\n', voc_x2i)

['L', 'D', 'b', 't', 'I', 's', 'A', 'Q', 'S', 'J', 'K', 'O', 'h', 'l', 'q', 'F', 'n', 'j', 'M', 'U', "'", 'f', 'w', 'x', 'k', 'm', 'X', 'C', 'a', 'u', 'Z', 'W', 'o', 'V', 'T', 'E', 'i', 'e', 'Y', 'R', 'y', 'G', 'N', 'd', 'P', ' ', 'H', 'g', 'r', 'p', 'v', 'z', 'c', 'B', '<pad>']

 {'L': 0, 'D': 1, 'b': 2, 't': 3, 'I': 4, 's': 5, 'A': 6, 'Q': 7, 'S': 8, 'J': 9, 'K': 10, 'O': 11, 'h': 12, 'l': 13, 'q': 14, 'F': 15, 'n': 16, 'j': 17, 'M': 18, 'U': 19, "'": 20, 'f': 21, 'w': 22, 'x': 23, 'k': 24, 'm': 25, 'X': 26, 'C': 27, 'a': 28, 'u': 29, 'Z': 30, 'W': 31, 'o': 32, 'V': 33, 'T': 34, 'E': 35, 'i': 36, 'e': 37, 'Y': 38, 'R': 39, 'y': 40, 'G': 41, 'N': 42, 'd': 43, 'P': 44, ' ': 45, 'H': 46, 'g': 47, 'r': 48, 'p': 49, 'v': 50, 'z': 51, 'c': 52, 'B': 53, '<pad>': 55}


In [None]:
def pad_sequence(sequence,pad_size,pad_token):
    """
    Return a new list holding `sequence` padded to exactly `pad_size` items.

    The missing positions are filled with `pad_token`, split between both
    ends of the sequence; when the number of missing positions is odd the
    extra pad goes to the right. A sequence that is already at least
    `pad_size` long is returned (as a copy) without truncation.

    Args:
      sequence (iterable) : the symbols (or integer codes) to pad
      pad_size (int)      : the target length
      pad_token           : the filler value, of the same kind as the items
    """
    padded = list(sequence)
    missing = pad_size - len(padded)
    if missing <= 0:
        return padded
    left = missing // 2
    right = missing - left
    # Wrap the token in a list: concatenating the bare token to a list
    # raises TypeError, and to a string it would add len(token) characters.
    return [pad_token] * left + padded + [pad_token] * right

def code_sequence(charseq,encodingmap):
    """
    Translate a sequence of symbols into their integer codes.

    Symbols missing from `encodingmap` (e.g. characters never seen in the
    train set) are silently dropped, so the result may be shorter than
    `charseq`.
    """
    codes = []
    for symbol in charseq:
        if symbol not in encodingmap:
            continue
        codes.append(encodingmap[symbol])
    return codes

def decode_sequence(idxseq,decodingmap):
    """
    Translate a list of integer codes back into symbols.

    `decodingmap` is indexed with each code in turn, so an unknown code
    raises whatever the container raises on a failed lookup.
    """
    symbols = []
    for code in idxseq:
        symbols.append(decodingmap[code])
    return symbols

Second exercise : data generator (2pt)
------------

The data generator aims to efficiently deliver well-formed batches of data to the model.

In [None]:
def read_dataset(filename,input_symbols):
    """
    Read one column of a raw ``name, language`` data file.

    Args:
      filename (str)       : the name of the file
      input_symbols (bool) : if True, return each surname as a list of
                             characters; otherwise return the language code

    Returns:
      A list with one entry per non-blank line, in file order.

    Raises:
      ValueError: if a non-blank line does not hold exactly two
                  comma-separated fields.
    """
    symbols = []
    # The context manager guarantees the file is closed even when a
    # malformed line makes the unpacking below raise.
    with open(filename) as istream:
        for line in istream:
            if line.isspace():
                continue
            word, lang = line.split(',')
            symbols.append(list(word.strip()) if input_symbols else lang.strip())
    return symbols


In [None]:
from random import shuffle

class DataGenerator:
    """
    Holds a ``name, language`` dataset and serves it as encoded batches.

    Attributes:
      pad_token                       : the symbol used to pad short names
      input_idx2sym / input_sym2idx   : character encodings
      output_idx2sym / output_sym2idx : language encodings
      X : the surnames, each one a list of characters
      Y : the language code of each surname
    """

    def __init__(self,filename, parentgenerator = None,pad_token='<pad>'):
        """
        Args:
          filename (str)    : the data file to read
          parentgenerator   : optional generator whose encodings are reused
                              (typically the train generator, so that the
                              validation data shares the train indexes)
          pad_token (str)   : the value of the pad symbol
        """
        self.pad_token = pad_token
        if parentgenerator is not None: #Reuse the encodings of the parent if specified
            self.input_idx2sym,self.input_sym2idx     = parentgenerator.input_idx2sym, parentgenerator.input_sym2idx
            self.output_idx2sym,self.output_sym2idx   = parentgenerator.output_idx2sym, parentgenerator.output_sym2idx
        else:
            self.input_idx2sym,self.input_sym2idx     = vocabulary(filename, True)
            self.output_idx2sym,self.output_sym2idx   = vocabulary(filename, False)
        self.X = read_dataset(filename,True)
        self.Y = read_dataset(filename,False)

    def gen_batches(self, batch_size):
        """Alias of `generate_batches`, kept for existing callers."""
        yield from self.generate_batches(batch_size)

    def generate_batches(self,batch_size):
        """
        Yield the dataset as ``(seqX, seqY)`` batches of encoded examples.

        ``seqX`` is a list of equal-length lists of character indexes and
        ``seqY`` the list of the matching language indexes.

        Args:
          batch_size (int) : maximum number of examples per batch (>= 1)

        Raises:
          ValueError: if `batch_size` is smaller than 1.
          KeyError  : if a language of the dataset is absent from the
                      output encoding.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        assert(len(self.X) == len(self.Y))

        N     = len(self.X)
        idxes = list(range(N))
        # Data ordering: sorting by length groups names of similar size in
        # the same batch, which keeps padding to a minimum. The sort is
        # stable, so the preceding shuffle still randomizes the order among
        # names of equal length.
        shuffle(idxes)
        idxes.sort(key=lambda idx: len(self.X[idx]))

        pad_idx = self.input_sym2idx[self.pad_token]
        for bstart in range(0, N, batch_size):
            batch_idxes = idxes[bstart:bstart+batch_size]
            # Encode first, pad second: code_sequence drops characters that
            # are unknown to the encoding, so padding the raw characters
            # could leave the rows of a batch with different lengths.
            coded     = [code_sequence(self.X[idx],self.input_sym2idx) for idx in batch_idxes]
            batch_len = max(len(seq) for seq in coded)
            seqX      = [pad_sequence(seq,batch_len,pad_idx) for seq in coded]
            seqY      = [self.output_sym2idx[self.Y[idx]] for idx in batch_idxes]
            yield (seqX,seqY)

# Smoke test of the data generator: build it from the train file and dump
# every batch of (at most) 16 encoded examples.
generator = DataGenerator(filename="name2lang.train")
batches_generator = generator.generate_batches(batch_size=16)

for elt in batches_generator:
    print(elt)

16144 16144
[14727, 2329, 15523, 3135, 14098, 15534, 10225, 517, 494, 15508, 1856, 15521, 484, 15223, 738, 1747, 15567, 508, 491, 16012, 495, 15149, 15561, 15372, 15538, 525, 15516, 15529, 15039, 15547, 532, 488, 15004, 15540, 15323, 14040, 1790, 15549, 15528, 14012, 15445, 12960, 611, 519, 737, 932, 2973, 6426, 661, 593, 961, 14915, 11349, 15520, 756, 736, 703, 694, 11860, 14343, 545, 15507, 10298, 2534, 7521, 7130, 15527, 14543, 15541, 660, 592, 1629, 533, 14097, 15553, 10354, 677, 8938, 15342, 671, 761, 4836, 4645, 14394, 605, 687, 631, 709, 547, 638, 15498, 654, 2332, 585, 9096, 511, 2457, 13740, 8943, 7696, 15644, 635, 13223, 715, 15510, 1637, 13891, 15496, 9359, 551, 764, 497, 15550, 753, 674, 539, 13813, 15533, 15568, 571, 2566, 14940, 510, 601, 14184, 15564, 9243, 14428, 712, 10335, 1762, 675, 1379, 15362, 651, 725, 14828, 10818, 574, 12622, 548, 612, 501, 516, 1254, 617, 11592, 1769, 1814, 14310, 722, 625, 421, 604, 584, 9337, 618, 658, 590, 991, 663, 512, 12729, 626, 499, 586

KeyError: ignored

Third exercise : Implement the word embedding submodule (5pts)
-----
This exercise consists in implementing a PyTorch submodule that takes as input a sequence of char indexes and outputs the word embedding corresponding to the sequence.

The module contains no training method and is meant to be used in a larger network. Its use is quite similar to `nn.Embedding`




In [None]:
import torch
import torch.nn as nn

class CharConvolution(nn.Module):
    """
    Character-level word encoder: embeds each character, runs a 1D
    convolution over the character axis and max-pools over positions.

    Used like `nn.Embedding`, except that one vector is returned per
    character sequence instead of one per index.
    """

    def __init__(self,windowK,chars_vocab_size,input_embedding_size,output_embedding_size,padding_idx = None):
        """
        Args:
          windowK (int)               : width of the convolution window, in characters
          chars_vocab_size (int)      : number of distinct character indexes
          input_embedding_size (int)  : size of each character embedding
          output_embedding_size (int) : size of the returned word embedding
          padding_idx (int or None)   : index of the pad symbol; its embedding
                                        is zero and receives no gradient
        """
        super(CharConvolution, self).__init__()
        self.windowK   = windowK
        self.embedding = nn.Embedding(chars_vocab_size,input_embedding_size,padding_idx = padding_idx)
        self.conv      = nn.Conv1d(input_embedding_size,output_embedding_size,kernel_size = windowK)

    def forward(self,xinput):
        """
        Args:
          xinput : LongTensor of character indexes, of the form [batch,seq]

        Returns:
          Tensor of the form [batch,output_embedding_size].
        """
        embedded = self.embedding(xinput)       # [batch,seq,input_embedding_size]
        # Conv1d convolves over the last axis and expects channels second.
        embedded = embedded.transpose(1,2)      # [batch,input_embedding_size,seq]
        # A sequence shorter than the window would make Conv1d fail: extend
        # it on the right with zero vectors up to the window width.
        deficit = self.windowK - embedded.size(2)
        if deficit > 0:
            embedded = nn.functional.pad(embedded,(0,deficit))
        features = self.conv(embedded)          # [batch,output_embedding_size,positions]
        # Max pooling over the positions keeps, for each output feature, its
        # strongest activation anywhere in the word.
        return features.max(dim=2).values

Fourth Exercise : predict the target language (10pts)
-------
In this exercise, we aim to predict the target language from a word char embedding. You will implement for the `LanguageIdentifier` class:
* A forward function: the function takes as input a char index tensor and returns a vector of prediction for each word
* A train function: the function trains the model on the full dataset (with early stopping)
* A predict function: the function takes a test corpus (a list of words)
and predicts the language. The function outputs its results in textual form. Each word is printed on the same line as its predicted class.

Once implemented you are expected to search for hyperparameters in the main program.






In [None]:
import torch.optim as optim

class LanguageIdentifier(nn.Module):
    """
    Predicts the language of a word from its characters: a CharConvolution
    word embedding followed by a linear layer scoring every language.
    """

    def __init__(self,datagenerator,window_size,char_embedding_size,word_embedding_size):
        """
        Args:
          datagenerator             : generator providing the encodings and the pad token
          window_size (int)         : width of the character convolution window
          char_embedding_size (int) : size of the character embeddings
          word_embedding_size (int) : size of the word embedding fed to the output layer
        """
        super(LanguageIdentifier, self).__init__()
        invocab_size   = len(datagenerator.input_idx2sym)
        outvocab_size  = len(datagenerator.output_idx2sym)
        pad_idx        = datagenerator.input_sym2idx[datagenerator.pad_token]
        self.charE     = CharConvolution(window_size,invocab_size,char_embedding_size,word_embedding_size,padding_idx = pad_idx)
        self.output    = nn.Linear(word_embedding_size,outvocab_size)
        # validate() compares against this attribute; defining it here makes
        # validate() usable on a model that has not been trained yet.
        self.minloss   = 10000000

    def load(self,filename):
        """Restore the parameters previously saved with torch.save(state_dict)."""
        self.load_state_dict(torch.load(filename))

    def eval(self):
        """
        Switch the module to evaluation mode.

        nn.Module.eval() is implemented as self.train(False); since train()
        is redefined below as the training loop, the base implementation is
        called explicitly so that eval() keeps working.
        """
        return nn.Module.train(self, False)

    def forward(self,xinput):
        """
        Args:
          xinput : LongTensor of character indexes of the form [batch,seq]

        Returns:
          Tensor of the form [batch,number of languages] holding the
          unnormalized score (logit) of each language for each word.
        """
        return self.output(self.charE(xinput))

    def train(self,traingenerator,validgenerator,epochs,batch_size,device='cpu',learning_rate=0.001,patience=5):
        """
        Train the model with Adam and a cross-entropy loss.

        After each epoch the model is evaluated with validate(), which saves
        the parameters to 'names_params.pt' whenever the validation loss
        reaches a new minimum. Use load('names_params.pt') to restore them.

        NOTE: this method shadows nn.Module.train(mode); use eval() above, or
        nn.Module.train(model, mode), to switch modes.

        Args:
          traingenerator        : generator of the training batches
          validgenerator        : generator of the validation batches
          epochs (int)          : maximum number of epochs
          batch_size (int)      : number of examples per batch
          device (str)          : torch device name
          learning_rate (float) : Adam learning rate
          patience (int or None): stop after this many consecutive epochs
                                  without a new minimal validation loss;
                                  None disables early stopping
        """
        self.minloss = 10000000 #the minimal validation loss found so far for an epoch

        target    = torch.device(device)
        self.to(target)
        nn.Module.train(self, True)
        loss_fnc  = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        stale_epochs = 0
        for epoch in range(epochs):
            batch_losses = []
            for (seqX,seqY) in traingenerator.generate_batches(batch_size):
                X = torch.LongTensor(seqX).to(target)
                Y = torch.LongTensor(seqY).to(target)

                optimizer.zero_grad()
                loss = loss_fnc(self.forward(X),Y)
                loss.backward()
                optimizer.step()
                batch_losses.append(loss.item())

            if batch_losses:
                print('[train]  epoch %d, mean loss = %f'%(epoch, sum(batch_losses)/len(batch_losses)))

            # validate() lowers self.minloss (and saves the model) when the
            # validation loss improves; compare before/after to detect it.
            best_before = self.minloss
            self.validate(validgenerator,batch_size,device=device)
            if self.minloss < best_before:
                stale_epochs = 0
            else:
                stale_epochs += 1
                if patience is not None and stale_epochs >= patience:
                    print('[train]  early stopping at epoch %d'%epoch)
                    break

    def predict(self,datagenerator,batch_size,device):
        """
        Predict the language of every word served by `datagenerator`.

        Each word is printed on the same line as its predicted language.
        Words are reported in batch order, which differs from file order.

        Returns:
          The list of (word, predicted language) pairs that were printed.
        """
        target  = torch.device(device)
        self.to(target)
        pad_idx = datagenerator.input_sym2idx[datagenerator.pad_token]

        predictions = []
        with torch.no_grad():
            for (seqX,_) in datagenerator.generate_batches(batch_size):
                X     = torch.LongTensor(seqX).to(target)
                Ypred = torch.argmax(self.forward(X),dim=1).tolist()
                for char_ids,lang_idx in zip(seqX,Ypred):
                    # Pad positions are not part of the word.
                    word = ''.join(datagenerator.input_idx2sym[idx] for idx in char_ids if idx != pad_idx)
                    lang = datagenerator.output_idx2sym[lang_idx]
                    print('%s, %s'%(word,lang))
                    predictions.append((word,lang))
        return predictions

    def validate(self,datagenerator,batch_size,device='cpu',save_min_model=False):
        """
        Print the mean loss and accuracy of the model on `datagenerator` and
        save the parameters to 'names_params.pt' when the mean loss is lower
        than self.minloss. `save_min_model` is accepted but never read.
        """
        #This function cannot be modified

        batch_accurracies = []
        batch_losses      = []
        batch_sizes       = []

        device    = torch.device(device)
        loss_fnc  = nn.CrossEntropyLoss()

        for (seqX,seqY) in datagenerator.generate_batches(batch_size):

              with torch.no_grad():
                  X    = torch.LongTensor(seqX).to(device)
                  Y    = torch.LongTensor(seqY).to(device)

                  Yhat = self.forward(X)
                  loss = loss_fnc(Yhat,Y)
                  Ypred = torch.argmax(Yhat,dim=1)
                  acc   = float(torch.sum(Ypred == Y))

                  batch_losses.append(loss.item())
                  batch_accurracies.append(acc)
                  batch_sizes.append(len(Y))

        valid_loss = sum(batch_losses) / len(batch_losses)
        print('[valid]  mean loss = %f, mean acc = %f'%( valid_loss , sum(batch_accurracies)/sum(batch_sizes)))
        if valid_loss < self.minloss:
            self.minloss = valid_loss
            torch.save(self.state_dict(), 'names_params.pt')


Main program. You are expected to search for hyperparameters:

In [None]:
# The train generator defines the encodings; the validation generator
# reuses them so that both datasets share the same indexes.
traing = DataGenerator(filename='name2lang.train')
validg = DataGenerator(filename='name2lang.valid', parentgenerator=traing)

# Hyperparameters to search over: window size, embedding sizes, epochs,
# batch size.
model = LanguageIdentifier(
    datagenerator=traing,
    window_size=2,
    char_embedding_size=32,
    word_embedding_size=512,
)
model.train(traingenerator=traing, validgenerator=validg, epochs=25, batch_size=128)