In [6]:
from google.colab import drive
drive.mount('/content/drive/')
#/content/drive/MyDrive/batches

ModuleNotFoundError: No module named 'google.colab'

In [2]:
import pyconll
import pyconll.util
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
from torch.nn.utils.rnn import pack_sequence, unpack_sequence
import os
from sklearn import preprocessing

#Set-up CUDA cores for training the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

True
cuda


In [7]:
#Import and pre-proccess the training data with pyconll
train = pyconll.load_from_file('data/it_isdt-ud-train.conllu')
train_prepocesed=[]
for i,sent in enumerate(train):
    sentence_preprocesed=[]
    for j,token in enumerate(sent):
        if(token.head is not None):
            sentence_preprocesed.append(token)
    train_prepocesed.append(sentence_preprocesed)



In [3]:
#Class of  objects used to store the output dependencies of the parser
class Dependencies(object):
    def __init__(self,n):
        self.n = n
        self.heads = [None] * (n+1)
        self.arcs = []

    def get_heads(self):
        return self.heads

    def add_arc(self, head, child):
        child=child
        self.heads[child]=head
        self.arcs.append((head,child))

    def contains(self,head,child):
        child=child
        if self.heads[child]==head:
            return True
        else: return False




#Class of objects implementing the Berd Encoder and LSTM Oracle 
class Oracle(object):

    def __init__(self) -> None:
        #BERT encoder
        encoder_name = "dbmdz/bert-base-italian-xxl-cased"
        self.tokenizer = AutoTokenizer.from_pretrained(encoder_name)
        self.encoder = AutoModel.from_pretrained(encoder_name)

        #LSTM Oracle
        input_size = 4608 + 1200
        hidden_size = 512
        num_layers = 1
        output_size = 3
        self.epoch = 4
        self.model = torch.nn.LSTM(input_size,hidden_size,num_layers,batch_first=True,bidirectional=False,proj_size=output_size).to(device)

        #Loss criterion and Optimizer of the LSTM Oracle
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.0005)

    #Compute the token ids used for each word by bert model
    def tokens(self,words,lemmas):
        words_token=[102]
        for i in range(0,len(lemmas)):
            word_token=self.tokenizer.convert_tokens_to_ids(words[i])
            lemma_token=self.tokenizer.convert_tokens_to_ids(lemmas[i])
            if(word_token!=101):
                words_token.append(word_token)
            else:
                words_token.append(lemma_token)
        words_token.append(103)
        return words_token

    #Compute the contextualized embeddings of a sentence
    def encode(self,words,lemmas):
        words_token=self.tokens(words,lemmas)
        padded_input_ids = words_token
        input_tensor = torch.tensor([padded_input_ids])
        with torch.no_grad():
            outputs = self.encoder(input_tensor)
        return outputs.last_hidden_state

    #Given a tensor describing the PoS of the stack and buffer, expand it's dimensionality in order to leverage the LSTM model.
    def expand_pos_tensor(self,sent_pos_tensor):
        extended_sent_pos_tensor=[]
        for move_pos_tensor in sent_pos_tensor:
            extended_move_pos_tensor=[]
            for token_pos in move_pos_tensor:
                extended_pos=torch.full((2,200),token_pos)[0]
                extended_move_pos_tensor.append(extended_pos)
            extended_move_pos_tensor=torch.cat(extended_move_pos_tensor,dim=0)
            extended_sent_pos_tensor.append(extended_move_pos_tensor)
        extended_sent_pos_tensor=torch.stack(extended_sent_pos_tensor)
        return extended_sent_pos_tensor
    

    #Training the model with the 100 sentence batches stored in memory
    def train_on_batches(self):
        files = os.listdir("data/batches")
        for k in range(0,self.epoch):
            for i in range(0,len(files)):
                if(i<200):

                    batch_features,batch_upos,batch_moves=torch.load(f"data/batches/tensor{i}.pt")
                    batch_features=unpack_sequence(batch_features)
                    batch_upos=unpack_sequence(batch_upos)
                    batch_moves=unpack_sequence(batch_moves)
                    
                    for j in range(0,len(batch_moves)):
                        input_vector=torch.cat((batch_features[j],self.expand_pos_tensor(batch_upos[j])),dim=1)
                        predicted_output, _ = self.model(input_vector.to(device))
                        loss = self.criterion(predicted_output, batch_moves[j].to(device))
                        self.optimizer.zero_grad()
                        loss.backward()
                        self.optimizer.step()

    #By PoS and the Embeddings of the words contained in the stack and the buffer, 
    #outputs the probability vector of the three possible moves.
    def score(self,features,pos):
        input_vector=torch.cat((features,self.expand_pos_tensor(pos)),1)
        predicted_output,_ = self.model(input_vector.to(torch.float32).to(device))
        return predicted_output

    #returns the last three element of stack
    def get_stack_context(self,list):
        depth=len(list)

        if depth >= 3:
            return [list[-1], list[-2], list[-3]]

        elif depth >= 2:

            return [list[-1], list[-2], -1]

        elif depth == 1:
            return [list[-1], -1 , -1]
        else:
            return [-1, -1, -1]

    #returns the last three element of buffer
    def get_buffer_context(self,index,len_phrase):
        if(index==len_phrase-1):
            return [index,index+1,-1]
        elif(index==len_phrase):
            return [index,-1,-1]
        elif(index>len_phrase):
            return [-1,-1,-1]
        else: return [index,index+1,index+2]

    
    
    #returns the embedded PoS of stack+buffer 
    def extract_pos_features(self,phrases_pos,stacks,buffers):
        sent_pos_tensor=[]
        for i in range(0,len(stacks)):
            move_pos_tensor=[]
            stack_feature=self.get_stack_context(stacks[i])
            for el in stack_feature:
                if(el>=1):
                    move_pos_tensor.append((phrases_pos[el-1]+1)/10)
                elif(el==0):
                    move_pos_tensor.append(-0.1)
                else:
                    move_pos_tensor.append(0)
                    
            buffer_feature=self.get_buffer_context(buffers[i],len(phrases_pos))
            for el in buffer_feature:
                if(el>=1):
                    move_pos_tensor.append((phrases_pos[el-1]+1)/10)
                elif(el==0):
                    move_pos_tensor.append(-0.1)
                else:
                    move_pos_tensor.append(0)
            sent_pos_tensor.append(torch.tensor(move_pos_tensor))
        return sent_pos_tensor
    
    #given a matrix,it returns a tensor array
    def flatten_embedded_features(self,matrix):
        flat_list = torch.tensor([])
        for row in matrix:
            flat_list=torch.cat((flat_list,row))
        return flat_list

    #returns the embedding of stack+buffer 
    def extract_embedded_features(self,phrase,phrases_lemma,stacks,buffers):
        embeddings = self.encode(phrase,phrases_lemma)
        root_embeding=embeddings[0][0]
        empty_embedding=torch.tensor(np.zeros(768))
        embedded_features=[]
        for i in range(0,len(stacks)):
            stack_feature=self.get_stack_context(stacks[i])
            for j,el in enumerate(stack_feature):
                if(el>=1):
                    stack_feature[j]=embeddings[0][el]
                elif(el==0):
                    stack_feature[j]=root_embeding
                else:
                    stack_feature[j]=empty_embedding
            buffer_feature=self.get_buffer_context(buffers[i],len(phrase))
            for j,el in enumerate(buffer_feature):
                if(el>=1):
                    buffer_feature[j]=embeddings[0][el]
                else:
                    buffer_feature[j]=empty_embedding
            embedded_features.append(self.flatten_embedded_features(stack_feature+buffer_feature))
                    
        return embedded_features
    

In [None]:



class Parser(object):
    def __init__(self,oracle):
        self.oracle=oracle

    #applies the move, it goes to update the stack and the index of the buffer
    def transition(self,move, stack, i, dependencies):
        match move:
            case 0:
                stack.append(i)
                return stack,i+1,dependencies
            case 1:
                dependencies.add_arc(stack[-2], stack.pop())
                return stack,i,dependencies
            case 2:
                dependencies.add_arc(stack[-1], stack[-2])
                stack.pop(-2)
                return stack,i,dependencies
            case _:
                raise "Wrong Move"

    #returns the possible moves that can be applied in parsing
    def get_valid_moves(self,i, n, stack_depth):
        moves = []
        if i <= n:
            moves.append(0)
        if stack_depth >= 2:
            moves.append(1)
            moves.append(2)
        return moves
    
    #given the sentence, the lemma and pos, It returns the depency related to that sentence
    def parsing(self,words,phrase_lemma,phrase_pos):
        n=len(words)
        deps=Dependencies(n)
        stack=[0]
        i_buffer=1
        moves=self.get_valid_moves(i_buffer,n,len(stack))
        old_stack=[]
        old_buffer=[]
        memory=1
        while moves:
            features_embeddings = self.oracle.extract_embedded_features(phrase_pos,[stack],[i_buffer])
            features_pos = self.oracle.extract_pos_features(words,phrase_lemma,[stack],[i_buffer])

            features_embeddings=torch.stack(features_embeddings)
            features_pos=torch.stack(features_pos)
            
            scores = self.oracle.score(features_embeddings,features_pos)
            scores=scores[-1].tolist()
            
            next_move = max(moves, key=lambda move: scores[move])
            stack,i_buffer,deps = self.transition(next_move, stack, i_buffer, deps)
            moves = self.get_valid_moves(i_buffer,n,len(stack))

            if(len(old_stack)<memory):
                old_stack.append(stack)
                old_buffer.append(i_buffer)
            else:
                old_stack.pop(0)
                old_buffer.pop(0)
                old_stack.append(stack)
                old_buffer.append(i_buffer)

        return deps

    #choose the best move for imulate_parse
    def check_best(self,heads,stack,buffer,deps,i):
        move=-1
        if(len(stack)>=2):
            children_list=[]
            for child,head in enumerate(heads):
                if head == stack[-1]:
                    children_list.append(child)
            if(heads[stack[-2]]==stack[-1]):
                move=2
            if(((heads[stack[-1]])==(stack[-2])) and all([deps.contains(stack[-1],child) for child in children_list])):
                move=1
        if(i<=len(buffer) and move==-1):
            move=0
        elif(i>len(buffer) and move==-1):
            move=None
        return move
    
    #it does reverse engineering, given the final state rebuilds the stack, buffer and moves
    def simulate_parse(self,heads,buffer):
        deps=Dependencies(len(buffer))
        stack=[0]
        moves=[]
        buffers=[]
        stacks=[]
        i=1
        best_move=self.check_best(heads,stack,buffer,deps,i)
        while best_move!=None:
            buffers.append(i)
            stacks.append(stack[:])
            moves.append(best_move)
            stack,i,deps=self.transition(best_move,stack,i,deps)
            best_move=self.check_best(heads,stack,buffer,deps,i)
        if(i>len(buffer)):
            return stacks,buffers,moves
        else: return None




In [19]:
#given the phrase and its heads reconstructs the optimal moves to get those heads,also returns the embedding of the stack,buffer and pos
def encode_moves(oracle,parser,heads,phrase,phrase_lemma,phrase_pos):
    stacks,buffers,moves=parser.simulate_parse(heads,phrase)

    embedded_features = oracle.oracle.extract_embedded_features(phrase_pos,stacks,buffers)
    pos_features = oracle.oracle.extract_pos_features(phrase_lemma,phrase_lemma,stacks,buffers)

    expanded_moves=[]
    for move in moves:
        if(move==0): expanded_moves.append(torch.tensor([1,0,0]))
        if(move==1): expanded_moves.append(torch.tensor([0,1,0]))
        if(move==2): expanded_moves.append(torch.tensor([0,0,1]))

    return embedded_features,pos_features,expanded_moves

#takes the dataset and for each sentence generates the corresponding batch, will generate n=batch-size file
def create_batches(batch_size,dataset):

    oracle = Oracle()
    parser = Parser(oracle)
    batch_feature=[]
    batch_pos=[]
    batch_moves=[]

    pos_tags = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN','VERB','ADP','AUX','CCONJ','DET','NUM','PART','PRON','SCONJ','PUNCT','SYM','X']
    le = preprocessing.LabelEncoder()
    le = le.fit(pos_tags)
    

    index_batch=0
    for sent in dataset:
        if(index_batch<200):
            heads=[-1]
            words=[]
            lemmas=[]
            pos=[]

            wrong_sent=0
            for token in sent:
                if(token.head is None): wrong_sent=1
                if(token.form is None): wrong_sent=1
                if(token.lemma is None): wrong_sent=1
                if(token.upos is None): wrong_sent=1

                heads.append(int(token.head))
                words.append(token.form)
                lemmas.append(token.lemma)
                pos.append(token.upos)

            if(wrong_sent==0):

                pos_int=le.transform(pos)

                sent_features,sent_pos,sent_moves = encode_moves(oracle,parser,heads,words,lemmas,pos_int)

                sent_features=torch.stack(sent_features,dim=0).to(torch.float32)
                sent_pos=torch.stack(sent_pos).to(torch.float32)
                sent_moves=torch.stack(sent_moves).to(torch.float32)

            

                batch_feature.append(sent_features)
                batch_pos.append(sent_pos)
                batch_moves.append(sent_moves)
                


            if(len(batch_moves)==batch_size):
                #print(sent_moves)
                print(index_batch)
                
                packed_features=pack_sequence(batch_feature,enforce_sorted=False)
                packed_pos=pack_sequence(batch_pos,enforce_sorted=False)
                packed_moves=pack_sequence(batch_moves,enforce_sorted=False)
                


                torch.save((packed_features,packed_pos,packed_moves), f"data/batches/tensor{index_batch}.pt")
                index_batch+=1
                batch_feature=[]
                batch_pos=[]
                batch_moves=[]



#create_batches(100,train_prepocesed)


oracle = Oracle()
oracle.train_on_batches()
parser = Parser(oracle)



0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
Loadind 0
Loadind 1
Loadind 2
Loadind 3
Loadind 4
Loadind 5
Loadind 6
Loadind 7
Loadind 8
Loadind 9
Loadind 10
Loadind 11
Loadind 12
Loadind 13
Loadind 14
Loadind 15
Loadind 16


KeyboardInterrupt: 

In [17]:
parser = Parser(oracle)

def accuracy_on_test():
  #Load e preprocessing
  test = pyconll.load_from_file('data/it_isdt-ud-test.conllu')
  test_prepocesed=[]
  for i,sent in enumerate(test):
      sentence_preprocesed=[]
      for j,token in enumerate(sent):
          if(token.head is not None):
              sentence_preprocesed.append(token)
      test_prepocesed.append(sentence_preprocesed)

  pos_tags = ['ADJ', 'ADV', 'INTJ', 'NOUN', 'PROPN','VERB','ADP','AUX','CCONJ','DET','NUM','PART','PRON','SCONJ','PUNCT','SYM','X']
  le = preprocessing.LabelEncoder()
  le = le.fit(pos_tags)

  #Accuracy
  total_tokens=0
  total_wrong_tokens=0
  for i,sent in enumerate(test):
    if(i<100):
      print(i)
      heads=[-1]
      words=[]
      lemmas=[]
      pos=[]

      wrong_sent=0
      for token in sent:
        if(token.head is None): wrong_sent=1
        else:
          heads.append(int(token.head))
        if(token.form is None): wrong_sent=1
        if(token.lemma is None): wrong_sent=1
        if(token.upos is None): wrong_sent=1



        words.append(token.form)
        lemmas.append(token.lemma)
        pos.append(token.upos)

      if(wrong_sent!=1):
        pos_int=le.transform(pos)
        predicted_deps=parser.parsing(words,lemmas,pos_int)
        predicted_heads=predicted_deps.get_heads()
        sent_tokens=len(words)
        sent_wrong_tokens=0
        for j in range(1,len(predicted_heads)):
          if(predicted_heads[j] is not None):
            if(predicted_heads[j]!=heads[j]):
              sent_wrong_tokens+=1
          else:
            sent_wrong_tokens+=1
        total_tokens+=sent_tokens
        total_wrong_tokens+=sent_wrong_tokens

  accuracy=1-(total_wrong_tokens/total_tokens)
  return accuracy

print(accuracy_on_test())







0


ValueError: input must have the type torch.float32, got type torch.float64

hidden 64, epoch 1, lr 0.001, memory 0, bidir False, batches 30 0.7
hidden 64, epoch 1, lr 0.01, memory 0, bidir False, batches 30 0.55 overfitting
hidden 64, epoch 1, lr 0.0001, memory 0, bidir False, batches 0.58 underfitting

hidden 64, epoch 3, lr 0.001, memory 0, bidir False, batches 30 0.74
hidden 64, epoch 4, lr 0.001, memory 0, bidir False, batches 30 0.78 top
hidden 64, epoch 4, lr 0.001, memory 3, bidir False, batches 30 0.64 !

hidden 64, epoch 4, lr 0.001, memory 0, bidir False, batches 60 0.75  overfitting
hidden 64, epoch 3, lr 0.001, memory 0, bidir False, batches 60 0.76  overfitting
hidden 64, epoch 2, lr 0.001, memory 0, bidir False, batches 60 0.78  uguale a prima
hidden 64, epoch 1, lr 0.001, memory 0, bidir False, batches 60 0.71  underfitting

hidden 64, epoch 2, lr 0.0005, memory 0, bidir False, batches 120 0.78  
hidden 64, epoch 3, lr 0.0005, memory 0, bidir False, batches 120 0.78  
hidden 128, epoch 3, lr 0.0005, memory 0, bidir False, batches 120 0.815  top
hidden 512, epoch 4, lr 0.0005, memory 0, bidir False, batches 120 0.83  top