In [19]:
#from libs.corpus import openConllu, check_projectivity
import pyconll
import pyconll.util
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
from torch.nn.utils.rnn import pack_sequence, unpack_sequence
import os

In [20]:

# Load the training treebank and keep, for every sentence, only the tokens
# that have a head (tokens whose `head` is None are dropped).
# NOTE(review): the loop variables (i, sent, j, token) stay defined as notebook
# globals after this cell runs; later code that accidentally refers to a bare
# `i` would silently pick up the value left here.
train = pyconll.load_from_file('data/it_isdt-ud-train.conllu')
train_prepocesed=[]
for i,sent in enumerate(train):        
    sentence_preprocesed=[]
    for j,token in enumerate(sent):
        if(token.head is not None):
            sentence_preprocesed.append(token)
    train_prepocesed.append(sentence_preprocesed)



In [51]:





# Integer codes of the parser moves: 0 = SHIFT, 1 = RIGHTARC, 2 = LEFTARC
moves = [0, 1, 2]


# Container used to record the dependency arcs produced while parsing
class Dependencies(object):
    """Head table and arc list for a sentence of ``n`` words.

    Attributes:
        n: number of words in the sentence.
        heads: list of length ``n + 1``; ``heads[k]`` is the head assigned to
            position ``k`` or ``None`` while no arc has been added for it.
        arcs: ``(head, child)`` tuples in the order they were added.
    """

    def __init__(self, n):
        self.n = n
        # one slot per word plus one extra slot at index 0
        self.heads = [None for _ in range(n + 1)]
        self.arcs = []

    def get_heads(self):
        """Return the head table (the live list, not a copy)."""
        return self.heads

    def add_arc(self, head, child):
        """Record that ``head`` governs ``child``."""
        self.heads[child] = head
        self.arcs.append((head, child))

    def contains(self, head, child):
        """Tell whether ``child`` currently has ``head`` as its head."""
        return self.heads[child] == head

        



class Oracle(object):
    """Scores parser moves with an LSTM fed by encoder embeddings of the parser state.

    A parser configuration is described by five token positions: the three
    topmost stack items and the next two buffer items. Their vectors are
    concatenated into a single feature vector, and the LSTM emits three values
    per configuration, one per move (0: SHIFT, 1: RIGHTARC, 2: LEFTARC).
    Only the LSTM parameters are handed to the optimizer; the encoder is run
    under ``torch.no_grad()``.
    """

    def __init__(self) -> None:
        # BERT encoder
        encoder_name = "dbmdz/bert-base-italian-xxl-cased"
        self.tokenizer = AutoTokenizer.from_pretrained(encoder_name)
        self.encoder = AutoModel.from_pretrained(encoder_name)

        # LSTM oracle
        input_size = 3840  # 5 context slots (3 stack + 2 buffer) x 768-dim vectors
        hidden_size = 64
        num_layers = 1
        output_size = 3  # one value per move
        self.model = torch.nn.LSTM(input_size,hidden_size,num_layers,batch_first=True,bidirectional=False,proj_size=output_size)
        # The saved features are float16, so the LSTM is converted to half precision too.
        self.model.half()

        # Oracle criterion and optimization
        # NOTE(review): Adam's default eps may underflow with float16 parameters - confirm
        # that training stays finite.
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)

    def tokens(self,words,lemmas):
        """Map a sentence to encoder input ids, one id per word.

        Every word is looked up as a whole with ``convert_tokens_to_ids`` (no
        sub-word splitting happens here); when the surface form maps to the
        unknown token, the id of its lemma is used instead. The sequence is
        wrapped in the tokenizer's [CLS] / [SEP] ids.

        Args:
            words: surface forms of the sentence.
            lemmas: lemmas aligned with ``words``.

        Returns:
            list[int]: ``[cls, id_1, ..., id_n, sep]``.
        """
        # Take the special ids from the tokenizer instead of hard-coding 102/103/101,
        # so the method keeps working if the encoder checkpoint is changed.
        # NOTE(review): assumes 102/103/101 were this model's [CLS]/[SEP]/[UNK] ids - confirm.
        unknown_id = self.tokenizer.unk_token_id
        words_token = [self.tokenizer.cls_token_id]
        for i, lemma in enumerate(lemmas):
            word_token = self.tokenizer.convert_tokens_to_ids(words[i])
            if word_token != unknown_id:
                words_token.append(word_token)
            else:
                words_token.append(self.tokenizer.convert_tokens_to_ids(lemma))
        words_token.append(self.tokenizer.sep_token_id)
        return words_token

    def encode(self,words,lemmas):
        """Run the encoder on one sentence and return its last hidden state.

        Returns:
            The encoder's ``last_hidden_state`` for a batch of one sequence of
            ``len(lemmas) + 2`` ids; position 0 is [CLS], so word ``k``
            (1-based) sits at position ``k``.
        """
        input_tensor = torch.tensor([self.tokens(words,lemmas)])
        with torch.no_grad():
            outputs = self.encoder(input_tensor)
        return outputs.last_hidden_state

    # Loads the saved batches from a folder and trains the model on them
    def train_on_batches(self, batch_dir="data/batches", max_batches=1):
        """Run one optimization step per saved batch file.

        Args:
            batch_dir: folder holding ``(packed_features, packed_moves)`` tuples
                written with ``torch.save``.
            max_batches: number of files to train on. The default of 1 keeps
                the earlier behaviour of using only the first file; pass
                ``None`` to use every file in the folder.

        Raises:
            ValueError: if the features and the moves of a batch were packed
                from sequences with different lengths.
        """
        # sorted() makes the file order deterministic (os.listdir order is arbitrary)
        files = sorted(os.listdir(batch_dir))
        if max_batches is not None:
            files = files[:max_batches]

        for i, file_name in enumerate(files):
            print("Loadind "+str(i))
            # NOTE: torch.load unpickles the file; only load batch files you created yourself.
            batch_features,batch_moves=torch.load(os.path.join(batch_dir, file_name))
            print("Batch features sizes")
            packed_output, _ = self.model(batch_features)

            # Diagnostics only: the softmax is printed, never fed to the loss,
            # because CrossEntropyLoss expects raw scores.
            sequences = unpack_sequence(packed_output)
            for logits in sequences:
                print("prima "+str(logits))
                print("dopo "+str(torch.nn.Softmax(dim=1)(logits)))
            print("Predicted output sizes")
            predicted_output = torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True)
            print(predicted_output.size())

            print("Batch moves size")
            padded_moves, _ = torch.nn.utils.rnn.pad_packed_sequence(batch_moves, batch_first=True)
            print(padded_moves.size())

            # CrossEntropyLoss treats dim 1 as the class dimension, so feeding the
            # padded (batch, time, 3) tensors would mix up time steps and classes
            # and count padded steps. The packed `.data` tensors are (steps, 3)
            # and contain real steps only.
            # Assumes features and moves were packed from the same sequences in
            # the same order, so their rows line up.
            if not torch.equal(packed_output.batch_sizes, batch_moves.batch_sizes):
                raise ValueError("features and moves of the batch have different sequence lengths")
            # The loss is computed in float32; gradients still flow back through the cast.
            loss = self.criterion(packed_output.data.float(), batch_moves.data.float())

            # backprop
            print("Start Back-prop")
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            print("End back-prop")
            print(predicted_output)

    def score(self,features):
        """Run the LSTM on one sequence of feature vectors.

        Args:
            features: tensor with one row per parser configuration.

        Returns:
            The LSTM output as a ``PackedSequence`` (a batch of one sequence);
            use ``unpack_sequence`` to obtain the per-step move scores.
        """
        packed_features=pack_sequence([features],enforce_sorted=False)
        predicted_output,_ = self.model(packed_features)
        print("score features "+str(packed_features))
        print("predicted output "+str(unpack_sequence(predicted_output)))

        return predicted_output

    # Takes the last three elements of the stack
    def get_stack_context(self,list):
        """Return the three topmost stack items, top first, padded with -1.

        The parameter is called ``list`` (shadowing the builtin) to keep the
        original signature.
        """
        top_items = [item for item in list[-1:-4:-1]]
        return top_items + [-1] * (3 - len(top_items))

    # Takes the next two elements of the buffer
    def get_buffer_context(self,index,len_phrase):
        """Return the next two buffer positions, using -1 for missing ones.

        Args:
            index: 1-based position of the first word still in the buffer.
            len_phrase: number of words in the sentence.
        """
        if index > len_phrase:
            return [-1, -1]
        if index == len_phrase:
            return [index, -1]
        return [index, index + 1]

    def flatten_embedded_features(self,matrix):
        """Concatenate the given vectors into one 1-D tensor.

        An empty input gives an empty tensor.
        """
        # The leading empty float tensor keeps the result dtype/empty-input
        # behaviour of the former incremental concatenation, in a single call.
        return torch.cat([torch.tensor([]), *matrix])

    def extract_features(self,phrase,phrases_lemma,stacks,buffers):
        """Build one float16 feature vector per parser configuration.

        Args:
            phrase: words of the sentence.
            phrases_lemma: lemmas aligned with ``phrase``.
            stacks: one stack (list of positions, 0 = root) per configuration.
            buffers: one buffer index per configuration, aligned with ``stacks``.

        Returns:
            list of 1-D float16 tensors, each the concatenation of five vectors
            (three stack slots followed by two buffer slots).
        """
        token_vectors = self.encode(phrase,phrases_lemma)[0]
        dim = token_vectors.shape[-1]
        # Root is an all-ones vector, a missing slot an all-zeros vector. They use
        # the encoder's dtype so the concatenation is not promoted to float64.
        root_embedding = torch.ones(dim, dtype=token_vectors.dtype)
        empty_embedding = torch.zeros(dim, dtype=token_vectors.dtype)

        def embed(position, allow_root):
            """Vector for a context position; root is only meaningful on the stack."""
            if position >= 1:
                return token_vectors[position]
            if position == 0 and allow_root:
                return root_embedding
            return empty_embedding

        embedded_features=[]
        for stack, buffer_index in zip(stacks, buffers):
            stack_vectors = [embed(pos, True) for pos in self.get_stack_context(stack)]
            buffer_vectors = [embed(pos, False) for pos in self.get_buffer_context(buffer_index,len(phrase))]
            flat = self.flatten_embedded_features(stack_vectors+buffer_vectors)
            embedded_features.append(flat.to(torch.float16))
        return embedded_features



class Parser(object):
    def __init__(self,oracle):
        self.oracle=oracle

    #applica la mossa andando ad aggiornare lo stack e l'indice del buffer  
    def transition(self,move, stack, i, dependencies):
        match move:
            case 0:
                stack.append(i)
                return stack,i+1,dependencies
            case 1:
                dependencies.add_arc(stack[-2], stack.pop())
                return stack,i,dependencies
            case 2:
                dependencies.add_arc(stack[-1], stack[-2])
                stack.pop(-2)
                return stack,i,dependencies
            case _:
                raise "Wrong Move"

    #ritorna le mosse possibili che si possono applicare        
    def get_valid_moves(self,i, n, stack_depth):
        moves = []
        if i <= n:
            moves.append(0)
        if stack_depth >= 2:
            moves.append(1)
            moves.append(2)
        return moves

    def parsing(self,words,phrases_lemma):
        n=len(words)
        deps=Dependencies(n)
        stack=[0]
        i_buffer=1
        moves=self.get_valid_moves(i_buffer,n,len(stack))
        while moves:
            features = self.oracle.extract_features(words,phrases_lemma,[stack],[i_buffer])
            features=torch.stack(features)
            scores = self.oracle.score(features)
            print("features  "+str(features))
            print("features  dimensione "+str(features.size()))
            print(scores)
            next_move = max(moves, key=lambda move: scores[move])
            stack,i_buffer,deps = self.transition(next_move, stack, i_buffer, deps)
            moves = self.get_valid_moves(i,n,len(stack))
        return deps
    
    #sceglie la mossa migliore da eseguire nel simulate_parse
    def check_best(self,heads,stack,buffer,deps,i):
        move=-1
        if(len(stack)>=2):
            children_list=[]
            for child,head in enumerate(heads):
                if head == stack[-1]:
                    children_list.append(child)
            if(((heads[stack[-1]])==(stack[-2])) and all([deps.contains(stack[-1],child) for child in children_list])):
                move=1
            if(heads[stack[-2]]==stack[-1]):
                move=2
        if(i<=len(buffer) and move==-1):
            move=0
        elif(i>len(buffer) and move==-1):
            move=None
        return move
    
    #fa reverse engineering, dato lo stato finale ricostruisce lo stack, buffer e le mosse
    def simulate_parse(self,heads,buffer):
        deps=Dependencies(len(buffer))
        stack=[0]
        moves=[]
        buffers=[]
        stacks=[]
        i=1
        best_move=self.check_best(heads,stack,buffer,deps,i)
        while best_move!=None:
            buffers.append(i)
            stacks.append(stack[:])
            moves.append(best_move)
            stack,i,deps=self.transition(best_move,stack,i,deps)
            best_move=self.check_best(heads,stack,buffer,deps,i)
        if(i>len(buffer)):
            return stacks,buffers,moves
        else: return None 

    


In [53]:
def encode_moves(oracle,parser,heads,phrase,phrases_lemma):
    """Turn a gold-annotated sentence into training features and one-hot moves.

    Args:
        oracle: object providing ``extract_features(phrase, lemmas, stacks, buffers)``.
        parser: object providing ``simulate_parse(heads, phrase)``.
        heads: gold head per position (index 0 is the root entry).
        phrase: words of the sentence.
        phrases_lemma: lemmas aligned with ``phrase``.

    Returns:
        tuple ``(embedded_features, expanded_moves)``: whatever the oracle
        returns for the simulated configurations, and one float16 one-hot
        tensor of length 3 per move.
    """
    # one-hot rows indexed by move code (0: SHIFT, 1: RIGHTARC, 2: LEFTARC)
    one_hot_rows = {0: [1,0,0], 1: [0,1,0], 2: [0,0,1]}

    stacks,buffers,moves = parser.simulate_parse(heads,phrase)
    embedded_features = oracle.extract_features(phrase,phrases_lemma,stacks,buffers)

    # moves outside the table are skipped
    expanded_moves = [
        torch.tensor(one_hot_rows[move]).to(torch.float16)
        for move in moves
        if move in one_hot_rows
    ]
    return embedded_features,expanded_moves

# Walks the dataset, encodes every usable sentence and writes one file per batch of sentences
def create_batches(oracle,parser,batch_size,dataset,max_sentences=1000,out_dir="data/batches",drop_last=True):
    """Encode sentences and save them in packed batches of ``batch_size`` sentences.

    Each saved file ``tensor<k>.pt`` holds a tuple ``(packed_features, packed_moves)``
    built with ``pack_sequence`` from the per-sentence tensors.

    Args:
        oracle: passed through to ``encode_moves``.
        parser: passed through to ``encode_moves``.
        batch_size: number of sentences per saved file.
        dataset: iterable of sentences; every token exposes ``head``, ``form``
            and ``lemma``.
        max_sentences: only the first ``max_sentences`` sentences of the dataset
            are considered (default 1000, the former hard-coded limit);
            ``None`` removes the limit.
        out_dir: folder the batch files are written to; created when missing.
        drop_last: when True (default, the former behaviour) sentences left over
            after the last full batch are discarded; when False they are saved
            as a final, smaller batch.
    """
    def save_batch(features, moves, index):
        """Pack one batch and write it to ``out_dir``."""
        print(index)
        packed_features=pack_sequence(features,enforce_sorted=False)
        packed_moves=pack_sequence(moves,enforce_sorted=False)
        os.makedirs(out_dir, exist_ok=True)
        torch.save((packed_features,packed_moves), os.path.join(out_dir, f"tensor{index}.pt"))

    batch_feature=[]
    batch_moves=[]
    index_batch=0

    for i,sent in enumerate(dataset):
        if max_sentences is not None and i >= max_sentences:
            break

        tokens = list(sent)
        # Skip empty sentences and sentences with an incomplete token *before*
        # converting anything: int(None) used to raise here even though the
        # sentence had already been flagged as unusable.
        if not tokens:
            continue
        if any(tok.head is None or tok.form is None or tok.lemma is None for tok in tokens):
            continue

        heads = [-1] + [int(tok.head) for tok in tokens]  # index 0 is the root entry
        words = [tok.form for tok in tokens]
        lemmas = [tok.lemma for tok in tokens]

        sent_features,sent_moves = encode_moves(oracle,parser,heads,words,lemmas)
        batch_feature.append(torch.stack(sent_features,dim=0))
        batch_moves.append(torch.stack(sent_moves))

        if len(batch_moves) == batch_size:
            save_batch(batch_feature, batch_moves, index_batch)
            index_batch+=1
            batch_feature=[]
            batch_moves=[]

    if batch_moves and not drop_last:
        save_batch(batch_feature, batch_moves, index_batch)

#TODO:
#1. Save batches of 50-100 sentences to data files, i.e. for every sentence its states with the corresponding stack, buffer and move.
#2. Load the data batch by batch, as in the ChatGPT example.
#3. For each batch run the forward pass and the Adam-optimizer back-propagation, as in the ChatGPT example.



#p1,m1=parser.encode_moves(heads,phrase,phrase)
#p2,m2=parser.encode_moves(heads2,phrase2,phrase2)
#print(m2)
#torch.set_printoptions(profile="full")
#p1 = torch.stack(p1, dim=0)
#p2 = torch.stack(p2, dim=0)
#print(p1)
#print(p2)
#input=pack_sequence([p1, p2])

#input_size = 3840  # Each element in the sequence is a vector of size 2
#hidden_size = 64
#num_layers = 1
#output_size = 3  # Example output size
#model = torch.nn.LSTM(input_size,hidden_size,num_layers,batch_first=True,bidirectional=False,proj_size=output_size)
#model.half()
#
#output = model(input)
#print(output)
#print("OUTPUT")
#print(unpack_sequence(output[0]))


# Build the oracle and a parser that uses it.
oracle = Oracle()
parser = Parser(oracle)
# Batch generation is left disabled here; un-comment the next line to (re)create
# the batch files from the preprocessed training sentences.
#create_batches(oracle,parser,100,train_prepocesed)
# Train the oracle on the previously saved batch files.
oracle.train_on_batches()
# Parse a sample sentence; the surface forms are passed both as words and as lemmas.
phrases="Hamad Butt è morto nel 1994 a 32 anni .".split()
deps = parser.parsing(phrases,phrases)
print(deps.get_heads())



Loadind 0
Batch features sizes
prima tensor([[-0.1899, -0.2262,  0.1254],
        [-0.2349, -0.0619, -0.0275],
        [-0.0296, -0.0618,  0.0053],
        [ 0.1565, -0.3350,  0.0521]], dtype=torch.float16,
       grad_fn=<IndexBackward0>)
dopo tensor([[0.2998, 0.2891, 0.4109],
        [0.2925, 0.3477, 0.3599],
        [0.3330, 0.3223, 0.3447],
        [0.3979, 0.2435, 0.3586]], dtype=torch.float16,
       grad_fn=<SoftmaxBackward0>)
prima tensor([[-0.1422, -0.3765,  0.1070],
        [ 0.0461,  0.0300, -0.0256],
        [ 0.0962, -0.0557, -0.1694],
        [-0.0392, -0.2137, -0.0226],
        [ 0.0864, -0.2854,  0.4644],
        [ 0.0186, -0.1711,  0.3250],
        [-0.0444, -0.0676,  0.0260],
        [ 0.2876, -0.2954,  0.1586],
        [ 0.1676, -0.2306, -0.2898],
        [ 0.3630, -0.2444,  0.1216]], dtype=torch.float16,
       grad_fn=<IndexBackward0>)
dopo tensor([[0.3252, 0.2573, 0.4172],
        [0.3430, 0.3376, 0.3193],
        [0.3809, 0.3271, 0.2920],
        [0.3501, 0.2939,