# Proyecto DL

### Librerías

In [None]:
!pip install flair
!pip install rouge
!pip install sacrebleu

In [None]:
import csv

import numpy as np
import pandas as pd
import sacrebleu
from flair.data import Sentence
from flair.models import SequenceTagger
from rouge import Rouge
from transformers import MarianMTModel, MarianTokenizer

### Modelo y función de translate

https://huggingface.co/Helsinki-NLP/opus-mt-es-en

In [None]:
# Hugging Face Hub identifier of the translation checkpoint loaded below.
model_name = 'Helsinki-NLP/opus-mt-es-en'

# Load the tokenizer and the model weights published under `model_name`.
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
# model  # uncomment to display the loaded model

In [None]:
def translate(text):
    """Translate ``text`` using the module-level ``tokenizer`` and ``model``.

    Args:
        text: Source sentence to translate. Only the first generated sequence
            is decoded, so this function is meant for one sentence at a time.

    Returns:
        str: The decoded translation with special tokens removed.
    """
    # truncation=True clips the input to the tokenizer's maximum length so an
    # overly long text cannot exceed what the model accepts.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    translated = model.generate(**inputs)

    # generate() returns a batch of sequences; decode the first (only) one.
    return tokenizer.decode(translated[0], skip_special_tokens=True)

In [None]:
# Sanity check on one hand-written sentence pair.
# The source sentence is intentionally not called `input`: that name would
# shadow the Python builtin for the rest of the kernel session.
source_text = "He aprendido mucho sobre modelos de lenguaje en este proyecto"
reference = "I have learned a lot about language models in this project"
candidate = translate(source_text)

print("Candidate:", candidate)
print("Referencia:", reference)

### Métrica ROUGE

https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499

https://github.com/pltrdy/rouge

In [None]:
# ROUGE scores of the example translation against its reference;
# the bare `scores` expression displays the result as the cell output.
scores = Rouge().get_scores(candidate, reference)
scores

### Métrica BLEU

https://aclanthology.org/P02-1040.pdf

In [None]:
# BLEU for the example pair: one hypothesis and one reference stream,
# each holding a single sentence. The cell displays the numeric score.
bleu_score = sacrebleu.corpus_bleu([candidate], [[reference]])
bleu_score.score

In [None]:
def bleu(candidate, reference):
    """Return the sacreBLEU corpus score for one candidate/reference pair.

    Both strings are wrapped into the single-sentence list structures that
    ``sacrebleu.corpus_bleu`` is called with.
    """
    hypotheses = [candidate]
    reference_streams = [[reference]]
    result = sacrebleu.corpus_bleu(hypotheses, reference_streams)
    return result.score

def rouge(candidate, reference):
    """Return the ROUGE-L F value of ``candidate`` scored against ``reference``."""
    pair_scores = Rouge().get_scores(candidate, reference)
    first_pair = pair_scores[0]
    return first_pair['rouge-l']['f']

### Dataset

https://www.dropbox.com/scl/fi/xtvls58le65vxjfkmuv9x/Sentence-pairs-in-Spanish-English-2024-07-03.tsv?rlkey=3la4vmua8d5f5ltdknfhmcmyt&st=6cdj4k63&dl=0

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package,
# i.e. a Colab runtime) so the dataset stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!wget -O Sentence_pairs_in_Spanish_English.tsv https://www.dropbox.com/scl/fi/ylj56w1b64tooyjdkvh9w/Sentence_pairs_in_Spanish_English.tsv?rlkey=h55bzeg30i12zohkqb28mooei&st=bbod2iu1&dl=0

In [None]:
# Column layout of the sentence-pair file (it has no header row).
column_names = ['id_es', 'spanish', 'id_en', 'english']

# Load the copy stored on Google Drive.
# quoting=csv.QUOTE_NONE reads double quotes verbatim: the file is presumably a
# raw tab-separated export (not quoted CSV), where default quote handling would
# strip quotes from sentences or merge rows on an unbalanced `"`.
data = pd.read_csv(
    '/content/drive/MyDrive/Sentence pairs in Spanish-English - 2024-07-03.tsv',
    sep='\t',
    names=column_names,
    header=None,
    quoting=csv.QUOTE_NONE,
)

In [None]:
# Column layout of the sentence-pair file (it has no header row).
column_names = ['id_es', 'spanish', 'id_en', 'english']

# Load the copy downloaded into the working directory.
# quoting=csv.QUOTE_NONE reads double quotes verbatim: the file is presumably a
# raw tab-separated export (not quoted CSV), where default quote handling would
# strip quotes from sentences or merge rows on an unbalanced `"`.
data = pd.read_csv(
    'Sentence_pairs_in_Spanish_English.tsv',
    sep='\t',
    names=column_names,
    header=None,
    quoting=csv.QUOTE_NONE,
)

# Show the first five rows using the notebook's rich table display.
data.head()

### NER (Named-entity recognition)

In [None]:
# Preview the first rows of the loaded sentence pairs.
data.head()
# data.describe()  # alternative: summary statistics of the table

In [None]:
# Smoke test: translate the first `length` rows and print the Spanish
# sources, the model outputs and the English references.
length = 5
spanish = data['spanish'].iloc[:length].tolist()
references = data['english'].iloc[:length].tolist()
candidates = list(map(translate, spanish))
print(spanish)
print(candidates)
print(references)

In [None]:
# Number of dataset rows to evaluate, and how often (in rows) progress is printed.
length, print_interval = 500, 20
# Spanish sentences of the first `length` rows, as a plain Python list.
data_spanish = data['spanish'].iloc[:length].tolist()

def make_columns():
    """Translate and score the sentences in the module-level ``data_spanish``.

    Rows are walked in order. A row whose Spanish sentence equals the previous
    row's is treated as an additional reference for the same sentence
    (this assumes rows sharing a sentence are adjacent in the file). For such
    rows the stored reference and scores are replaced only when the new
    reference improves BOTH the ROUGE and the BLEU score.

    Reads the globals ``data_spanish``, ``data`` and ``print_interval`` and
    calls ``translate``, ``rouge`` and ``bleu``. Prints the row index every
    ``print_interval`` rows as progress output.

    Returns:
        tuple: ``(spanish, references, candidates, bleu_scores, rogue_scores)``,
        five parallel lists with one entry per distinct consecutive sentence.
    """
    spanish = []
    candidates = []
    references = []
    bleu_scores = []
    rogue_scores = []

    english = data['english']

    # None can never equal a real sentence, so the first row always opens a
    # new entry (a string sentinel could collide with actual data).
    prev = None
    for i, source in enumerate(data_spanish):
        if i % print_interval == 0:
            print(i)

        # Positional lookup, consistent with the positional `data_spanish` list.
        new_reference = english.iloc[i]

        if source != prev:  # first reference for this sentence
            spanish.append(source)
            candidates.append(translate(source))
            references.append(new_reference)
            rogue_scores.append(rouge(candidates[-1], new_reference))
            bleu_scores.append(bleu(candidates[-1], new_reference))
        else:  # further reference: keep it only if it scores better on both metrics
            new_rogue_score = rouge(candidates[-1], new_reference)
            new_bleu_score = bleu(candidates[-1], new_reference)
            if new_rogue_score > rogue_scores[-1] and new_bleu_score > bleu_scores[-1]:
                references[-1] = new_reference
                rogue_scores[-1] = new_rogue_score
                bleu_scores[-1] = new_bleu_score
        prev = source

    return spanish, references, candidates, bleu_scores, rogue_scores


# Run the evaluation and unpack the five parallel result lists.
spanish, references, candidates, bleu_scores, rogue_scores = make_columns()

In [None]:
# Peek at the first five entries of every result column.
for col in (spanish, references, candidates, bleu_scores, rogue_scores):
    print(col[:5])

https://huggingface.co/flair/upos-english-fast

In [None]:
# Load the "fast" UPOS tagger checkpoint used for the POS-based metrics below.
tagger = SequenceTagger.load("flair/upos-english-fast")
# Alternative checkpoint (non-"fast" variant):
# tagger = SequenceTagger.load("flair/upos-english")

In [None]:
def text_to_pos(text):
    """Tag ``text`` with the module-level ``tagger`` and return the predicted
    label values joined by single spaces."""
    tagged = Sentence(text)
    tagger.predict(tagged)
    return " ".join(label.value for label in tagged.labels)

# Tag every reference/candidate pair and score the tag sequences with the
# same ROUGE and BLEU helpers used for the raw text.
pos_references = []
pos_candidates = []
rogue_pos = []
bleu_pos = []
for i, candidate_text in enumerate(candidates):
    if i % print_interval == 0:
        print(i)
    reference_tags = text_to_pos(references[i])
    candidate_tags = text_to_pos(candidate_text)
    pos_references.append(reference_tags)
    pos_candidates.append(candidate_tags)
    rogue_pos.append(rouge(candidate_tags, reference_tags))
    bleu_pos.append(bleu(candidate_tags, reference_tags))

# Peek at the first five entries of every POS column.
for col in (pos_references, pos_candidates, rogue_pos, bleu_pos):
    print(col[:5])

In [None]:
# Assemble all per-sentence lists into one results table, one column per list.
columns = {
    'spanish': spanish,
    'reference': references,
    'candidate': candidates,
    'bleu': bleu_scores,
    'rogue': rogue_scores,
    'pos_reference': pos_references,
    'pos_candidate': pos_candidates,
    'bleu_pos': bleu_pos,
    'rogue_pos': rogue_pos,
}
df = pd.DataFrame(columns)

In [None]:
# Preview the first rows of the assembled results table.
df.head()

In [None]:
# Save the results table as tab-separated values, without the index column.
# NOTE(review): the analysis section loads 'metrics500(15min).tsv', which is
# not the file written here — confirm which file is intended.
df.to_csv('metrics500fast(1+5min).tsv', sep='\t', index=False)

# Análisis

In [None]:
# Load a previously saved results table from a tab-separated file.
# NOTE(review): this file name differs from the one written by the save cell
# ('metrics500fast(1+5min).tsv'), so it must already exist on disk — confirm.
df = pd.read_csv('metrics500(15min).tsv', sep='\t')

In [None]:
# Plot the values of the `rogue` column in ascending order; the index is
# reset so the x axis is the rank of each value rather than its row label.
sorted_rogue = df['rogue'].sort_values()
sorted_rogue_reset = sorted_rogue.reset_index(drop=True)
sorted_rogue_reset.plot()


# Otros códigos útiles:

In [None]:
# Inline variant of the scoring loop: translate each distinct consecutive
# Spanish sentence and keep, among its references, the one that improves both
# the ROUGE and the BLEU score.
data_spanish = data['spanish'].tolist()[0:length]
spanish = []
candidates = []
references = []
bleu_scores = []
rogue_scores = []

# None can never equal a real sentence, so the first row always opens a new
# entry. The loop variable is not named `input` to avoid shadowing the builtin.
prev = None
for i, source in enumerate(data_spanish):
    # Positional lookup, consistent with the positional `data_spanish` list.
    new_reference = data['english'].iloc[i]
    if source != prev:
        spanish.append(source)
        candidates.append(translate(source))
        references.append(new_reference)
        new_rogue_score = rouge(candidates[-1], new_reference)
        rogue_scores.append(new_rogue_score)
        new_bleu_score = bleu(candidates[-1], new_reference)
        bleu_scores.append(new_bleu_score)
    else:  # repeated sentence: this row is an additional reference
        new_rogue_score = rouge(candidates[-1], new_reference)
        new_bleu_score = bleu(candidates[-1], new_reference)
        if new_rogue_score > rogue_scores[-1] and new_bleu_score > bleu_scores[-1]:
            references[-1] = new_reference
            rogue_scores[-1] = new_rogue_score
            bleu_scores[-1] = new_bleu_score
    prev = source


print(spanish)
print(candidates)
print(references)
print(bleu_scores)
print(rogue_scores)

In [None]:
# Variant that keeps ALL references: each distinct consecutive Spanish
# sentence is translated once and paired with the list of its English
# references.
data_spanish = data['spanish'].tolist()[0:length]
spanish = []
candidates = []
references = []

# None can never equal a real sentence, so the first row always opens a new
# entry. The loop variable is not named `input` to avoid shadowing the builtin.
prev = None
for i, source in enumerate(data_spanish):
    # Positional lookup, consistent with the positional `data_spanish` list.
    new_reference = data['english'].iloc[i]
    if source != prev:
        spanish.append(source)
        candidates.append(translate(source))
        references.append([new_reference])
    else:
        references[-1].append(new_reference)
    prev = source


print(spanish)
print(candidates)
print(references)

https://github.com/marcelomendoza/IIC3670/blob/main/codes/8%20-%20SEQUENCE%20LABELING%2C%20POS%20TAGGING%20Y%20NER.ipynb