In [None]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
import re
from tqdm import tqdm

# Checkpoint shared by the model and the processor.
MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
# NOTE(review): absolute, user-specific cache location -- adjust for your environment.
MODEL_CACHE_DIR = "/scratch/workspace/ctpham_umass_edu-llama/.cache/"

# Load the model in bfloat16 with FlashAttention-2 on the available device(s).
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    cache_dir=MODEL_CACHE_DIR,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

# Default processor for the same checkpoint (loaded without the custom cache_dir).
processor = AutoProcessor.from_pretrained(MODEL_ID)

## Two-stage experiment

In [None]:
# Stage 1 of the two-stage experiment: ask the model to extract the text of one page.
# The prompt is read through a context manager (so the handle is closed) with an
# explicit encoding (so the result does not depend on the platform default).
with open("prompts/text_extraction.md", encoding="utf-8") as prompt_file:
    extraction_prompt = prompt_file.read()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "data/030.jpg",
            },
            {
                "type": "text",
                "text": extraction_prompt,
            },
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the model's device instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: greedy decoding, then drop the prompt tokens from each generated sequence
generated_ids = model.generate(**inputs, max_new_tokens=2000, do_sample=False)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])

In [None]:
# Reverse order of extracted texts to fit manga reading order

def reverse_texts_with_tag_renaming(detected):
    """Reverse the <text_n> elements inside every <row_n> block and renumber them.

    Args:
        detected: String containing ``<row_n>...</row_n>`` blocks, each holding
            ``<text_n>...</text_n>`` elements.

    Returns:
        The input with every row rewritten so its texts appear in reverse order,
        renumbered from 1. Text outside row blocks is left untouched.
    """
    row_re = re.compile(r'(<row_\d+>)(.*?)(</row_\d+>)', re.DOTALL)
    text_re = re.compile(r'(<text_\d+>)(.*?)(</text_\d+>)', re.DOTALL)

    def _rebuild_row(row_match):
        opening, body, closing = row_match.group(1, 2, 3)
        # Only the inner content is kept; the tags are regenerated with new numbers.
        contents = [found[1] for found in text_re.findall(body)]
        contents.reverse()
        renumbered = [
            f"<text_{position}>\n{content}\n</text_{position}>"
            for position, content in enumerate(contents, start=1)
        ]
        return opening + "\n    " + "\n    ".join(renumbered) + "\n" + closing

    return row_re.sub(_rebuild_row, detected)

# Reverse the texts in each row of the first decoded output, renumber the tags,
# and print the result. `reversed_output` is what the translation prompt is
# formatted with in the next cell.
reversed_output = reverse_texts_with_tag_renaming(output_text[0])
print(reversed_output)

In [None]:
# Translating texts

# Stage 2: translate the reordered texts, again showing the page image to the model.
# The template is read through a context manager with an explicit encoding and then
# filled with the reversed extraction output via its `{orig}` placeholder.
# NOTE(review): the other prompts live under "prompts/"; confirm that
# "translation.md" in the working directory is the intended file.
with open("translation.md", encoding="utf-8") as prompt_file:
    translation_prompt = prompt_file.read().format(orig=reversed_output)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "data/030.jpg",
            },
            {
                "type": "text",
                "text": translation_prompt,
            },
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the model's device instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: greedy decoding, then drop the prompt tokens from each generated sequence
generated_ids = model.generate(**inputs, max_new_tokens=2000, do_sample=False)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])

## Translating detected text only

In [None]:
# Stage 1: Extracting text from comic strips
import pandas as pd

# NOTE(review): `json_structure` is loaded here but not used in this cell; the CSV
# it reads is produced by the "Clean json file" cell further down.
json_structure = pd.read_csv('data/annotation/annotation_cleaned.csv')

# Read the prompt through a context manager (closes the handle) with an explicit
# encoding (independent of the platform default).
with open("prompts/translation_json.md", encoding="utf-8") as prompt_file:
    translation_json_prompt = prompt_file.read()

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                #"image": "images/balloon_dream/ja/000.jpg",
                "image": "data/000.jpg",
            },
            {
                "type": "text",
                "text": translation_json_prompt,
            },
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# Follow the model's device instead of hard-coding "cuda".
inputs = inputs.to(model.device)

# Inference: greedy decoding, then drop the prompt tokens from each generated sequence
generated_ids = model.generate(**inputs, max_new_tokens=2000, do_sample=False)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text[0])

## Prep for large batches

In [None]:
# Clean json file
import json
import pandas as pd

def extract_book_data(json_data):
    """Flatten the nested book/page/text annotation structure into row dicts.

    Args:
        json_data: Iterable of book dicts. Each may carry "book_title" and a
            "pages" list; each page may carry "image_paths" (with a "ja" entry)
            and a "text" list of dicts holding "text_ja" / "text_en".

    Returns:
        A list with one dict per text entry that has a non-empty "text_ja",
        keyed by "book_title", "image_path", "text_ja" and "text_en".
        Missing fields default to the empty string.
    """
    def _iter_rows():
        for book in json_data:
            title = book.get("book_title", "")
            for page in book.get("pages", []):
                ja_image = page.get("image_paths", {}).get("ja", "")
                for entry in page.get("text", []):
                    japanese = entry.get("text_ja", "")
                    english = entry.get("text_en", "")
                    if not japanese:
                        # Entries without Japanese text are not emitted.
                        continue
                    yield {
                        "book_title": title,
                        "image_path": ja_image,
                        "text_ja": japanese,
                        "text_en": english,
                    }

    return list(_iter_rows())

# Read the raw annotation file
with open('data/annotation/annotation.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

# One row per Japanese text entry
books_data_structure = extract_book_data(json_data)
df = pd.DataFrame(
    books_data_structure,
    columns=["book_title", "image_path", "text_ja", "text_en"],
)

# Running position of each text within its image (0, 1, 2, ... per image_path)
df = df.assign(book_index=df.groupby('image_path').cumcount())

# "<image_path>_<position>" key combining the two columns above
df = df.assign(image_book_concat=df["image_path"] + "_" + df["book_index"].astype(str))

# Save the cleaned annotations
df.to_csv("data/annotation/annotation_cleaned.csv", index=False)

In [None]:
# Post-processing of the 7B outputs: load the CSV whose 'outputs' and 'image_path'
# columns are consumed by segment_outputs below.
import pandas as pd
df_new = pd.read_csv("data/annotation/qwen.csv")
def segment_outputs(row):
    """Split one model output into per-item records of original and translated text.

    The output is cut into items on blank lines. Within an item, the line that
    follows a line starting with ``<text>`` is taken as the original text and the
    line that follows a line starting with ``<translation>`` as the translation.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an 'outputs' string
            and an 'image_path' value.

    Returns:
        A list with one dict per item, keyed by 'text_ja', 'translated' and
        'image_path'. A field is "" when its tag is absent or when the tag is
        the last line of the item (previously that raised IndexError).
    """
    segmented_data = []

    # Parsing each item for 'orig' and 'translated' texts
    for item in row['outputs'].split("\n\n"):
        orig_text = ""
        translated_text = ""
        lines = item.split("\n")
        for i, line in enumerate(lines):
            line = line.strip()
            # The payload sits on the next line; truncated output may not have one.
            following = lines[i + 1].strip() if i + 1 < len(lines) else ""
            if line.startswith("<text>"):
                orig_text = following
            elif line.startswith("<translation>"):
                translated_text = following

        segmented_data.append({
            'text_ja': orig_text,
            'translated': translated_text,
            'image_path': row['image_path']
        })
    return segmented_data

# Apply the function to each row in the DataFrame and concatenate results
all_segmented_data = [segment for _, row in df_new.iterrows() for segment in segment_outputs(row)]
segmented_df = pd.DataFrame(all_segmented_data)
df = pd.read_csv("data/annotation/annotation_cleaned.csv")

# Inner-join with the annotations on (image_path, text_ja) so the saved file carries
# the annotation columns (e.g. text_en) next to the model translation.
# The merged frame is the final artifact: it is no longer overwritten afterwards with
# the unmerged `segmented_df`, which lacks those columns.
segmented_df.merge(df, on=["image_path", 'text_ja']).reset_index(drop=True).to_csv("data/annotation/qwen_7b_final.csv", index=False)

In [None]:
# Post-processing of the 72B outputs: load the CSV whose 'outputs' and 'image_path'
# columns are consumed by segment_outputs below.
import pandas as pd
df_new = pd.read_csv("data/annotation/qwen_72.csv")
def segment_outputs(row):
    """Split one model output into per-item records of original and translated text.

    The output is cut into items on blank lines. Within an item, the line that
    follows a line starting with ``<text>`` is taken as the original text and the
    line that follows a line starting with ``<translation>`` as the translation.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with an 'outputs' string
            and an 'image_path' value.

    Returns:
        A list with one dict per item, keyed by 'text_ja', 'translated' and
        'image_path'. A field is "" when its tag is absent or when the tag is
        the last line of the item (previously that raised IndexError).
    """
    segmented_data = []

    # Parsing each item for 'orig' and 'translated' texts
    for item in row['outputs'].split("\n\n"):
        orig_text = ""
        translated_text = ""
        lines = item.split("\n")
        for i, line in enumerate(lines):
            line = line.strip()
            # The payload sits on the next line; truncated output may not have one.
            following = lines[i + 1].strip() if i + 1 < len(lines) else ""
            if line.startswith("<text>"):
                orig_text = following
            elif line.startswith("<translation>"):
                translated_text = following

        segmented_data.append({
            'text_ja': orig_text,
            'translated': translated_text,
            'image_path': row['image_path']
        })
    return segmented_data

# Flatten every model output into one record per detected text item
all_segmented_data = []
for _, row in df_new.iterrows():
    all_segmented_data.extend(segment_outputs(row))
segmented_df = pd.DataFrame(all_segmented_data)

# Inner-join with the cleaned annotations on (image_path, text_ja) and save the result
df = pd.read_csv("data/annotation/annotation_cleaned.csv")
merged_72b = segmented_df.merge(df, on=["image_path", 'text_ja']).reset_index(drop=True)
merged_72b.to_csv("data/annotation/qwen_72b_final.csv", index=False)

## Metrics

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
import nltk
import numpy as np
import pandas as pd
import sacrebleu
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score
from collections import Counter
from typing import List
import torch
from rouge_score import rouge_scorer

# Load the translation results stored in openmantra_translation-final.csv and preview them.
# NOTE(review): the variable name suggests these are the 72B model's outputs -- confirm.
seventytwo = pd.read_csv("../data/output/openmantra_translation-final.csv")
seventytwo.head()

In [None]:
# Load the 7B results; the merge cell below takes 'translated' and 'text_en' from this frame's side.
seven = pd.read_csv('../data/output/qwen_7b_final.csv')
seven.head()

In [None]:
# List the available columns before choosing merge keys
seventytwo.columns

In [None]:
# Inner-join the two result sets on (Japanese text, page path), keep the columns
# needed for the comparison, and save them.
comparison_columns = [
    'detection_path', 'path', 'coordinates', 'outputs', 'original',
    'translation', 'translated', 'text_en',
]
merged_results = seventytwo.merge(
    seven,
    how="inner",
    left_on=["original", "path"],
    right_on=["text_ja", "image_path"],
)
df = merged_results[comparison_columns]
df.to_csv("../data/output/openmantra_comp.csv", index=False)

In [None]:
import pandas as pd
import requests

DEEPL_URL = "https://api-free.deepl.com/v2/translate"  # Use 'api.deepl.com' if you have a paid plan.
DEEPL_TIMEOUT_SECONDS = 30  # upper bound for a single request so one stalled call cannot hang the run


def translate_text(text):
    """Translate one text to English through the DeepL REST API.

    Args:
        text: Text to translate. Non-string values (e.g. NaN/None from missing
            cells) and blank strings are returned unchanged without calling the API.

    Returns:
        The translated text on success. If the request fails (network error or
        non-200 status) the problem is printed and the original text is returned.

    Note:
        Reads the API key from the global ``DEEPL_API_KEY``, which must be defined
        before calling (it is not set in this cell); prefer loading it from an
        environment variable rather than hard-coding it in the notebook.
    """
    if not isinstance(text, str) or not text.strip():  # Skip missing/empty text
        return text
    payload = {
        "text": text,
        "target_lang": "EN"  # English target language
    }
    # Header-based authentication keeps the key out of the request body.
    headers = {"Authorization": f"DeepL-Auth-Key {DEEPL_API_KEY}"}
    try:
        response = requests.post(
            DEEPL_URL, data=payload, headers=headers, timeout=DEEPL_TIMEOUT_SECONDS
        )
    except requests.RequestException as exc:
        # Same fallback as for HTTP errors, so one failed call does not abort the whole apply().
        print(f"Error translating text: {exc}")
        return text
    if response.status_code == 200:
        return response.json()["translations"][0]["text"]
    print(f"Error translating text: {response.status_code}, {response.text}")
    return text  # Return original text if translation fails

# Translate every 'original' text with DeepL (one API call per row), store the result
# in a new 'deepl' column, and save the comparison table to the same CSV as above.
df['deepl'] = df['original'].apply(translate_text)
df.to_csv("../data/output/openmantra_comp.csv", index=False)

In [None]:
# Re-save the comparison table (same path the previous cell already wrote to)
df.to_csv("../data/output/openmantra_comp.csv", index=False)

In [None]:
from sacrebleu import corpus_bleu

# Score the DeepL translations (df['deepl']) against the English references (df['text_en']).
hypotheses = df['deepl'].tolist()  # List of system translations
# sacrebleu expects a list of reference *streams*: each inner list holds one reference
# per hypothesis, in the same order (the layout the chrF call below already uses).
references = [df['text_en'].tolist()]

# Calculate SacreBLEU (module-qualified so it cannot be confused with NLTK's corpus_bleu)
sacrebleu_score = sacrebleu.corpus_bleu(hypotheses, references).score
print(f"SacreBLEU Score: {sacrebleu_score}")

# Calculate ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores = [scorer.score(ref, hyp)['rougeL'].fmeasure for ref, hyp in zip(df['text_en'], df['deepl'])]
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores) if rouge_l_scores else 0.0
print(f"Average ROUGE-L Score: {average_rouge_l}")

# METEOR Score
# Tokenize references and hypotheses
references = [ref.split() for ref in df['text_en'].tolist()]  # Tokenize each reference translation
hypotheses = [hyp.split() for hyp in df['deepl'].tolist()]  # Tokenize each system translation

# Calculate METEOR scores
meteor_scores = [meteor_score([ref], hyp) for ref, hyp in zip(references, hypotheses)]
meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

# TER Score (using edit distance normalized by reference length)
def ter_score(hypotheses: List[str], references: List[List[str]]) -> float:
    """Approximate TER: word-level edit distance divided by reference word count.

    Args:
        hypotheses: System translations as plain (untokenized) strings.
        references: For every hypothesis, a list of reference strings; only the
            first reference is used.

    Returns:
        Total word-level Levenshtein edits (no shift operations) over the total
        number of reference words, or 0 when the references contain no words.
    """
    total_edits = 0
    total_ref_length = 0
    for hyp, refs in zip(hypotheses, references):
        ref_tokens = refs[0].split()
        # Edits and length are both counted in words so the ratio is meaningful.
        total_edits += nltk.edit_distance(hyp.split(), ref_tokens)
        total_ref_length += len(ref_tokens)
    return total_edits / total_ref_length if total_ref_length > 0 else 0

# Pass raw strings (not the token lists built for METEOR above).
ter = ter_score(df['deepl'].tolist(), [[ref] for ref in df['text_en'].tolist()])

# ChrF score
hypotheses = df['deepl'].tolist()  # Machine-generated translations as a list of full sentences
references = [df['text_en'].tolist()]  # Wrap in a list to match sacrebleu's expected format for multiple references

# Calculate chrF score
chrf = sacrebleu.corpus_chrf(hypotheses, references).score

hypotheses = df['deepl'].tolist()  # Machine-generated translations
references = df['text_en'].tolist()  # Reference translations

# Ensure lengths match
assert len(hypotheses) == len(references), "Mismatch in length between hypotheses and references."
# BERTScore
P, R, F1 = bert_score(hypotheses, references, lang="en")
bertscore_f1 = F1.mean().item()

# BLEU Score (NLTK), imported under an alias so sacrebleu's corpus_bleu is not shadowed
from nltk.translate.bleu_score import corpus_bleu as nltk_corpus_bleu, SmoothingFunction

# Ensure hypotheses and references are tokenized as lists of tokens
hypotheses = [hyp.split() for hyp in df['deepl'].tolist()]  # Tokenized system translations
references = [[ref.split()] for ref in df['text_en'].tolist()]  # Tokenized references (wrapped in a list for each)

# Calculate BLEU using NLTK with smoothing function
smoothing_function = SmoothingFunction().method4  # Use method4 for better handling of short sentences
bleu_score = nltk_corpus_bleu(references, hypotheses, smoothing_function=smoothing_function)

print(f"BLEU Score: {bleu_score}")

# Collect all metrics
metrics = {
    "SacreBLEU": sacrebleu_score,
    "ROUGE-L": average_rouge_l,
    "METEOR": meteor,
    "TER": ter,
    "ChrF": chrf,
    "BERTScore F1": bertscore_f1,
    "BLEU": bleu_score
}

metrics

In [None]:
from sacrebleu import corpus_bleu

# Score the translations in df['translation'] against the English references (df['text_en']).
hypotheses = df['translation'].tolist()  # List of system translations
# sacrebleu expects a list of reference *streams*: each inner list holds one reference
# per hypothesis, in the same order (the layout the chrF call below already uses).
references = [df['text_en'].tolist()]

# Calculate SacreBLEU (module-qualified so it cannot be confused with NLTK's corpus_bleu)
sacrebleu_score = sacrebleu.corpus_bleu(hypotheses, references).score
print(f"SacreBLEU Score: {sacrebleu_score}")

# Calculate ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores = [scorer.score(ref, hyp)['rougeL'].fmeasure for ref, hyp in zip(df['text_en'], df['translation'])]
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores) if rouge_l_scores else 0.0
print(f"Average ROUGE-L Score: {average_rouge_l}")

# METEOR Score
# Tokenize references and hypotheses
references = [ref.split() for ref in df['text_en'].tolist()]  # Tokenize each reference translation
hypotheses = [hyp.split() for hyp in df['translation'].tolist()]  # Tokenize each system translation

# Calculate METEOR scores
meteor_scores = [meteor_score([ref], hyp) for ref, hyp in zip(references, hypotheses)]
meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

# TER Score (using edit distance normalized by reference length)
def ter_score(hypotheses: List[str], references: List[List[str]]) -> float:
    """Approximate TER: word-level edit distance divided by reference word count.

    Args:
        hypotheses: System translations as plain (untokenized) strings.
        references: For every hypothesis, a list of reference strings; only the
            first reference is used.

    Returns:
        Total word-level Levenshtein edits (no shift operations) over the total
        number of reference words, or 0 when the references contain no words.
    """
    total_edits = 0
    total_ref_length = 0
    for hyp, refs in zip(hypotheses, references):
        ref_tokens = refs[0].split()
        # Edits and length are both counted in words so the ratio is meaningful.
        total_edits += nltk.edit_distance(hyp.split(), ref_tokens)
        total_ref_length += len(ref_tokens)
    return total_edits / total_ref_length if total_ref_length > 0 else 0

# Pass raw strings (not the token lists built for METEOR above).
ter = ter_score(df['translation'].tolist(), [[ref] for ref in df['text_en'].tolist()])

# ChrF score
hypotheses = df['translation'].tolist()  # Machine-generated translations as a list of full sentences
references = [df['text_en'].tolist()]  # Wrap in a list to match sacrebleu's expected format for multiple references

# Calculate chrF score
chrf = sacrebleu.corpus_chrf(hypotheses, references).score

hypotheses = df['translation'].tolist()  # Machine-generated translations
references = df['text_en'].tolist()  # Reference translations

# Ensure lengths match
assert len(hypotheses) == len(references), "Mismatch in length between hypotheses and references."
# BERTScore
P, R, F1 = bert_score(hypotheses, references, lang="en")
bertscore_f1 = F1.mean().item()

# BLEU Score (NLTK), imported under an alias so sacrebleu's corpus_bleu is not shadowed
from nltk.translate.bleu_score import corpus_bleu as nltk_corpus_bleu, SmoothingFunction

# Ensure hypotheses and references are tokenized as lists of tokens
hypotheses = [hyp.split() for hyp in df['translation'].tolist()]  # Tokenized system translations
references = [[ref.split()] for ref in df['text_en'].tolist()]  # Tokenized references (wrapped in a list for each)

# Calculate BLEU using NLTK with smoothing function
smoothing_function = SmoothingFunction().method4  # Use method4 for better handling of short sentences
bleu_score = nltk_corpus_bleu(references, hypotheses, smoothing_function=smoothing_function)

print(f"BLEU Score: {bleu_score}")

# Collect all metrics
metrics = {
    "SacreBLEU": sacrebleu_score,
    "ROUGE-L": average_rouge_l,
    "METEOR": meteor,
    "TER": ter,
    "ChrF": chrf,
    "BERTScore F1": bertscore_f1,
    "BLEU": bleu_score
}

metrics

In [None]:
from sacrebleu import corpus_bleu

# Score the translations in df['translated'] against the English references (df['text_en']).
hypotheses = df['translated'].tolist()  # List of system translations
# sacrebleu expects a list of reference *streams*: each inner list holds one reference
# per hypothesis, in the same order (the layout the chrF call below already uses).
references = [df['text_en'].tolist()]

# Calculate SacreBLEU (module-qualified so it cannot be confused with NLTK's corpus_bleu)
sacrebleu_score = sacrebleu.corpus_bleu(hypotheses, references).score
print(f"SacreBLEU Score: {sacrebleu_score}")

# Calculate ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores = [scorer.score(ref, hyp)['rougeL'].fmeasure for ref, hyp in zip(df['text_en'], df['translated'])]
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores) if rouge_l_scores else 0.0
print(f"Average ROUGE-L Score: {average_rouge_l}")

# METEOR Score
# Tokenize references and hypotheses
references = [ref.split() for ref in df['text_en'].tolist()]  # Tokenize each reference translation
hypotheses = [hyp.split() for hyp in df['translated'].tolist()]  # Tokenize each system translation

# Calculate METEOR scores
meteor_scores = [meteor_score([ref], hyp) for ref, hyp in zip(references, hypotheses)]
meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

# TER Score (using edit distance normalized by reference length)
def ter_score(hypotheses: List[str], references: List[List[str]]) -> float:
    """Approximate TER: word-level edit distance divided by reference word count.

    Args:
        hypotheses: System translations as plain (untokenized) strings.
        references: For every hypothesis, a list of reference strings; only the
            first reference is used.

    Returns:
        Total word-level Levenshtein edits (no shift operations) over the total
        number of reference words, or 0 when the references contain no words.
    """
    total_edits = 0
    total_ref_length = 0
    for hyp, refs in zip(hypotheses, references):
        ref_tokens = refs[0].split()
        # Edits and length are both counted in words so the ratio is meaningful.
        total_edits += nltk.edit_distance(hyp.split(), ref_tokens)
        total_ref_length += len(ref_tokens)
    return total_edits / total_ref_length if total_ref_length > 0 else 0

# Pass raw strings (not the token lists built for METEOR above).
ter = ter_score(df['translated'].tolist(), [[ref] for ref in df['text_en'].tolist()])

# ChrF score
hypotheses = df['translated'].tolist()  # Machine-generated translations as a list of full sentences
references = [df['text_en'].tolist()]  # Wrap in a list to match sacrebleu's expected format for multiple references

# Calculate chrF score
chrf = sacrebleu.corpus_chrf(hypotheses, references).score

hypotheses = df['translated'].tolist()  # Machine-generated translations
references = df['text_en'].tolist()  # Reference translations

# Ensure lengths match
assert len(hypotheses) == len(references), "Mismatch in length between hypotheses and references."
# BERTScore
P, R, F1 = bert_score(hypotheses, references, lang="en")
bertscore_f1 = F1.mean().item()

# BLEU Score (NLTK), imported under an alias so sacrebleu's corpus_bleu is not shadowed
from nltk.translate.bleu_score import corpus_bleu as nltk_corpus_bleu, SmoothingFunction

# Ensure hypotheses and references are tokenized as lists of tokens
hypotheses = [hyp.split() for hyp in df['translated'].tolist()]  # Tokenized system translations
references = [[ref.split()] for ref in df['text_en'].tolist()]  # Tokenized references (wrapped in a list for each)

# Calculate BLEU using NLTK with smoothing function
smoothing_function = SmoothingFunction().method4  # Use method4 for better handling of short sentences
bleu_score = nltk_corpus_bleu(references, hypotheses, smoothing_function=smoothing_function)

print(f"BLEU Score: {bleu_score}")

# Collect all metrics
metrics = {
    "SacreBLEU": sacrebleu_score,
    "ROUGE-L": average_rouge_l,
    "METEOR": meteor,
    "TER": ter,
    "ChrF": chrf,
    "BERTScore F1": bertscore_f1,
    "BLEU": bleu_score
}

metrics

In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
import nltk
import numpy as np
import sacrebleu
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import score as bert_score
from collections import Counter
from typing import List
import torch
from rouge_score import rouge_scorer

# Load data into a DataFrame
df = pd.read_csv("data/output/qwen_7b.csv")

# Filter rows to only include those where 'text_translated' and 'text_en' are strings
df = df[df['text_translated'].apply(lambda x: isinstance(x, str)) & df['text_en'].apply(lambda x: isinstance(x, str))]

# Prepare hypotheses and references
hypotheses = df['text_translated'].tolist()  # List of system translations
# sacrebleu expects a list of reference *streams*: each inner list holds one reference
# per hypothesis, in the same order (the layout the chrF call below already uses).
references = [df['text_en'].tolist()]

# Calculate SacreBLEU
sacrebleu_score = sacrebleu.corpus_bleu(hypotheses, references).score
print(f"SacreBLEU Score: {sacrebleu_score}")

# Calculate ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores = [scorer.score(ref, hyp)['rougeL'].fmeasure for ref, hyp in zip(df['text_en'], df['text_translated'])]
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores) if rouge_l_scores else 0.0
print(f"Average ROUGE-L Score: {average_rouge_l}")

# METEOR Score
# Tokenize references and hypotheses
references = [ref.split() for ref in df['text_en'].tolist()]  # Tokenize each reference translation
hypotheses = [hyp.split() for hyp in df['text_translated'].tolist()]  # Tokenize each system translation

# Calculate METEOR scores
meteor_scores = [meteor_score([ref], hyp) for ref, hyp in zip(references, hypotheses)]
meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

# TER Score (using edit distance normalized by reference length)
def ter_score(hypotheses: List[str], references: List[List[str]]) -> float:
    """Approximate TER: word-level edit distance divided by reference word count.

    Args:
        hypotheses: System translations as plain (untokenized) strings.
        references: For every hypothesis, a list of reference strings; only the
            first reference is used.

    Returns:
        Total word-level Levenshtein edits (no shift operations) over the total
        number of reference words, or 0 when the references contain no words.
    """
    total_edits = 0
    total_ref_length = 0
    for hyp, refs in zip(hypotheses, references):
        ref_tokens = refs[0].split()
        # Edits and length are both counted in words (previously characters vs. words).
        total_edits += nltk.edit_distance(hyp.split(), ref_tokens)
        total_ref_length += len(ref_tokens)
    return total_edits / total_ref_length if total_ref_length > 0 else 0

ter = ter_score(df['text_translated'].tolist(), [[ref] for ref in df['text_en'].tolist()])

# ChrF score
hypotheses = df['text_translated'].tolist()  # Machine-generated translations as a list of full sentences
references = [df['text_en'].tolist()]  # Wrap in a list to match sacrebleu's expected format for multiple references

# Calculate chrF score
chrf = sacrebleu.corpus_chrf(hypotheses, references).score

# BERTScore
P, R, F1 = bert_score(hypotheses, df['text_en'].tolist(), lang="en")
bertscore_f1 = F1.mean().item()

# Organize results ("BLEU" here is the SacreBLEU corpus score computed above)
metrics = {
    "BLEU": sacrebleu_score,
    "ROUGE-L": average_rouge_l,
    "METEOR": meteor,
    "TER": ter,
    "ChrF": chrf,
    "BERTScore F1": bertscore_f1
}

metrics


In [None]:
# Display the DataFrame currently bound to `df`
df

In [None]:
# Compare the row counts of the 7B and 72B final files.
# The lengths are only printed; nothing is asserted here.
qwen_7 = pd.read_csv("data/annotation/qwen_7b_final.csv")
qwen_72 = pd.read_csv("data/annotation/qwen_72b_final.csv")

print(len(qwen_7), len(qwen_72))

In [None]:
# Display the 72B final table
qwen_72

In [None]:
# Show the 72B table with duplicate (text_ja, image_path) pairs removed.
# The result is only displayed; `qwen_72` itself is not modified.
qwen_72.drop_duplicates(subset=['text_ja', 'image_path'])

In [None]:
# Load and display the cleaned annotations (written by the "Clean json file" cell)
orig = pd.read_csv("data/annotation/annotation_cleaned.csv")
orig

In [None]:
# Load and display the raw 72B outputs; note this rebinds the notebook-wide `df`
df = pd.read_csv('data/output/qwen_72b_raw.csv')
df

In [None]:
def extract_tag_text(text, tag, random=False):
    '''
    Return the first occurrence of the content enclosed by ``<tag>``.

    With ``random=False`` the content must be closed by ``</tag>`` and is returned
    as a string. With ``random=True`` the content ends at the next ``<...>`` of any
    name and the result is a ``(content, next_tag_name)`` tuple.
    Raises IndexError when nothing matches.
    '''
    if random:
        regex = rf'<{tag}>(.*?)<(.*?)>'
    else:
        regex = rf'<{tag}>(.*?)</{tag}>'

    # DOTALL lets the content span several lines.
    return re.findall(regex, text, flags=re.DOTALL)[0]

import pandas as pd
import re
from tqdm import tqdm
orig_df = pd.read_csv('data/annotation/annotation_cleaned.csv')
new_df = pd.read_csv('data/output/qwen_7b_raw.csv')
outputs_cleaned, orig_cleaned = [], []
# Index label (in orig_df) of every collected entry, so the results can be written
# back to the right rows even if orig_df is not sorted by image_path.
row_labels = []
paths = sorted(list(set(orig_df.image_path.tolist())))
for i, image_path in enumerate(tqdm(paths)):
    # Keep orig_df's own index on this slice (no reset) to remember the row labels.
    orig = orig_df[orig_df['image_path'] == image_path]
    new = new_df[new_df['image_path'] == image_path].reset_index(drop=True)
    assert len(new) == 1
    gtruth_len = len(orig)
    raw_output = new['outputs'].tolist()[0]
    expected_texts = orig['text_ja'].tolist()

    for j in range(0, gtruth_len):
        row_labels.append(orig.index[j])
        try:
            content = extract_tag_text(raw_output, f"item_{j}").strip()
            o = extract_tag_text(content, f"text").strip()
            t = extract_tag_text(content, f"translation").strip()
            assert o == expected_texts[j].strip()
            orig_cleaned.append(o)
            outputs_cleaned.append(t)
        except Exception:
            # Best effort: a missing/mismatching item gets a placeholder translation.
            print(gtruth_len)
            print(f"Missing content at index {j} of {image_path}")
            print(raw_output)
            print(expected_texts)
            orig_cleaned.append(expected_texts[j])
            outputs_cleaned.append("N/A")
            # No `break`: every ground-truth row needs exactly one entry, otherwise
            # the column assignment below fails on a length mismatch.


# Align on the recorded row labels rather than on list position.
orig_df['orig'] = pd.Series(orig_cleaned, index=row_labels)
orig_df['translation'] = pd.Series(outputs_cleaned, index=row_labels)
# Element-wise check reduced with .all(); asserting on a Series directly raises.
assert (orig_df['text_ja'].str.strip() == orig_df['orig'].str.strip()).all()
orig_df

In [None]:
import re
text = """<item_0>    <text>     夢の翼は    </text>    <translation>     The wings of dreams are    </translation></item_0><item_1>    <text>     蝋で固めてある    </text>    <translation>     Fixed with a glue    </translation></item_1><item_2>    <text>     高く翔ぶほど    </text>    <translation>     The higher it flies    </translation></item_2><item_3>    <text>     太陽に溶かされてしまう    </text>    <translation>     It will be melted by the sun    </translation></item_3><item_4>    <text>     ーだったら    </text>    <translation>     If it were    </translation></item_4><item_5>    <text>     最初から翔ばない方がいい    </text>    <translation>     It's better not to fly from the start    </translation></item_5>"""
def extract_tag_text(text, tag, random=False):
    '''
    Return the first occurrence of the content enclosed by ``<tag>``, or None.

    With ``random=False`` the content must be closed by ``</tag>`` and is returned
    as a string. With ``random=True`` the content ends at the next ``<...>`` of any
    name and the result is a ``(content, next_tag_name)`` tuple.
    '''
    regex = rf'<{tag}>(.*?)<(.*?)>' if random else rf'<{tag}>(.*?)</{tag}>'
    hits = re.findall(regex, text, flags=re.DOTALL)
    if not hits:
        return None
    return hits[0]

# Quick check: print the inner content of the first <item_0> block of the sample text
print(extract_tag_text(text, 'item_0'))

In [None]:
import pandas as pd 
import re

# Load the 7B results; the 'outputs' and 'text_ja' columns are read by extract_translations
df = pd.read_csv('data/output/qwen_7b.csv')
def extract_translations(row):
    """Return the translation the model paired with this row's Japanese text.

    The raw output is scanned for ``<text>...</text>`` and
    ``<translation>...</translation>`` pairs. Tag contents may span several lines
    and may be padded with whitespace; both are handled by matching with DOTALL
    and stripping before comparison.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with 'outputs' (raw model
            output) and 'text_ja' (the Japanese text to look up).

    Returns:
        The stripped translation at the same position as the matching ``<text>``
        entry, or "" when the text is not present, has no ``<text>`` entry of its
        own, or has no translation at that position. Missing (non-string) cells
        are treated as "not detected".
    """
    outputs = row['outputs']
    wanted = row['text_ja']
    if not isinstance(outputs, str) or not isinstance(wanted, str):
        print("Not detected")
        return ""
    wanted = wanted.strip()
    if wanted not in outputs:
        print("Not detected")
        return ""

    # Texts and translations are paired by their order of appearance.
    texts = [found.strip() for found in re.findall(r"<text>(.*?)</text>", outputs, re.DOTALL)]
    translations = [
        found.strip()
        for found in re.findall(r"<translation>(.*?)</translation>", outputs, re.DOTALL)
    ]
    try:
        idx = texts.index(wanted)
    except ValueError:
        return ""
    # Return corresponding translation if available
    return translations[idx] if idx < len(translations) else ""

# Apply function to each row and store the result in a new 'extracted_translation' column.
# NOTE: the frame is written back to the same CSV it was read from in this cell,
# so the input file is modified in place.
df['extracted_translation'] = df.apply(extract_translations, axis=1)
df.to_csv("data/output/qwen_7b.csv", index=False)


df

In [None]:
# Re-read the saved file to inspect what was written
pd.read_csv("data/output/qwen_7b.csv")

In [None]:

# Load and display the cleaned annotations; note this rebinds the notebook-wide `df`
df = pd.read_csv('data/annotation/annotation_cleaned.csv')
df

In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
import nltk
import numpy as np
import sacrebleu
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import corpus_bleu as nltk_corpus_bleu
from bert_score import score as bert_score
from collections import Counter
from typing import List
import torch
from rouge_score import rouge_scorer

# Load data into a DataFrame
df = pd.read_csv("data/output/qwen_7b.csv")

# Filter rows to only include those where 'text_translated' and 'text_en' are strings
df = df[df['text_translated'].apply(lambda x: isinstance(x, str)) & df['text_en'].apply(lambda x: isinstance(x, str))]

# Prepare hypotheses and references
hypotheses = df['text_translated'].tolist()  # List of system translations
references = [[ref.split()] for ref in df['text_en'].tolist()]  # Wrap each reference in a list for corpus_bleu

# Calculate SacreBLEU
# sacrebleu expects a list of reference *streams*: each inner list holds one reference
# per hypothesis, in the same order (the layout the chrF call below already uses).
sacrebleu_score = sacrebleu.corpus_bleu(hypotheses, [df['text_en'].tolist()]).score
print(f"SacreBLEU Score: {sacrebleu_score}")

# Calculate BLEU using NLTK (tokenized hypotheses, one list of tokenized references per hypothesis)
bleu_score = nltk_corpus_bleu(references, [hyp.split() for hyp in hypotheses])
print(f"BLEU Score (NLTK): {bleu_score}")

# Calculate ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
rouge_l_scores = [scorer.score(ref, hyp)['rougeL'].fmeasure for ref, hyp in zip(df['text_en'], df['text_translated'])]
average_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores) if rouge_l_scores else 0.0
print(f"Average ROUGE-L Score: {average_rouge_l}")

# METEOR Score
# Tokenize references and hypotheses
references = [ref.split() for ref in df['text_en'].tolist()]  # Tokenize each reference translation
hypotheses = [hyp.split() for hyp in df['text_translated'].tolist()]  # Tokenize each system translation

# Calculate METEOR scores
meteor_scores = [meteor_score([ref], hyp) for ref, hyp in zip(references, hypotheses)]
meteor = sum(meteor_scores) / len(meteor_scores) if meteor_scores else 0.0

# TER Score (using edit distance normalized by reference length)
def ter_score(hypotheses: List[str], references: List[List[str]]) -> float:
    """Approximate TER: word-level edit distance divided by reference word count.

    Args:
        hypotheses: System translations as plain (untokenized) strings.
        references: For every hypothesis, a list of reference strings; only the
            first reference is used.

    Returns:
        Total word-level Levenshtein edits (no shift operations) over the total
        number of reference words, or 0 when the references contain no words.
    """
    total_edits = 0
    total_ref_length = 0
    for hyp, refs in zip(hypotheses, references):
        ref_tokens = refs[0].split()
        # Edits and length are both counted in words (previously characters vs. words).
        total_edits += nltk.edit_distance(hyp.split(), ref_tokens)
        total_ref_length += len(ref_tokens)
    return total_edits / total_ref_length if total_ref_length > 0 else 0

ter = ter_score(df['text_translated'].tolist(), [[ref] for ref in df['text_en'].tolist()])

# ChrF score
hypotheses = df['text_translated'].tolist()  # Machine-generated translations as a list of full sentences
references = [df['text_en'].tolist()]  # Wrap in a list to match sacrebleu's expected format for multiple references

# Calculate chrF score
chrf = sacrebleu.corpus_chrf(hypotheses, references).score

# BERTScore
P, R, F1 = bert_score(hypotheses, df['text_en'].tolist(), lang="en")
bertscore_f1 = F1.mean().item()

# Organize results
metrics = {
    "SacreBLEU": sacrebleu_score,
    "BLEU (NLTK)": bleu_score,
    "ROUGE-L": average_rouge_l,
    "METEOR": meteor,
    "TER": ter,
    "ChrF": chrf,
    "BERTScore F1": bertscore_f1
}

metrics
