In [2]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/LaBSE')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import re
def evaluate(predicted_string, alignment_string):
    # # Example usage
    # alignment_string = "5-6 3-4 25p22 5-7 2p3 25p21 4-5 11-14 12-13 13-15 27p20 9-11 25p23 8-8 22-25 20-27 17-18 1-1 28-28 26p20 18-19 6p9 1-2 7-10 10-12 21-24 15-17 14-16 23-26"
    # predicted_string = "5-6 3-4 25-22 ..."  # Replace with your model's predicted alignments

    def parse_alignments(alignment_string):
        sure_alignments = set()
        possible_alignments = set()

        # Split the string into individual alignments
        alignments = alignment_string.split()

        for alignment in alignments:
            if 'p' in alignment:
                # Possible alignment
                aligned_words = tuple(map(int, alignment.split('p')))
                possible_alignments.add(aligned_words)
            else:
                # Sure alignment
                aligned_words = tuple(map(int, alignment.split('-')))
                sure_alignments.add(aligned_words)

        return sure_alignments, possible_alignments

    def calculate_f1(predicted_alignments, sure_alignments, possible_alignments):
        a_and_s = len(predicted_alignments.intersection(sure_alignments))
        a_and_p = len(predicted_alignments.intersection(possible_alignments))
        prec = a_and_p / len(predicted_alignments) if len(predicted_alignments) > 0 else 0
        rec = a_and_s / len(sure_alignments) if len(sure_alignments) > 0 else 0

        if prec + rec == 0:
            return 0
        return 2 * (prec * rec) / (prec + rec)

    def calculate_aer(predicted_alignments, sure_alignments, possible_alignments):
        a_and_s = len(predicted_alignments.intersection(sure_alignments))
        a_and_p = len(predicted_alignments.intersection(possible_alignments))
        return 1 - (a_and_s + a_and_p) / (len(predicted_alignments) + len(sure_alignments))

    
    # Parse alignments
    sure_alignments, possible_alignments = parse_alignments(alignment_string)
    predicted_alignments = parse_alignments(predicted_string)[0]  # Assuming model predicts only sure alignments

    # Calculate metrics
    f1_score = calculate_f1(predicted_alignments, sure_alignments, possible_alignments.union(sure_alignments))
    aer = calculate_aer(predicted_alignments, sure_alignments, possible_alignments.union(sure_alignments))

    # print("F1 Score:", f1_score)
    # print("AER:", aer)
    return {"F1 Score": f1_score, "AER": aer}


In [234]:
import torch
import torch.optim as optim

# Assuming x and y are your data tensors
# x: tensor of shape [m, dim]
# y: tensor of shape [n, dim]
def find_avg_vector2(x, y):
    def compute_l2_distance(tensor_a, tensor_b):
        return torch.norm(tensor_a - tensor_b, dim=1)

    def objective_initialization(c, x, y, alpha):
        avg_distance = (torch.mean(compute_l2_distance(x, c)) + torch.mean(compute_l2_distance(y, c))) / 2
        distance_diff = torch.mean(torch.abs(compute_l2_distance(x, c) - compute_l2_distance(y, c)))
        return alpha * avg_distance + (1 - alpha) * distance_diff

    def variance_objective(c, x, y, lambda_weight):
        x_distances = compute_l2_distance(x, c)
        y_distances = compute_l2_distance(y, c)
        var_x = torch.var(x_distances)
        var_diff = torch.var(torch.abs(x_distances.view(-1, 1) - y_distances.view(1, -1)))
        return lambda_weight * var_x + (1 - lambda_weight) * var_diff

    # Parameters
    alpha = 0.5
    lambda_weight = 0.5
    learning_rate = 0.01
    max_iterations = 1000
    convergence_threshold = 1e-6

    # Initialize c
    c = torch.mean(torch.cat([x.detach(), y.detach()], dim=0), dim=0).requires_grad_(True)

    # Optimizer
    optimizer = optim.Adam([c], lr=learning_rate)

    # Iterative Process
    for iteration in range(max_iterations):
        previous_loss = float('inf')

        # Step 1: Initialization Loop
        for _ in range(max_iterations):
            optimizer.zero_grad()
            loss_init = objective_initialization(c, x, y, alpha)
            loss_init.backward()
            optimizer.step()

            # Early Stopping for Step 1
            if torch.abs(previous_loss - loss_init) < convergence_threshold:
                break
            previous_loss = loss_init

        # Step 2: Variance Minimization Loop
        for _ in range(max_iterations):
            optimizer.zero_grad()
            loss_variance = variance_objective(c, x, y, lambda_weight)
            loss_variance.backward()
            optimizer.step()

            # Early Stopping for Step 2
            if torch.abs(previous_loss - loss_variance) < convergence_threshold:
                break
            previous_loss = loss_variance

        # Convergence Check for Overall Process
        if torch.abs(loss_variance - loss_init) < convergence_threshold:
            print(f"Converged in {iteration+1} overall iterations")
            break
    
    return c
    # print(f"Final vector c: {c.detach().numpy()}")

In [4]:
# Analyze the hallucination labels
import matplotlib.pyplot as plt
from OTAlign.src.model_parts import *
import math
import torch
import numpy as np
import ot
from tqdm import tqdm

def map_original_to_tokenized(x1, x2):
    mapping = {}
    tokenized_index = 0
    original_index = 0

    while original_index < len(x2) and tokenized_index < len(x1):
        original_word = x2[original_index]
        subword_sequence = ''

        indices = []
        while tokenized_index < len(x1) and (subword_sequence != original_word):
            subword = x1[tokenized_index].lstrip('##')
            if subword_sequence + subword == original_word[:len(subword_sequence + subword)]:
                subword_sequence += subword
                indices.append(tokenized_index)
                tokenized_index += 1
            else:
                break

        if not indices:
            tokenized_index += 1
        else:
            mapping[original_index] = indices

        original_index += 1

    return mapping


def get_word_embeddings(sentence, model, tokenizer):
    # Tokenize the sentence and get corresponding IDs
    inputs = tokenizer(sentence, return_tensors="pt")
    tokens = inputs.tokens()[1:-1]
    mapping = map_original_to_tokenized(tokens, sentence.split())

    # Get BERT embeddings for each token
    with torch.no_grad():
        embeddings = model.encode(sentence, output_value='token_embeddings')[1:-1]

    word_embeddings = []
    for original_idx in range(len(sentence.split())):
        subword_indices = mapping[original_idx]
        subword_embeddings = [embeddings[idx] for idx in subword_indices]
        averaged_embedding = torch.mean(torch.stack(subword_embeddings), dim=0)
        word_embeddings.append(averaged_embedding)

    assert len(word_embeddings) == len(sentence.split(" "))
    return word_embeddings

# word_level_emb = get_word_embeddings("This is an example sentence , which is meant to be encoded . Subwords are not a problem for this model .", model, model.tokenizer)

def find_avg_vector_cosine_similarity(vectors, vectors2=None):
    def cosine_distance(x, vectors):
        # Normalize the vectors
        x_norm = x / x.norm(dim=1, keepdim=True)
        vectors_norm = vectors / vectors.norm(dim=1, keepdim=True)
        # Compute cosine similarity and convert to distance
        cosine_sim = torch.mm(vectors_norm, x_norm.t())
        return 1 - cosine_sim

    def objective_function_uniform_cosine(x, vectors):
        distances = cosine_distance(x, vectors).squeeze()
        return torch.var(distances)

    def objective_function_sum_cosine(x, vectors):
        distances = cosine_distance(x, vectors).squeeze()
        return distances.mean()

    def objective_function_sum_uniform_cosine(x, vectors):
        distances = cosine_distance(x, vectors).squeeze()
        return distances.mean() + torch.var(distances)

    combined_vectors = torch.cat([vectors, vectors2], dim=0)
    x = torch.mean(combined_vectors.detach(), dim=0, keepdim=True).clone().detach().requires_grad_(True)

    optimizer = torch.optim.Adam([x], lr=0.01)
    for _ in range(3000):  # Number of iterations
        optimizer.zero_grad()
        loss = objective_function_sum_uniform_cosine(x, combined_vectors.detach())
        loss.backward()
        optimizer.step()

    optimizer = torch.optim.Adam([x], lr=0.01)
    for _ in range(3000):  # Number of iterations
        optimizer.zero_grad()
        loss = objective_function_uniform_cosine(x, combined_vectors.detach())
        loss.backward()
        optimizer.step()

    return x.detach().squeeze(0)

def find_avg_vector_cuda(vectors, vectors2=None):
    def objective_function_uniform_l2(x, vectors, ):
        distances = torch.norm(vectors - x, dim=1)
        return torch.var(distances)
        
    def objective_function_sum_l2(x, vectors, ):
        distances = torch.norm(vectors - x, dim=1)
        return distances.mean()
    
    def objective_function_sum_uniform_l2(x, vectors, ):
        distances = torch.norm(vectors - x, dim=1)
        return distances.mean() + torch.var(distances)
    
    
    combined_vectors = torch.cat([vectors, vectors2], dim=0)
    x = torch.mean(combined_vectors.detach(), dim=0, keepdim=True).clone().detach().requires_grad_(True)
    
    optimizer = torch.optim.Adam([x], lr=0.01)
    for _ in range(3000):  # Number of iterations
        optimizer.zero_grad()
        loss = objective_function_sum_uniform_l2(x, combined_vectors.detach())
        loss.backward()
        optimizer.step()
        
    optimizer = torch.optim.Adam([x], lr=0.01)
    for _ in range(3000):  # Number of iterations
        optimizer.zero_grad()
        loss = objective_function_uniform_l2(x, combined_vectors.detach())
        loss.backward()
        optimizer.step()
    
    return x.detach().squeeze(0)

def get_predicted_alignment(P):
    # Apply argmax along each row
    alignment_indices = np.argmax(P, axis=1)
    null_idx = P.shape[1] - 1
    # Generate alignment string
    alignment_string = ' '.join(f'{j+1}-{i+1}' for i, j in enumerate(alignment_indices) if j != null_idx and sum(P[i]) > 0)
    return alignment_string

def rank_align_pairs(align):
    align_pairs = align.split()
    align_pairs = [pair.split('-') for pair in align_pairs if 'p' not in pair]
    align_pairs = [(int(pair[0]), int(pair[1])) for pair in align_pairs]
    align_pairs = sorted(align_pairs, key=lambda x: x[0])
    return align_pairs 

m = 1
epsilon = 0.01
numItermax = 2000
stopThr = 1e-6

def get_ot_align(src_rep, mt_rep, m=m, epsilon=epsilon, numItermax=numItermax, stopThr=stopThr, w1=None, w2=None):
    def convert_to_numpy(s1_weights, s2_weights, C):
        if torch.is_tensor(s1_weights):
            s1_weights = s1_weights.to('cpu').numpy()
            s2_weights = s2_weights.to('cpu').numpy()
        if torch.is_tensor(C):
            C = C.to('cpu').numpy()
        return s1_weights, s2_weights, C
    
    
    C = compute_distance_matrix_l2(mt_rep, src_rep, 0.0)
    # C = compute_distance_matrix_cosine(mt_rep, src_rep, 0.0)
    
    s1_weights, s2_weights = compute_weights_uniform(mt_rep, src_rep)
    # s1_weights, s2_weights = compute_weights_norm(mt_rep, src_rep)
    
    s2_weights[-1] = 1.0
    s1_weights = s1_weights / s1_weights.sum()
    s2_weights = s2_weights / s2_weights.sum()
    
    
    # s2_weights[:-1].sum() * 0.5
    s1_weights, s2_weights, C = convert_to_numpy(s1_weights, s2_weights, C)

    m = np.min((np.sum(s1_weights), np.sum(s2_weights))) * m
    # P = ot.partial.entropic_partial_wasserstein(s1_weights, s2_weights, C, reg=epsilon, m=m, stopThr=stopThr, numItermax=numItermax)
    P = ot.emd(s1_weights, s2_weights, C)
    # P = min_max_scaling(P)
    # P = ot.unbalanced.sinkhorn_stabilized_unbalanced(s1_weights, s2_weights, C, reg=epsilon, reg_m=(0.1, 1), stopThr=stopThr, numItermax=numItermax)
    # P = min_max_scaling(P)
    
    # P = ot.emd(s1_weights / s1_weights.sum(), s2_weights / s2_weights.sum(), C)
    tmp = 0
    
    return P, C

num_samples = len(open('data/deen/alignmentDeEn.talp', 'r').readlines())

results = []
for test_line_id in tqdm(range(num_samples)):
    # test_line_id = 5
    gold_alignment = open('data/deen/alignmentDeEn.talp', 'r').readlines()[test_line_id]
    src_text = open('data/deen/de', 'r', encoding='latin-1').readlines()[test_line_id].strip()
    tgt_text = open('data/deen/en', 'r', encoding='latin-1').readlines()[test_line_id].strip()

    # print(src_text)
    # print(tgt_text)

    src_emb = get_word_embeddings(src_text, model, model.tokenizer)
    tgt_emb = get_word_embeddings(tgt_text, model, model.tokenizer)


    src = src_text
    mt = tgt_text

    src_rep = torch.stack(src_emb, dim=0).cuda('cuda:1')
    mt_rep = torch.stack(tgt_emb, dim=0).cuda('cuda:1')

    avg_src_rep = find_avg_vector_cuda(src_rep, mt_rep)
    # avg_src_rep = find_avg_vector_cosine_similarity(src_rep, mt_rep)
    
    avg_src_rep = avg_src_rep.to(mt_rep)
    src_rep = torch.cat([src_rep, avg_src_rep.unsqueeze(0)], dim=0)

    P, C = get_ot_align(src_rep, mt_rep)


    null_idx = P.shape[1] - 1
    # clone numpy array P
    threshod = 1 / P.shape[1] * 0.5
    P_copy = P.copy()
    P_copy[P_copy<threshod] = 0

    # P_copy[P_copy<0.0001] = null_idx
    predicted_align = get_predicted_alignment(P_copy)
    # print(rank_align_pairs(gold_alignment))
    # print(rank_align_pairs(predicted_align))
    f1_aer = evaluate(predicted_align, gold_alignment.strip())
    results.append(f1_aer)
    # print(f1_aer)
    if test_line_id % 10 == 0:
        print("Average F1 Score:", np.mean([result['F1 Score'] for result in results]))
        print("Average AER:", np.mean([result['AER'] for result in results]))
    

    DEBUG = False
    if DEBUG:
        
        x = src_text.split()
        x.append('NULL')
        y = tgt_text.split()

        fig_width = max(16, len(x))  # Doubling the width to accommodate both heatmaps
        fig_height = max(6, len(y) * 0.5)  # 0.5 inch per row
        
        plt.clf()
        
        plt.figure(figsize=(fig_width, fig_height))

        # Heatmap for P
        plt.subplot(1, 2, 1)  # 1 row, 2 columns, first subplot
        plt.imshow(P, cmap='hot', interpolation='nearest')
        plt.colorbar()
        plt.xticks(ticks=np.arange(len(x)), labels=x, rotation=90)
        plt.yticks(ticks=np.arange(len(y)), labels=y)
        plt.title("Heatmap of P")
        plt.grid(which='major', axis='both', linestyle='-', color='white', linewidth=0.5)
        plt.gca().set_xticks(np.arange(-0.5, len(x), 1), minor=True)
        plt.gca().set_yticks(np.arange(-0.5, len(y), 1), minor=True)

        # Heatmap for C
        plt.subplot(1, 2, 2)  # 1 row, 2 columns, second subplot
        plt.imshow(C, cmap='hot', interpolation='nearest')
        plt.colorbar()
        plt.xticks(ticks=np.arange(len(x)), labels=x, rotation=90)
        plt.yticks(ticks=np.arange(len(y)), labels=y)
        plt.title("Heatmap of C")
        plt.grid(which='major', axis='both', linestyle='-', color='white', linewidth=0.5)
        plt.gca().set_xticks(np.arange(-0.5, len(x), 1), minor=True)
        plt.gca().set_yticks(np.arange(-0.5, len(y), 1), minor=True)

        plt.tight_layout()
        plt.show()
        tmp = 0

# get average results
print("Average F1 Score:", np.mean([result['F1 Score'] for result in results]))
print("Average AER:", np.mean([result['AER'] for result in results]))
        



  0%|          | 0/509 [00:00<?, ?it/s]

  0%|          | 1/509 [00:04<34:52,  4.12s/it]

Average F1 Score: 0.761904761904762
Average AER: 0.23809523809523814


  2%|▏         | 11/509 [00:39<29:51,  3.60s/it]

Average F1 Score: 0.6750436221444052
Average AER: 0.32471118543710675


  4%|▍         | 21/509 [01:16<29:10,  3.59s/it]

Average F1 Score: 0.7141522594270203
Average AER: 0.2856560286437653


  6%|▌         | 31/509 [01:48<25:42,  3.23s/it]

Average F1 Score: 0.7504492663572646
Average AER: 0.24968703466244838


  8%|▊         | 41/509 [02:23<24:53,  3.19s/it]

Average F1 Score: 0.7317034382412538
Average AER: 0.26857455798104646


 10%|█         | 51/509 [02:57<25:11,  3.30s/it]

Average F1 Score: 0.7390502163334972
Average AER: 0.2611827373143802


 12%|█▏        | 61/509 [03:31<25:58,  3.48s/it]

Average F1 Score: 0.7369637938901203
Average AER: 0.2631198227997992


 14%|█▍        | 71/509 [04:06<25:45,  3.53s/it]

Average F1 Score: 0.7435583495800856
Average AER: 0.2564440658623891


 16%|█▌        | 81/509 [04:41<25:19,  3.55s/it]

Average F1 Score: 0.7429960847244103
Average AER: 0.25714448147121427


 18%|█▊        | 91/509 [05:17<25:18,  3.63s/it]

Average F1 Score: 0.7378728611816092
Average AER: 0.2622056945734457


 20%|█▉        | 101/509 [05:51<22:28,  3.31s/it]

Average F1 Score: 0.7363340400357997
Average AER: 0.26381108828542227


 22%|██▏       | 111/509 [06:27<23:25,  3.53s/it]

Average F1 Score: 0.7439736951296151
Average AER: 0.256141943003476


 24%|██▍       | 121/509 [07:02<23:07,  3.58s/it]

Average F1 Score: 0.7438418751121871
Average AER: 0.2561611285999849


 26%|██▌       | 131/509 [07:38<22:08,  3.52s/it]

Average F1 Score: 0.7493122647424545
Average AER: 0.25069050967871215


 28%|██▊       | 141/509 [08:11<20:40,  3.37s/it]

Average F1 Score: 0.7501562535669754
Average AER: 0.24993034903739614


 30%|██▉       | 151/509 [08:41<17:03,  2.86s/it]

Average F1 Score: 0.7498642347298775
Average AER: 0.2502978673853552


 32%|███▏      | 161/509 [09:14<20:33,  3.54s/it]

Average F1 Score: 0.7530275271583695
Average AER: 0.24708105762422525


 34%|███▎      | 171/509 [09:47<15:34,  2.77s/it]

Average F1 Score: 0.75216406761542
Average AER: 0.24791205931377772


 36%|███▌      | 181/509 [10:22<19:19,  3.54s/it]

Average F1 Score: 0.7534393252139334
Average AER: 0.24666449807303922


 38%|███▊      | 191/509 [10:58<19:40,  3.71s/it]

Average F1 Score: 0.7537294933235837
Average AER: 0.2463784438708151


 39%|███▉      | 201/509 [11:33<18:01,  3.51s/it]

Average F1 Score: 0.7556273980645699
Average AER: 0.24449543653799358


 41%|████▏     | 211/509 [12:07<17:21,  3.49s/it]

Average F1 Score: 0.7497618346923692
Average AER: 0.25035767528363495


 43%|████▎     | 221/509 [12:40<15:22,  3.20s/it]

Average F1 Score: 0.7504297007174776
Average AER: 0.24965614777890216


 45%|████▌     | 231/509 [13:15<15:47,  3.41s/it]

Average F1 Score: 0.7526608300774748
Average AER: 0.24745752871993298


 47%|████▋     | 241/509 [13:49<14:53,  3.33s/it]

Average F1 Score: 0.7501199928998645
Average AER: 0.24995498024106066


 49%|████▉     | 251/509 [14:21<14:42,  3.42s/it]

Average F1 Score: 0.751800945322013
Average AER: 0.2482928709695129


 51%|█████▏    | 261/509 [14:53<13:01,  3.15s/it]

Average F1 Score: 0.7541807152083706
Average AER: 0.2459112936310295


 53%|█████▎    | 271/509 [15:29<13:49,  3.49s/it]

Average F1 Score: 0.7541836124809074
Average AER: 0.2459141170574789


 55%|█████▌    | 281/509 [16:05<13:47,  3.63s/it]

Average F1 Score: 0.7550469253402042
Average AER: 0.24505658567538702


 57%|█████▋    | 291/509 [16:41<12:40,  3.49s/it]

Average F1 Score: 0.7559388534273581
Average AER: 0.24416525078174747


 59%|█████▉    | 301/509 [17:17<11:50,  3.41s/it]

Average F1 Score: 0.7571868637333801
Average AER: 0.2429239951691582


 61%|██████    | 311/509 [17:53<11:50,  3.59s/it]

Average F1 Score: 0.7569393438260651
Average AER: 0.243167587360804


 63%|██████▎   | 321/509 [18:29<10:36,  3.38s/it]

Average F1 Score: 0.7551131170800789
Average AER: 0.24498517631384975


 65%|██████▌   | 331/509 [19:04<10:44,  3.62s/it]

Average F1 Score: 0.7567663341949274
Average AER: 0.24333695064975272


 67%|██████▋   | 341/509 [19:37<08:49,  3.15s/it]

Average F1 Score: 0.7583466885471525
Average AER: 0.24181709707829036


 69%|██████▉   | 351/509 [20:10<08:49,  3.35s/it]

Average F1 Score: 0.7589984459931421
Average AER: 0.24114206618298079


 71%|███████   | 361/509 [20:45<08:30,  3.45s/it]

Average F1 Score: 0.7604929556035508
Average AER: 0.23962252900488246


 73%|███████▎  | 371/509 [21:20<07:53,  3.43s/it]

Average F1 Score: 0.7578807208915004
Average AER: 0.2421738236986214


 75%|███████▍  | 381/509 [21:54<06:55,  3.25s/it]

Average F1 Score: 0.7585008771101386
Average AER: 0.24145982982822475


 77%|███████▋  | 391/509 [22:27<06:56,  3.53s/it]

Average F1 Score: 0.7584854585490062
Average AER: 0.24148713988586518


 79%|███████▉  | 401/509 [23:03<06:13,  3.46s/it]

Average F1 Score: 0.7582720870160722
Average AER: 0.24164577773657742


 81%|████████  | 411/509 [23:37<05:38,  3.45s/it]

Average F1 Score: 0.7585320424735921
Average AER: 0.24138314168428968


 83%|████████▎ | 421/509 [24:09<04:33,  3.11s/it]

Average F1 Score: 0.7590293213959203
Average AER: 0.2408855841405005


 85%|████████▍ | 431/509 [24:43<04:13,  3.25s/it]

Average F1 Score: 0.7612607586339258
Average AER: 0.23863711207910485


 87%|████████▋ | 441/509 [25:16<03:39,  3.22s/it]

Average F1 Score: 0.7619355906418935
Average AER: 0.23796049250674264


 89%|████████▊ | 451/509 [25:48<02:55,  3.03s/it]

Average F1 Score: 0.763408444592685
Average AER: 0.23648616523147664


 91%|█████████ | 461/509 [26:22<02:43,  3.41s/it]

Average F1 Score: 0.76359549539454
Average AER: 0.23630291382843538


 93%|█████████▎| 471/509 [26:56<02:00,  3.17s/it]

Average F1 Score: 0.7638096388398312
Average AER: 0.2360842910483856


 94%|█████████▍| 481/509 [27:29<01:41,  3.64s/it]

Average F1 Score: 0.7641100922743204
Average AER: 0.23577991977694326


 96%|█████████▋| 491/509 [28:00<00:57,  3.22s/it]

Average F1 Score: 0.7643437955973554
Average AER: 0.23554612716032264


 98%|█████████▊| 501/509 [28:30<00:22,  2.83s/it]

Average F1 Score: 0.7643329465183583
Average AER: 0.23557312538163735


100%|█████████▉| 508/509 [28:54<00:03,  3.41s/it]


AssertionError: 

In [6]:
print("Average F1 Score:", np.mean([result['F1 Score'] for result in results]))
print("Average AER:", np.mean([result['AER'] for result in results]))
        

Average F1 Score: 0.7645508832945358
Average AER: 0.23536543027460174


In [207]:
def get_predicted_alignment(P):
    # Apply argmax along each row
    alignment_indices = np.argmax(P, axis=1)
    null_idx = P.shape[1] - 1
    # Generate alignment string
    alignment_string = ' '.join(f'{j+1}-{i+1}' for i, j in enumerate(alignment_indices) if j != null_idx and sum(P[i]) > 0)
    return alignment_string

def rank_align_pairs(align):
    align_pairs = align.split()
    align_pairs = [pair.split('-') for pair in align_pairs if 'p' not in pair]
    align_pairs = [(int(pair[0]), int(pair[1])) for pair in align_pairs]
    align_pairs = sorted(align_pairs, key=lambda x: x[0])
    return align_pairs

null_idx = P.shape[1] - 1
# clone numpy array P
threshod = 1 / P.shape[1] * 0.5
P_copy = P.copy()
P_copy[P_copy<threshod] = 0

# P_copy[P_copy<0.0001] = null_idx
predicted_align = get_predicted_alignment(P_copy)
print(rank_align_pairs(gold_alignment))
print(rank_align_pairs(predicted_align))
evaluate(predicted_align, gold_alignment.strip())

[(13, 13), (14, 13), (15, 14), (16, 8), (17, 10), (17, 9), (17, 12), (17, 11), (18, 15)]
[(3, 1), (4, 4), (5, 10), (6, 7), (7, 9), (8, 2), (9, 8), (10, 6), (11, 5), (12, 12), (13, 13), (15, 14), (17, 11), (18, 15)]
F1 Score: 0.34782608695652173
AER: 0.5652173913043479


In [211]:
P_copy[2]



array([0.02807018, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.03859649])