In [None]:
# copyright: https://github.com/cisnlp/MEXA

In [None]:
import os 

# Expose only GPU 0 to this process; set here, before the later cells import torch/transformers.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0'})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Access token used to authenticate the checkpoint download.
# Read it from the environment instead of committing a secret to the notebook.
# When HF_TOKEN is unset this is None, and the Hugging Face libraries then fall
# back to whatever credentials are already stored locally (if any).
token = os.environ.get("HF_TOKEN")

model_path = "meta-llama/Llama-3.1-8B"

# `token=` is the current keyword; `use_auth_token=` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map='auto',
    cache_dir='../transformers_cache/',
    token=token,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, token=token)

# Reuse EOS as the padding token so that `padding=True` tokenisation has a pad id
# (presumably the checkpoint does not define one -- confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
import torch
import numpy as np

def weighted_embeddings(layer, attention_mask, device='cuda'):
    """Pool token embeddings into sentence embeddings with position weights.

    Token at (1-based) position ``p`` gets weight ``p * attention_mask[p]``, so
    masked-out tokens contribute nothing and later tokens count more. The
    weighted sum over the sequence axis is divided by the sum of the weights.

    Args:
        layer: Hidden states with the sequence axis at dim 1
            (expected layout ``(batch, seq_len, hidden)``).
        attention_mask: Mask broadcastable to ``(batch, seq_len)``; non-zero
            entries mark the tokens to include.
        device: Kept for backward compatibility. The computation now runs on
            ``layer.device`` so it also works on CPU and for layers that live
            on a different device than the default.

    Returns:
        NumPy array of the pooled embeddings. ``squeeze()`` drops every size-1
        axis, so a single-sequence batch yields a 1-D vector.
        A row whose mask is all zeros yields NaNs (division by zero weight).
    """
    # Work on the device that actually holds the hidden states instead of a
    # hard-coded device string, which breaks on CPU or sharded models.
    target_device = layer.device
    attention_mask = attention_mask.to(target_device)

    # 1-based position indices, shaped (1, seq_len) for broadcasting over the batch.
    positions = torch.arange(start=1, end=layer.shape[1] + 1, device=target_device).unsqueeze(0)
    position_weights = attention_mask * positions

    # Weighted sum over the sequence axis, normalised by the total weight per row.
    weighted_sum = torch.sum(layer * position_weights.unsqueeze(-1), dim=1)
    weight_total = torch.sum(position_weights, dim=-1).unsqueeze(-1)

    sentence_embeddings = weighted_sum / weight_total
    return sentence_embeddings.squeeze().cpu().numpy()

In [None]:
def get_embedding_layers(text,device='cuda'):
    """Run the model on ``text`` and pool every hidden-state layer two ways.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input passed straight to the tokenizer (a string, or a batch of
            strings since ``padding=True`` is used).
        device: Device the tokenised inputs and masks are moved to.

    Returns:
        ``(sentence_embeddings_weighted, sentence_embeddings_last_token)``:
        two lists with one NumPy array per element of the model's
        ``hidden_states`` output. The first uses position-weighted pooling over
        all attended tokens, the second keeps only the last attended token.
    """
    # Tokenize the input text
    tokens = tokenizer(text, return_tensors='pt', padding=True).to(device)

    # Both masks are the same for every layer, so build them once up front.
    attention_mask = tokens.attention_mask.to(device)

    # One-hot mask on the last *attended* token of each row. Taking column -1
    # would pick a padding token for the shorter sequences of a padded batch;
    # the arg-max of (mask * position) finds the last real token regardless of
    # padding side. For a single unpadded sequence this is still column -1.
    positions = torch.arange(1, attention_mask.shape[1] + 1, device=attention_mask.device)
    last_token_index = torch.argmax(attention_mask * positions, dim=1, keepdim=True)
    attention_mask_last = torch.zeros_like(attention_mask)
    attention_mask_last.scatter_(1, last_token_index, 1)

    sentence_embeddings_weighted = []
    sentence_embeddings_last_token = []
    with torch.no_grad():

        # Forward pass through the model
        hidden_state_layers = model(**tokens, output_hidden_states=True)["hidden_states"]

        for layer in hidden_state_layers:
            sentence_embeddings_weighted.append(
                weighted_embeddings(layer, attention_mask, device)
            )
            sentence_embeddings_last_token.append(
                weighted_embeddings(layer, attention_mask_last, device)
            )

    return sentence_embeddings_weighted, sentence_embeddings_last_token

In [None]:
import os
import json

# Directory where the files are located
directory = '../datasets/parallel/code_snippets/'

# Maps language name -> list of {'id', 'text'} records
result_dict = {}


all_langs = ['C', 'C++', 'C#', 'Java', 'Javascript', 'PHP', 'Python']

# The file list of the first language is reused for every language, so each
# language directory is expected to contain the same file names.
# os.listdir returns entries in arbitrary order; sort them so that ids and the
# later "first num_sents" selection are reproducible across runs and machines.
all_filenames = sorted(os.listdir(os.path.join(directory, all_langs[0])))

for lang in all_langs:
    sentences = []
    for idx, file_name in enumerate(all_filenames):

        file_path = os.path.join(directory, lang, file_name)
        with open(file_path, 'r', encoding='utf-8') as infile:
            # Each line is one JSON object; every line of a file shares the file's id.
            for line in infile:
                entry = json.loads(line)
                sentences.append({'id': idx + 1, 'text': entry['snippet']})

    result_dict[lang] = sentences

In [None]:
# Collect the 'comment' field of the C files as the 'eng_Latn' entry.
sentences = []
for idx, file_name in enumerate(all_filenames):

    file_path = os.path.join(directory, 'C', file_name)
    with open(file_path, 'r', encoding='utf-8') as infile:
        # Each line is one JSON object; every line of a file shares the file's id.
        for line in infile:
            entry = json.loads(line)
            sentences.append({'id': idx + 1, 'text': entry['comment']})

# Register the list once, after all files are read. Doing this inside the loop
# re-assigned it on every file and left the key undefined for an empty listing.
result_dict['eng_Latn'] = sentences

In [None]:
from tqdm import tqdm
import json
import pickle

embeddings_dict = {}

# One output folder per model; '/' in the model id is replaced so it is a valid folder name.
embd_dir = f"./embd-{model_path.replace('/', '-')}"
os.makedirs(embd_dir, exist_ok=True)

# For every language: embed each text, group the results by layer, and pickle
# the per-layer lists to <embd_dir>/<language>.pkl.
for language, texts in tqdm(result_dict.items()):
    embeddings_dict = {}

    for text in texts:
        embds_weighted, embds_last_token = get_embedding_layers(text['text'])

        for layer, (weighted, last_token) in enumerate(zip(embds_weighted, embds_last_token)):
            record = {'id': text['id'], 'embd_wighted': weighted, 'embd_last_token': last_token}
            embeddings_dict.setdefault(layer, []).append(record)

    with open(f"{embd_dir}/{language}.pkl", "wb") as pickle_file:
        pickle.dump(embeddings_dict, pickle_file)


In [None]:
import os
import json
import pickle
import numpy as np
from tqdm import tqdm
from scipy.spatial.distance import cosine
import argparse

def cosine_similarity(array1, array2):
    """Return the cosine similarity of two vectors, i.e. 1 minus SciPy's cosine distance."""
    return 1 - cosine(array1, array2)

def mexa(matrix):
    """Fraction of diagonal entries that strictly dominate their row and column.

    Entry ``matrix[i][i]`` counts when it is strictly greater than every other
    element of row ``i`` and every other element of column ``i``.

    Args:
        matrix: Square 2-D array-like of similarity scores (anything
            ``np.asarray`` accepts).

    Returns:
        ``count / n`` as a float, where ``n`` is the number of rows.
        An empty matrix returns ``0.0`` instead of dividing by zero, and a 1x1
        matrix returns ``1.0`` (there are no competitors to beat).
        Rows/columns containing NaN off-diagonal values never count.
    """
    scores = np.asarray(matrix, dtype=float)
    n = len(scores)  # number of rows
    if n == 0:
        return 0.0

    idx = np.arange(n)
    diagonal = scores[idx, idx]

    # Mask the diagonal with -inf so the row/column maxima cover only the
    # *other* elements; with no other elements the maximum stays -inf.
    off_diagonal = scores.copy()
    off_diagonal[idx, idx] = -np.inf

    row_max = off_diagonal.max(axis=1)
    col_max = off_diagonal.max(axis=0)[:n]

    count = int(np.count_nonzero((diagonal > row_max) & (diagonal > col_max)))

    # Normalized count
    return count / n

def compute_distance(lang, embedding_type='embd_weighted', num_sents=100):
    """Compute the per-layer MEXA alignment between ``lang`` and the pivot.

    Reads two module-level names: ``embedding_path`` (folder holding the
    ``<lang>.pkl`` files) and ``pivot_embd`` (the already-loaded pivot
    embeddings, indexed by layer).

    Args:
        lang: Name of the pickle (without ``.pkl``) to compare with the pivot.
        embedding_type: Key of the embedding inside each stored record.
            NOTE: the extraction cell of this notebook stores the weighted
            embedding under ``'embd_wighted'`` (sic), which is what the driver
            cell passes explicitly; the default here does not match that key.
        num_sents: Upper bound on the number of sentences compared per layer.

    Returns:
        Dict mapping each layer key of the loaded pickle to its ``mexa`` score.

    Raises:
        KeyError: If ``embedding_type`` is not present in the stored records.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # produced by this notebook or another trusted source.
    with open(os.path.join(embedding_path, f"{lang}.pkl"), "rb") as pickle_file:
        lang_embd = pickle.load(pickle_file)

    def _vectors(entries):
        # Pull the requested embedding out of each record once, with a clear error on a bad key.
        try:
            return [entry[embedding_type] for entry in entries]
        except KeyError:
            raise KeyError(
                f"embedding_type {embedding_type!r} is not a key of the stored embedding records"
            ) from None

    alignments = {}
    for layer, lang_entries in lang_embd.items():
        pivot_embd_layer = pivot_embd[layer][:num_sents]
        lang_embd_layer = lang_entries[:num_sents]

        # Compare only as many sentences as both sides provide. Truncating both
        # lists keeps the loops inside the matrix bounds when the lengths differ.
        num_actual_sentences = min(len(pivot_embd_layer), len(lang_embd_layer))
        pivot_vectors = _vectors(pivot_embd_layer[:num_actual_sentences])
        lang_vectors = _vectors(lang_embd_layer[:num_actual_sentences])

        # similarities[p, l] = cosine similarity of pivot sentence p and lang sentence l
        similarities = np.zeros((num_actual_sentences, num_actual_sentences))
        for p_id, pivot_vector in enumerate(pivot_vectors):
            for l_id, lang_vector in enumerate(lang_vectors):
                similarities[p_id, l_id] = cosine_similarity(pivot_vector, lang_vector)

        alignments[layer] = mexa(similarities)

    return alignments

In [None]:

embedding_path = f"./embd-{model_path.replace('/', '-')}/"

# Key of the stored embedding to compare (spelling matches the key written by the extraction cell).
embedding_type = 'embd_wighted'

num_sents = 100


all_langs = ['C', 'C++', 'C#', 'Java', 'Javascript', 'PHP', 'Python', 'eng_Latn']

# The set of embedding files is the same for every pivot, so list it once.
# Sorted because os.listdir order is arbitrary.
languages = sorted(
    filename[:-len('.pkl')] for filename in os.listdir(embedding_path) if filename.endswith('.pkl')
)

for latent_lang in all_langs:

    # One result folder per pivot language; create it once, not once per output file.
    save_path = f"mexa-{latent_lang}-{model_path.replace('/', '-')}"
    os.makedirs(save_path, exist_ok=True)

    # Load the pivot embeddings. compute_distance reads the module-level
    # `pivot_embd`, so it must be rebound here before the inner loop runs.
    # NOTE(security): pickle.load can execute arbitrary code; load trusted files only.
    with open(os.path.join(embedding_path, f'{latent_lang}.pkl'), "rb") as pickle_file:
        pivot_embd = pickle.load(pickle_file)

    for lang in tqdm(languages):
        alignment_lang = compute_distance(lang, embedding_type=embedding_type, num_sents=num_sents)
        save_filepath = os.path.join(save_path, f"{lang}.json")

        with open(save_filepath, "w") as json_file:
            json.dump(alignment_lang, json_file)

In [None]:
import os
import json
import matplotlib.pyplot as plt


# Folder that receives one PNG per pivot language.
figures_dir = f"./mexa-{model_path.replace('/', '-')}-figures"
os.makedirs(figures_dir, exist_ok=True)

# Languages used both as pivots (one figure each) and as curves inside each figure.
all_langs = ['C', 'C++', 'C#', 'Java', 'Javascript', 'PHP', 'Python',]

for latent_lang in all_langs:
    input_path = f"mexa-{latent_lang}-{model_path.replace('/', '-')}"

    # Nothing to draw when the result folder of this pivot is missing.
    if not os.path.exists(input_path):
        print(f"Folder {input_path} does not exist.")
        continue

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.set_title(f"Data in Folder: {input_path}")
    ax.set_xlabel("Indices")
    ax.set_ylabel("Values")
    ax.set_ylim(0, 1)

    # One curve per language, read from <input_path>/<lang>.json.
    for lang in all_langs:
        json_file_path = os.path.join(input_path, f"{lang}.json")

        if not os.path.exists(json_file_path):
            print(f"File {json_file_path} does not exist.")
            continue

        with open(json_file_path, 'r') as file:
            data = json.load(file)

        # JSON keys are strings; convert them to int for the x axis.
        x = [int(key) for key in data]
        y = list(data.values())

        ax.plot(x, y, label=lang)

    ax.legend(title="Languages")
    ax.grid(True)

    # Write the figure and release it so figures do not pile up across iterations.
    fig.savefig(f"{figures_dir}/{input_path}_plot.png")
    plt.close(fig)

    print(f"Plot saved for folder {input_path}.")

In [None]:
# Combine the per-pivot-language plots into a single figure (one subplot per pivot language)

In [None]:
import os
import json
import matplotlib.pyplot as plt

# Languages used both as pivots (one subplot each) and as curves inside each subplot.
all_langs = ['C', 'C++', 'C#', 'Java', 'Javascript', 'PHP', 'Python']

# One tall figure with a stacked subplot per pivot language.
n_panels = len(all_langs)
fig, axes = plt.subplots(nrows=n_panels, ncols=1, figsize=(10, 6 * n_panels), sharex=True)
fig.tight_layout(pad=4.0)
fig.suptitle("All Language Data Plots", fontsize=16)

# plt.subplots returns a bare Axes (not an array) when there is a single panel.
panel_axes = axes if n_panels > 1 else [axes]

for ax, latent_lang in zip(panel_axes, all_langs):
    input_path = f"mexa-{latent_lang}-{model_path.replace('/', '-')}"

    # Missing result folder: label the panel accordingly and hide its axes.
    if not os.path.exists(input_path):
        print(f"Folder {input_path} does not exist.")
        ax.set_title(f"Folder {input_path} does not exist.")
        ax.axis('off')
        continue

    ax.set_title(f"Data in Folder: {input_path}")
    ax.set_xlabel("Indices")
    ax.set_ylabel("Values")
    ax.set_ylim(0, 1)

    # One curve per language, read from <input_path>/<lang>.json.
    for lang in all_langs:
        json_file_path = os.path.join(input_path, f"{lang}.json")

        if not os.path.exists(json_file_path):
            print(f"File {json_file_path} does not exist.")
            continue

        with open(json_file_path, 'r') as file:
            data = json.load(file)

        # JSON keys are strings; convert them to int for the x axis.
        x = [int(key) for key in data]
        y = list(data.values())

        ax.plot(x, y, label=lang)

    ax.legend(title="Languages")
    ax.grid(True)

# Save the combined figure
figures_dir = f"./mexa-{model_path.replace('/', '-')}-figures"
os.makedirs(figures_dir, exist_ok=True)
fig.savefig(f"{figures_dir}/all_languages_plot.png")
plt.close(fig)

print("Combined plot saved as all_languages_plot.png.")


In [None]:
import os
import json
import matplotlib.pyplot as plt

# Languages used both as pivots (one curve each) and as the set averaged over.
all_langs = ['C#', 'Java', 'C', 'C++', 'Javascript', 'PHP', 'Python']

# Make sure the output folder exists even when this cell is run on its own.
figures_dir = f"./mexa-{model_path.replace('/', '-')}-figures"
output_file = f"{figures_dir}/mexa-{model_path.replace('/', '-')}.pdf"
os.makedirs(figures_dir, exist_ok=True)

# Initialize the plot
plt.figure(figsize=(8, 6))
# plt.title("Accumulated Results for Each Latent Language")
plt.xlabel("Layers")
plt.ylabel("Alignment Score")
plt.ylim(0, 1)
plt.grid(True)

markers = ['o', 's', 'D', '^', 'v', 'p', '*', 'X']
marker_idx = 0  # Index to track the marker to use

# One averaged curve per pivot (latent) language
for latent_lang in all_langs:
    input_path = f"mexa-{latent_lang}-{model_path.replace('/', '-')}"

    # Check if the folder exists
    if not os.path.exists(input_path):
        print(f"Folder {input_path} does not exist.")
        continue

    accumulated_values = {}  # layer -> sum of scores over the files that were found
    contributions = {}       # layer -> number of files that contributed a score

    # Loop over each language file
    for lang in all_langs:
        json_file_path = os.path.join(input_path, f"{lang}.json")

        # Check if the JSON file exists
        if not os.path.exists(json_file_path):
            print(f"File {json_file_path} does not exist.")
            continue

        # Read the JSON file
        with open(json_file_path, 'r') as file:
            data = json.load(file)

        # Accumulate values (JSON keys are strings; use int layer ids)
        for key, value in data.items():
            key = int(key)
            accumulated_values[key] = accumulated_values.get(key, 0) + value
            contributions[key] = contributions.get(key, 0) + 1

    if accumulated_values:
        # Extract x and y values for plotting
        x = sorted(accumulated_values.keys())
        # Average over the files actually read; dividing by len(all_langs)
        # would understate the mean whenever a result file is missing.
        y = [accumulated_values[k] / contributions[k] for k in x]

        print(latent_lang, sum(y))

        marker = markers[marker_idx % len(markers)]
        marker_idx += 1
        # Zoom the y axis once there is something to show, then draw the curve.
        plt.ylim(0.4, 1)
        plt.plot(x, y, label=latent_lang, marker=marker, linestyle='-', markersize=4, alpha=0.7)

# Add legend
plt.legend(title="PLs", loc='lower right')
plt.savefig(output_file)
plt.close()

# Report the file that was actually written.
print(f"Accumulated results plot saved as {output_file}.")