In [None]:
# Install dependencies
# NOTE: the leading "!" is IPython/Jupyter shell-escape syntax, so this line
# only works when the cell is executed inside a notebook kernel.
!pip install -r requirements.txt

In [None]:
import pandas as pd
import requests
import sklearn
import numpy as np
import scipy

# Report the versions of the main libraries, one per line.
print("\n".join(
    f"{label} version: {module.__version__}"
    for label, module in (
        ("pandas", pd),
        ("requests", requests),
        ("scikit-learn", sklearn),
        ("numpy", np),
        ("scipy", scipy),
    )
))

In [None]:
# Download the Nepali NER data from "https://github.com/oya163/nepali-ner", split it into train and test sets (train.txt and test.txt), and place both files in DATA_BASE_PATH

# Define base path for data and output.
# Path must be imported here: this cell runs before the cell that imports
# pathlib, so without the import the assignments below raise NameError.
from pathlib import Path

BASE_PATH = Path('/base/folder/path/')
DATA_BASE_PATH = BASE_PATH / 'nepali-data'              # raw train.txt / test.txt
DATA_PREPROCESSED_PATH = DATA_BASE_PATH / 'processed'   # CSVs produced by preprocessing
OUTPUT_BASE_PATH = BASE_PATH / 'results'                # prediction and analysis output

In [None]:
import pandas as pd
from pathlib import Path
from collections import Counter

# Read an NER-tagged file, split sentences, and count total characters
def read_sentences_with_tags(input_path):
    """Read a blank-line-delimited tagged file and return its sentences.

    Each sentence is a list of stripped, non-empty lines. Runs of blank
    (whitespace-only) lines act as a single separator. The sentence count and
    the total character count (including line terminators) are printed.
    """
    with open(input_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    total_characters = 0
    sentences = []
    pending = []
    for raw in raw_lines:
        total_characters += len(raw)
        content = raw.strip()
        if content:
            pending.append(content)
            continue
        # Blank line: close the sentence collected so far, if any.
        if pending:
            sentences.append(pending)
            pending = []

    # The file may not end with a blank line.
    if pending:
        sentences.append(pending)

    print(f"Total sentences in {input_path}: {len(sentences)}")
    print(f"Total characters in {input_path}: {total_characters}")
    return sentences

# Check if a sentence contains all required NER tags
def contains_all_required_tags(sentence, required_tags):
    """Return True when every tag in *required_tags* occurs in *sentence*.

    *sentence* is a list of whitespace-separated "word tag" lines; lines with
    fewer than two fields are ignored. *required_tags* must be a set.
    """
    seen_tags = set()
    for line in sentence:
        fields = line.split()
        if len(fields) > 1:
            seen_tags.add(fields[1])
    return required_tags.issubset(seen_tags)

# Count NER tags and measure tag diversity in a sentence
def count_and_diversify_tags_in_sentence(sentence, ner_tags):
    """Return ``(tag_diversity, tag_count)`` for one sentence.

    Args:
        sentence: list of whitespace-separated "word tag" lines.
        ner_tags: collection of tags that count as entity tags.

    Returns:
        A tuple ``(tag_diversity, tag_count)`` where ``tag_diversity`` is the
        number of distinct entity tags in the sentence and ``tag_count`` is
        the number of tokens carrying an entity tag. Lines with fewer than two
        fields are ignored.
    """
    # The previous version returned the distinct-tag count twice, so the
    # "count" used as a ranking key by the caller was never an actual count.
    entity_tags = []
    for line in sentence:
        fields = line.split()
        if len(fields) > 1 and fields[1] in ner_tags:
            entity_tags.append(fields[1])
    return len(set(entity_tags)), len(entity_tags)

# Select sentences with all required tags and high tag diversity
def get_sentences_with_all_tags_and_max_diversity(sentences, required_tags, ner_tags, top_n=5):
    """Pick up to *top_n* sentences, preferring those with every required tag.

    Sentences containing all *required_tags* are ranked by tag count. If there
    are fewer than *top_n* of them, the remainder is filled from the other
    sentences ranked by (tag diversity, tag count). Returns a list of
    ``(tag_count, sentence)`` tuples.
    """
    complete, partial = [], []
    for sentence in sentences:
        diversity, count = count_and_diversify_tags_in_sentence(sentence, ner_tags)
        if contains_all_required_tags(sentence, required_tags):
            complete.append((count, sentence))
        else:
            partial.append((diversity, count, sentence))

    print(f"Number of sentences with all required tags: {len(complete)}")

    # Stable descending sorts: ties keep their original file order.
    complete = sorted(complete, key=lambda entry: entry[0], reverse=True)
    partial = sorted(partial, key=lambda entry: (entry[0], entry[1]), reverse=True)

    selected_sentences = complete[:top_n]
    shortfall = top_n - len(selected_sentences)
    if shortfall > 0:
        already_chosen = {tuple(chosen) for _, chosen in selected_sentences}
        fillers = [
            (count, candidate)
            for _, count, candidate in partial
            if tuple(candidate) not in already_chosen
        ]
        selected_sentences = selected_sentences + fillers[:shortfall]

    return selected_sentences

# Convert selected sentences to a DataFrame
def convert_sentences_to_dataframe(selected_sentences):
    """Flatten ``(count, sentence)`` tuples into a two-column DataFrame.

    Every "word tag" line becomes one row in the 'words'/'tags' columns, and
    each sentence is followed by a row of empty strings as a separator. Each
    line must split into exactly two fields, otherwise ValueError is raised.
    """
    rows = []
    for _, sentence in selected_sentences:
        for line in sentence:
            word, tag = line.split()
            rows.append((word, tag))
        rows.append(('', ''))  # Separator for sentences

    return pd.DataFrame({
        'words': [word for word, _ in rows],
        'tags': [tag for _, tag in rows],
    })

# Adjust sentence endings: replace '.' with '।', move '।' to new lines, and remove duplicates
def move_sentence_end(df):
    """Normalise sentence terminators in a 'words'/'tags' DataFrame.

    A trailing '.' on a word is replaced by the danda ('।'). A danda attached
    to a word is split off into its own row tagged 'O'. Consecutive danda rows
    are collapsed into one. Blank and NaN rows are passed through unchanged.
    Returns a new DataFrame with the same two columns.
    """
    danda = '।'
    expanded = []

    for _, row in df.iterrows():
        word, tag = row['words'], row['tags']

        # Separator rows (NaN or blank) and non-string cells are kept as-is.
        is_blank = pd.isna(word) or word.strip() == ''
        if is_blank or not isinstance(word, str):
            expanded.append([word, tag])
            continue

        if word.endswith('.'):
            word = word.rstrip('.') + danda

        if danda not in word or word == danda:
            expanded.append([word, tag])
            continue

        # The word carries an attached danda: emit the bare word (if anything
        # is left) followed by a standalone danda row.
        stem = word.replace(danda, '')
        if stem:
            expanded.append([stem, tag])
        expanded.append([danda, 'O'])

    # Drop a danda row that directly follows another danda row.
    cleaned = [
        [word, tag]
        for position, (word, tag) in enumerate(expanded)
        if not (position > 0 and word == danda and expanded[position - 1][0] == danda)
    ]

    return pd.DataFrame(cleaned, columns=['words', 'tags'])

# List of NER tags
ner_tags = {'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER'}
# Every entity tag is required: a sentence only counts as "complete" when it
# contains all six tags (copied so the two sets can diverge independently).
required_tags = set(ner_tags)

print("Processing 'nepali-data'")
all_sentences = read_sentences_with_tags(DATA_BASE_PATH / 'train.txt')

# Process data for each top_n value and apply sentence-end adjustments
# NOTE(review): 2796 is presumably the full training-set size — confirm against train.txt.
# Selection slices the ranked lists, so a top_n larger than the number of
# available sentences simply yields all of them.
top_n_values = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 750, 1000, 1500, 2000, 2796]
DATA_PREPROCESSED_PATH.mkdir(parents=True, exist_ok=True)

for top_n in top_n_values:
    print(f"\nProcessing top {top_n} sentences:")
    # The full sentence list is re-ranked on every iteration; only top_n changes.
    top_sentences = get_sentences_with_all_tags_and_max_diversity(all_sentences, required_tags, ner_tags, top_n)
    df = convert_sentences_to_dataframe(top_sentences)
    df_final = move_sentence_end(df)
    
    # Save final output
    # Written as '<top_n>_selected_sentences.csv'; empty separator rows are
    # stored as empty fields.
    output_file = DATA_PREPROCESSED_PATH / f'{top_n}_selected_sentences.csv'
    df_final.to_csv(output_file, index=False)
    print(f"Saved processed output to {output_file}")

# Process and save the test file with sentence-end adjustments
test_input_file = DATA_BASE_PATH / 'test.txt'
test_sentences = read_sentences_with_tags(test_input_file)

# convert_sentences_to_dataframe expects (count, sentence) tuples and ignores
# the first element, so a dummy 0 is supplied.
df_test = convert_sentences_to_dataframe([(0, s) for s in test_sentences])  # Count not needed here
df_test_final = move_sentence_end(df_test)

# Save final test output
test_output_file = DATA_PREPROCESSED_PATH / 'test_final.csv'
df_test_final.to_csv(test_output_file, index=False)
print(f"Processed test file saved to {test_output_file}")


In [None]:
import requests

def post_request_to_api(api_url, model_name, tokens, prompt, timeout=600):
    """POST a chat-completion style request and return the decoded JSON body.

    Args:
        api_url: endpoint URL the request is posted to.
        model_name: value sent as the "model" field.
        tokens: value sent as the "max_tokens" field.
        prompt: user message content (formatted into a string).
        timeout: seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within *timeout* seconds.
    """
    payload = {
        "model": model_name,  # Include the model name here
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant"
            },
            {
                "role": "user",
                "content": f"{prompt}",
            }
        ],
        "max_tokens": tokens
    }
    response = requests.post(
        api_url,
        json=payload,
        headers={"Content-Type": "application/json"},
        timeout=timeout,
    )
    # Fail here with the HTTP status instead of later with an opaque KeyError
    # when the caller indexes into an error payload.
    response.raise_for_status()
    return response.json()



In [None]:
# Base class for all NER experiments
class NERExperiment:
    """Common state and helpers shared by the prompt-based NER experiments.

    Holds the training examples (a mapping whose values are sentences, or
    None) and the already-formatted test sentence. Subclasses supply the
    prompt via ``generate_prompt``.
    """

    # Tag-scheme explanation embedded verbatim in every prompt.
    BIO_TAG_DESCRIPTION = """
    Each word is tagged as one of the following: B-LOC, I-LOC, B-ORG, I-ORG, B-PER, I-PER, or O.
    - 'B-' marks the start of an entity.
    - 'I-' marks the continuation of the same entity.
    - Each 'I-' tag must follow a matching 'B-' tag (e.g., 'I-LOC' after 'B-LOC').
    - Correct any cases where an 'I-' tag appears without a preceding 'B-' tag.
    """

    def __init__(self, train_examples, test_sentence):
        self.train_examples = train_examples
        self.test_sentence = test_sentence

    def format_training_examples(self):
        """Return all training sentences formatted and separated by blank lines."""
        formatted = (format_sentence(example) for example in self.train_examples.values())
        return "\n\n".join(formatted)

    def generate_prompt(self):
        """Build the prompt text; must be overridden by subclasses."""
        raise NotImplementedError("Subclasses must implement generate_prompt.")

    def get_llm_response(self, api_url, model_name, tokens, prompt):
        """Send *prompt* to the LLM API and return the first choice's message text."""
        reply = post_request_to_api(api_url, model_name, tokens, prompt)
        first_choice = reply['choices'][0]
        return first_choice['message']['content']


# Experiment 1: NER with no training examples
class Experiment1(NERExperiment):
    """Prompt variant that tags the test sentence without any training examples."""

    def __init__(self, test_sentence):
        # No training examples are stored for this variant.
        super().__init__(train_examples=None, test_sentence=test_sentence)

    def generate_prompt(self):
        """Return the prompt: context, tag description, test sentence, output format."""
        return f"""
<context>You are an expert in identifying named entities in Nepali text, including persons, locations, and organizations.</context>
<description>{self.BIO_TAG_DESCRIPTION}</description>
<task>
<test_sentence>{self.test_sentence}</test_sentence>
Analyze the sentence and ensure each word is tagged correctly. Confirm that each 'I-' tag follows a 'B-' tag.
</task>
<output_formatting>
<tagged_output>
<pair><word>WORD</word><pred_tag>TAG</pred_tag></pair>
...
</tagged_output>
Ensure no extra text outside the tags.
</output_formatting>
"""


# Experiment 2: NER with training examples
class Experiment2(NERExperiment):
    """Prompt variant that includes the formatted training examples in the prompt."""

    def __init__(self, train_examples, test_sentence):
        super().__init__(train_examples=train_examples, test_sentence=test_sentence)

    def generate_prompt(self):
        """Return the prompt: context, training examples, tag description, test sentence, output format."""
        examples_block = self.format_training_examples()
        return f"""
<context>You are an expert in identifying named entities in Nepali text. Below are training examples, followed by a test sentence.</context>
<training_examples>{examples_block}</training_examples>
<description>{self.BIO_TAG_DESCRIPTION}</description>
<task>
<test_sentence>{self.test_sentence}</test_sentence>
Analyze the sentence and ensure each word is tagged correctly. Confirm that each 'I-' tag has a preceding 'B-' tag.
</task>
<output_formatting>
<tagged_output>
<pair><word>WORD</word><pred_tag>TAG</pred_tag></pair>
...
</tagged_output>
Ensure no extra text outside the tags.
</output_formatting>
"""


In [None]:
import pandas as pd
from pathlib import Path
import re
import requests

# Allowed NER tags for validation: the six B-/I- entity tags plus 'O'.
# Any other predicted tag makes a response invalid (see are_tags_valid).
ALLOWED_TAGS = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']

# Parse LLM response to extract word-tag pairs
def parse_tagged_output(response):
    """Extract a ``{word: predicted_tag}`` dict from an LLM response.

    Only the first ``<tagged_output>`` section is read. Pairs missing either
    the ``<word>`` or the ``<pred_tag>`` element are skipped. Returns an empty
    dict (after printing a notice) when there is no ``<tagged_output>``.

    NOTE: the result is keyed by word, so a word that occurs more than once in
    the sentence keeps only its last predicted tag.
    """
    section = re.search(r"<tagged_output>(.*?)</tagged_output>", response, re.DOTALL)
    if section is None:
        print("No <tagged_output> found in the response.")
        return {}

    word_tag_dict = {}
    for pair in re.findall(r"<pair>(.*?)</pair>", section.group(1), re.DOTALL):
        word_hit = re.search(r"<word>(.*?)</word>", pair, re.DOTALL)
        tag_hit = re.search(r"<pred_tag>(.*?)</pred_tag>", pair, re.DOTALL)
        if word_hit is None or tag_hit is None:
            continue
        word_tag_dict[word_hit.group(1)] = tag_hit.group(1)
    return word_tag_dict

# Check if all tags in the output are valid
def are_tags_valid(tagged_output):
    """Return True when every value of the word->tag dict is in ALLOWED_TAGS."""
    for tag in tagged_output.values():
        if tag not in ALLOWED_TAGS:
            return False
    return True

# Send prompt to LLM with retry mechanism in case of missing or invalid output
def prompt_with_retry(api_url, model_name, tokens, prompt, max_attempts=3):
    """Query the LLM until it returns a parseable, valid tagging.

    Args:
        api_url, model_name, tokens, prompt: forwarded to post_request_to_api.
        max_attempts: maximum number of requests to send.

    Returns:
        ``(tagged_output, attempts, response)``. On success ``tagged_output``
        is the parsed word->tag dict and ``attempts`` is the 1-based attempt
        that succeeded. After ``max_attempts`` failures ``tagged_output`` is
        None, ``attempts`` is the number of requests made and ``response`` is
        the last raw response (None if no request was made).
    """
    response = None  # stays None when max_attempts <= 0 (previously a NameError)
    attempts_made = 0
    for _ in range(max_attempts):
        response = post_request_to_api(api_url, model_name, tokens, prompt)['choices'][0]['message']['content']
        attempts_made += 1
        tagged_output = parse_tagged_output(response)
        if tagged_output and are_tags_valid(tagged_output):
            return tagged_output, attempts_made, response

    # Log failed responses. Prompts and responses contain Devanagari text, so
    # the encoding is fixed to UTF-8 rather than left to the platform default.
    with open("failed_responses.txt", "a", encoding="utf-8") as f:
        f.write(f"Prompt:\n{prompt}\nResponse:\n{response}\n\n")
    return None, attempts_made, response

# Format test sentence to contain only words (for LLM input)
def format_test_sentence(sentence):
    """Strip the tags from formatted pairs, keeping one ``<pair><word>..</word></pair>`` per line."""
    word_only_pairs = []
    for pair in sentence:
        after_open = pair.split('<word>')[1]
        word = after_open.split('</word>')[0]
        word_only_pairs.append(f"<pair><word>{word}</word></pair>")
    return "\n".join(word_only_pairs)

# Format sentence with NER tags for LLM training examples
def format_sentence(sentence):
    """Wrap the newline-joined lines of *sentence* in ``<ner_tagged_sentence>`` tags."""
    body = "\n".join(sentence)
    return f"<ner_tagged_sentence>\n{body}\n</ner_tagged_sentence>"

# Run an experiment and save NER predictions to a CSV file
def run_experiment_and_save_predictions(experiment_class, train_examples, test_sentence, api_url, model_name, tokens, output_csv_file, index):
    """Run one experiment on one test sentence and append the result to a CSV.

    Args:
        experiment_class: experiment class to instantiate (Experiment1 or a
            class taking ``(train_examples, test_sentence)``).
        train_examples: training sentences passed to experiments that use them.
        test_sentence: list of formatted ``<pair>`` strings for one sentence.
        api_url, model_name, tokens: forwarded to the LLM request.
        output_csv_file: pathlib.Path of the CSV to create or append to.
        index: sentence index stored in the 'sentence_index' column.

    The rows written are one per word, followed by one all-empty separator row.
    When no valid tagging was obtained, the words are written with empty tags.
    """
    formatted_test_sentence = format_test_sentence(test_sentence)

    # Only Experiment1 has a one-argument constructor. Experiment2 requires
    # the training examples as well, so it must take the two-argument branch
    # (it was previously constructed with one argument and raised TypeError).
    if experiment_class is Experiment1:
        experiment = experiment_class(formatted_test_sentence)
    else:
        experiment = experiment_class(train_examples, formatted_test_sentence)

    prompt = experiment.generate_prompt()
    tagged_output, attempt_count, response = prompt_with_retry(api_url, model_name, tokens, prompt)

    # Create dictionary of words and predicted tags.
    # NOTE: both branches are keyed by word, so repeated words collapse to one row.
    if tagged_output is None:
        word_tag_dict = {word: "" for word in re.findall(r"<word>(.*?)</word>", formatted_test_sentence, re.DOTALL)}
    else:
        word_tag_dict = tagged_output

    # Convert word-tag dictionary to DataFrame
    word_tag_df = pd.DataFrame(list(word_tag_dict.items()), columns=['words', 'pred_tags'])
    word_tag_df['sentence_index'] = index
    word_tag_df['llm_prompt_count'] = attempt_count
    word_tag_df['llm_response'] = ""
    word_tag_df.at[0, 'llm_response'] = response  # Save response in the first row

    empty_row = pd.DataFrame([["", "", "", "", ""]], columns=['words', 'pred_tags', 'sentence_index', 'llm_prompt_count', 'llm_response'])

    # Append results and an empty separator row; the header is written only
    # when the file is being created (append mode creates a missing file).
    write_header = not output_csv_file.exists()
    word_tag_df.to_csv(output_csv_file, index=False, mode='a', header=write_header)
    empty_row.to_csv(output_csv_file, index=False, mode='a', header=False)
    print(f'Saved output for sentence index {index}')

# Extract sentences from DataFrame for LLM input
def extract_sentences_from_dataframe(df):
    """Group the rows of a 'words'/'tags' DataFrame into formatted sentences.

    Rows whose word is missing (NaN/None) separate sentences. Returns a dict
    mapping a running sentence index (0, 1, ...) to a list of
    ``<pair><word>..</word><pred_tag>..</pred_tag></pair>`` strings.
    """
    sentences = {}
    pending = []

    for _, row in df.iterrows():
        word, tag = row['words'], row['tags']
        if not pd.isna(word):
            pending.append(f"<pair><word>{word}</word><pred_tag>{tag}</pred_tag></pair>")
            continue
        # Separator row: store the sentence under the next free index.
        if pending:
            sentences[len(sentences)] = pending
            pending = []

    # The last sentence may not be followed by a separator row.
    if pending:
        sentences[len(sentences)] = pending

    return sentences

# Training data sizes to use in experiments
training_data_sizes = [50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 750, 1000, 1500, 2000, 2796]

# Run experiments with specified training data sizes
def main():
    """Run Experiment1 and Experiment2 on every test sentence for each training size.

    For each size in ``training_data_sizes`` the matching preprocessed
    training CSV is loaded and one prediction CSV per experiment is written
    to OUTPUT_BASE_PATH.
    """
    api_url = "model's api"
    model_name = "your model name"
    tokens = 3000

    # DataFrame.to_csv does not create missing directories.
    OUTPUT_BASE_PATH.mkdir(parents=True, exist_ok=True)

    # The test set does not depend on the training size, so load it once.
    nepali_test_df = pd.read_csv(DATA_PREPROCESSED_PATH / 'test_final.csv')
    nepali_test_sentences = extract_sentences_from_dataframe(nepali_test_df)

    for num_of_training_data in training_data_sizes:
        print(f"Running experiments with {num_of_training_data} training examples.")

        # Load train data for Nepali NER. The preprocessing cell saves these
        # files as '<n>_selected_sentences.csv' (no '_final' suffix).
        nepali_train_df = pd.read_csv(DATA_PREPROCESSED_PATH / f'{num_of_training_data}_selected_sentences.csv')

        # Extract sentences for experiment input
        nepali_train_sentences = extract_sentences_from_dataframe(nepali_train_df)

        # Experiment configurations
        experiment_data = {
            1: {
                'train_examples': None,  # No training data required
                'test_examples': nepali_test_sentences,
                'experiment_class': Experiment1,
                'output_file': OUTPUT_BASE_PATH / f'experiment_1_predictions_{num_of_training_data}.csv'
            },
            2: {
                'train_examples': nepali_train_sentences,
                'test_examples': nepali_test_sentences,
                'experiment_class': Experiment2,
                'output_file': OUTPUT_BASE_PATH / f'experiment_2_predictions_{num_of_training_data}.csv'
            }
        }

        # Run each experiment
        for exp_num, exp_data in experiment_data.items():
            train_examples = exp_data['train_examples']
            test_sentences = exp_data['test_examples']
            experiment_class = exp_data['experiment_class']
            output_csv_file = exp_data['output_file']

            total_sentences = len(test_sentences)
            print(f"Total test sentences for Experiment {exp_num}: {total_sentences}")

            # Save predictions for each sentence
            for index, test_sentence in test_sentences.items():
                if test_sentence:
                    run_experiment_and_save_predictions(
                        experiment_class, train_examples, test_sentence,
                        api_url, model_name, tokens, output_csv_file, index
                    )
                else:
                    print(f"Sentence index {index} not found in test sentences.")

if __name__ == "__main__":
    main()


In [None]:
import os
import pandas as pd

# Define paths for output directory
# The folder lives inside OUTPUT_BASE_PATH; os.listdir below is not recursive,
# so files written here are not picked up again by the loop.
output_folder = os.path.join(OUTPUT_BASE_PATH, 'empty_pred_tags_results')

# Create the output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Process each CSV file in the results folder
for filename in os.listdir(OUTPUT_BASE_PATH):
    if filename.endswith('.csv'):
        # Load CSV data into a DataFrame
        file_path = os.path.join(OUTPUT_BASE_PATH, filename)
        df = pd.read_csv(file_path)

        print(f"Processing file: '{filename}'")

        # Fill NaN values with empty strings for consistent handling
        # (this includes the all-empty separator rows between sentences)
        df = df.fillna('')

        # Track the current sentence index for clearing repeated 'llm_prompt_count' values
        current_sentence_index = None

        # Clear repeated 'llm_prompt_count' values within the same sentence,
        # so that each sentence contributes its attempt count exactly once
        for index, row in df.iterrows():
            # Identify new sentence boundaries
            # (a separator row has sentence_index '' and therefore also starts a new group)
            if row['sentence_index'] != current_sentence_index:
                current_sentence_index = row['sentence_index']
            else:
                # Set 'llm_prompt_count' to empty for rows within the same sentence
                df.at[index, 'llm_prompt_count'] = ''

        # Save the modified DataFrame to a new CSV file
        modified_output_path = os.path.join(output_folder, f'modified_{filename}')
        df.to_csv(modified_output_path, index=False)
        print(f"Processed file saved as '{modified_output_path}'.")

        # Count unique values in 'llm_prompt_count' after removing empty entries
        non_empty_counts = df['llm_prompt_count'][df['llm_prompt_count'] != '']
        value_counts = non_empty_counts.value_counts()
        print("Occurrences of each unique 'llm_prompt_count' value:")
        print(value_counts)

        # Save the counts to a CSV file
        output_counts_path = os.path.join(output_folder, f'llm_prompt_count_repeats_{filename}')
        value_counts.to_csv(output_counts_path, header=['Count'])
        print(f"Counts of 'llm_prompt_count' values saved as '{output_counts_path}'.")

        # Filter rows where 'llm_prompt_count' is 3 and 'pred_tags' is empty
        # (3 is the default max_attempts of prompt_with_retry, i.e. all retries were used;
        # only the first row of each sentence still carries a count at this point)
        prompt_3_df = df[df['llm_prompt_count'] == 3]
        # The isna() test is redundant after fillna('') above; the == '' test does the work
        empty_pred_tags_df = prompt_3_df[(prompt_3_df['pred_tags'].isna()) | (prompt_3_df['pred_tags'] == '')]

        # Identify unique sentence indices with empty 'pred_tags' where 'llm_prompt_count' is 3
        sentence_indices_with_empty_pred_tags = empty_pred_tags_df['sentence_index'].unique()

        # Extract all rows for sentences with empty 'pred_tags' and save to a new file
        full_sentences_df = df[df['sentence_index'].isin(sentence_indices_with_empty_pred_tags)]
        empty_pred_tags_count = len(empty_pred_tags_df)
        print(f"Number of times 'pred_tags' is empty for 'llm_prompt_count' = 3: {empty_pred_tags_count}")

        # Save sentences with empty 'pred_tags' to a new CSV
        output_empty_pred_tags_path = os.path.join(output_folder, f'empty_pred_tags_sentences_{filename}')
        full_sentences_df.to_csv(output_empty_pred_tags_path, index=False)
        print(f"Saved sentences with 'llm_prompt_count' = 3 and empty 'pred_tags' to '{output_empty_pred_tags_path}'.")


In [None]:
import os
import pandas as pd
import math
import difflib
import csv

# Tags accepted when validating predictions
ALLOWED_TAGS = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'O']


def tags_are_allowed(tags):
    """Return True when every element of *tags* is one of ALLOWED_TAGS."""
    for tag in tags:
        if tag not in ALLOWED_TAGS:
            return False
    return True

# Extract sentences and tags from DataFrame columns
def extract_sentences_and_tags(df, words_column, tags_column):
    """Split a DataFrame into parallel lists of sentences and tag sequences.

    A row whose *words_column* cell is a float NaN ends the current sentence.
    Returns ``(sentences, tags)`` where ``sentences[k]`` is a list of words
    and ``tags[k]`` the list of values from *tags_column* for the same rows.
    """
    sentences, tags = [], []
    word_buffer, tag_buffer = [], []

    for _, row in df.iterrows():
        cell = row[words_column]
        is_separator = isinstance(cell, float) and math.isnan(cell)
        if not is_separator:
            word_buffer.append(cell)
            tag_buffer.append(row[tags_column])
        elif word_buffer:
            # NaN separates sentences: flush what has been collected.
            sentences.append(word_buffer)
            tags.append(tag_buffer)
            word_buffer, tag_buffer = [], []

    # Flush the final sentence when the data does not end with a separator.
    if word_buffer:
        sentences.append(word_buffer)
        tags.append(tag_buffer)

    return sentences, tags

# Calculate similarity between two sentences based on word sequence
def sentence_similarity(sent1, sent2):
    """Return the difflib similarity ratio (0.0 to 1.0) of two word sequences."""
    matcher = difflib.SequenceMatcher(isjunk=None, a=sent1, b=sent2)
    return matcher.ratio()

# Align sentences and save to CSV with both tags and predicted tags columns
def save_aligned_sentences_to_csv(sentences_test, tags_test, sentences_pred, tags_pred, threshold=0.8, output_file='aligned_sentences.csv'):
    """Match test sentences to predicted sentences and write the aligned rows.

    For each test sentence, the first not-yet-used predicted sentence whose
    similarity is at least *threshold* and whose tags are all allowed is taken
    as its match. The word-level alignment of each matched pair is written to
    *output_file*, followed by an empty row. Everything left unmatched is
    handed to save_unaligned_analysis, which writes to *output_file* with
    'aligned' replaced by 'unaligned'.

    Args:
        sentences_test, tags_test: parallel lists of gold sentences and tags.
        sentences_pred, tags_pred: parallel lists of predicted sentences and tags.
        threshold: minimum similarity ratio for a match.
        output_file: path (str) of the aligned CSV.
    """
    # Matches are tracked by index. Removing matched items by value (as was
    # done before) raised ValueError when one prediction matched two test
    # sentences, and could remove the tag list of a different sentence with
    # identical tags, leaving the sentence and tag lists out of step.
    matched_test, matched_pred = set(), set()

    # Tag validity depends only on the prediction, so evaluate it once.
    pred_is_valid = [tags_are_allowed(tags) for tags in tags_pred]

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Test Sentence', 'Predicted Sentence', 'Test Tags', 'Predicted Tags'])

        # Align test sentences with predicted sentences based on similarity threshold
        for i, test_sent in enumerate(sentences_test):
            for j, pred_sent in enumerate(sentences_pred):
                # Predictions with invalid tags are never aligned; they stay
                # in the unmatched set. (The per-pair rows formerly appended
                # to the unaligned file here were always overwritten by
                # save_unaligned_analysis below, so they are no longer written.)
                if j in matched_pred or not pred_is_valid[j]:
                    continue
                if sentence_similarity(test_sent, pred_sent) < threshold:
                    continue

                aligned_test, aligned_pred, aligned_tags_test, aligned_tags_pred = align_sentences_tags(
                    test_sent, pred_sent, tags_test[i], tags_pred[j])

                for t_word, p_word, t_tag, p_tag in zip(aligned_test, aligned_pred, aligned_tags_test, aligned_tags_pred):
                    writer.writerow([t_word, p_word, t_tag, p_tag])
                writer.writerow([])  # Empty row between aligned pairs

                matched_test.add(i)
                matched_pred.add(j)
                break  # Move to next test sentence after alignment

    unmatched_test = [s for i, s in enumerate(sentences_test) if i not in matched_test]
    unmatched_tags_test = [t for i, t in enumerate(tags_test) if i not in matched_test]
    unmatched_pred = [s for j, s in enumerate(sentences_pred) if j not in matched_pred]
    unmatched_tags_pred = [t for j, t in enumerate(tags_pred) if j not in matched_pred]

    print(f"Aligned sentences: {len(matched_test)}")
    print(f"Unmatched sentences in test set: {len(unmatched_test)}")
    print(f"Unmatched sentences in predictions: {len(unmatched_pred)}")

    # Save unaligned sentences in order of similarity
    save_unaligned_analysis(unmatched_test, unmatched_tags_test, unmatched_pred, unmatched_tags_pred, output_file=output_file.replace('aligned', 'unaligned'))

# Save unaligned sentences to a CSV file
def save_unaligned_sentence_to_csv(test_sent, test_tags, pred_sent, pred_tags, output_file):
    """Append one test/prediction pair to *output_file* as CSV rows.

    Writes a header row, one row per position (truncated to the shortest of
    the four sequences) and a trailing empty row.
    """
    header = ['Test Sentence', 'Predicted Sentence', 'Test Tags', 'Predicted Tags']
    with open(output_file, mode='a', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(header)
        out.writerows(zip(test_sent, pred_sent, test_tags, pred_tags))
        out.writerow([])

# Align sentences and tags by matching words, filling unmatched words with blanks
def _pad_to(items, length):
    """Return *items* as a list, right-padded with '' up to *length* entries."""
    items = list(items)
    return items + [''] * (length - len(items))


def align_sentences_tags(sent1, sent2, tags1, tags2):
    """Align two word sequences and their tags position by position.

    Uses difflib opcodes to line up *sent1* with *sent2*. Wherever one side
    has no counterpart (insertions, deletions, or the shorter side of a
    replacement) the gap is filled with ''.

    Returns:
        ``(aligned_sent1, aligned_sent2, aligned_tags1, aligned_tags2)``,
        four lists of equal length.
    """
    matcher = difflib.SequenceMatcher(None, sent1, sent2)
    aligned_sent1, aligned_sent2, aligned_tags1, aligned_tags2 = [], [], [], []

    for _opcode, i1, i2, j1, j2 in matcher.get_opcodes():
        # Pad every span to the longer side. For 'equal' both sides already
        # match; for 'insert'/'delete' one side is empty. For 'replace' the
        # two spans may differ in length, and without padding every following
        # row would be shifted when the four lists are zipped together.
        width = max(i2 - i1, j2 - j1)
        aligned_sent1.extend(_pad_to(sent1[i1:i2], width))
        aligned_sent2.extend(_pad_to(sent2[j1:j2], width))
        aligned_tags1.extend(_pad_to(tags1[i1:i2], width))
        aligned_tags2.extend(_pad_to(tags2[j1:j2], width))

    return aligned_sent1, aligned_sent2, aligned_tags1, aligned_tags2

# Analyze and save unaligned sentences by descending similarity
def save_unaligned_analysis(unmatched_test, unmatched_tags_test, unmatched_pred, unmatched_tags_pred, output_file='unaligned_sentences.csv'):
    """Write every unmatched test/prediction pair with non-zero similarity.

    All combinations of an unmatched test sentence and an unmatched predicted
    sentence are scored. Pairs with similarity above 0 are written to
    *output_file* (overwriting it) in descending order of similarity, each as
    a word-level alignment followed by an empty row.
    """
    scored_pairs = []

    for i, test_sent in enumerate(unmatched_test):
        # Skip sentences that have no corresponding tag list.
        if i >= len(unmatched_tags_test):
            continue
        for j, pred_sent in enumerate(unmatched_pred):
            if j >= len(unmatched_tags_pred):
                continue
            score = sentence_similarity(test_sent, pred_sent)
            if score > 0:
                scored_pairs.append((test_sent, pred_sent, unmatched_tags_test[i], unmatched_tags_pred[j], score))

    # Most similar pairs first; ties keep their discovery order.
    ranked_pairs = sorted(scored_pairs, key=lambda pair: pair[4], reverse=True)

    with open(output_file, mode='w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(['Test Sentence', 'Predicted Sentence', 'Test Tags', 'Predicted Tags'])

        for test_sent, pred_sent, test_tags, pred_tags, _score in ranked_pairs:
            aligned = align_sentences_tags(test_sent, pred_sent, test_tags, pred_tags)
            for t_word, p_word, t_tag, p_tag in zip(*aligned):
                out.writerow([t_word, p_word, t_tag, p_tag])
            out.writerow([])

    print(f"Unaligned sentences sorted by similarity saved to {output_file}.")

# Folder paths for aligned and unaligned output
# (plain strings; save_aligned_sentences_to_csv calls str.replace on the output path)
aligned_folder = os.path.join(OUTPUT_BASE_PATH, 'aligned/')
unaligned_folder = os.path.join(OUTPUT_BASE_PATH, 'unaligned/')

# Create output folders if they don't exist
os.makedirs(aligned_folder, exist_ok=True)
os.makedirs(unaligned_folder, exist_ok=True)

# Load test file and extract sentences and tags
# NOTE(review): this absolute path is hard-coded and does not use DATA_BASE_PATH /
# DATA_PREPROCESSED_PATH; it is unclear from this notebook how 'test_mapped_updated.csv'
# relates to the 'test_final.csv' written by the preprocessing cell — confirm.
test_file = pd.read_csv("/home/jovyan/transfer_learning/data/nepali-data/test_mapped_updated.csv")
sentences_test, tags_test = extract_sentences_and_tags(test_file, 'words', 'tags')

# Process each results file in the results folder
for file_name in os.listdir(OUTPUT_BASE_PATH):
    # NOTE(review): only 'experiment_2' and 'experiment_4' files are selected; this notebook
    # writes 'experiment_1' and 'experiment_2' files, so experiment 1 output is skipped here.
    if file_name.endswith('.csv') and ('experiment_4' in file_name or 'experiment_2' in file_name):
        results_path = os.path.join(OUTPUT_BASE_PATH, file_name)
        results = pd.read_csv(results_path)
        
        # Extract sentences and predicted tags from results
        sentences_pred, tags_pred = extract_sentences_and_tags(results, 'words', 'pred_tags')

        # Save aligned and unaligned sentences
        aligned_output = os.path.join(aligned_folder, f'aligned_{file_name}')
        # unaligned_output is not passed on: the callee derives the unaligned path itself by
        # replacing 'aligned' with 'unaligned' in aligned_output (folder and file name alike).
        unaligned_output = os.path.join(unaligned_folder, f'unaligned_{file_name}')
        save_aligned_sentences_to_csv(sentences_test, tags_test, sentences_pred, tags_pred, threshold=0.8, output_file=aligned_output)


In [None]:
import os
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix
import numpy as np

# Function to compute and print evaluation metrics for each CSV file in a folder
def compute_metrics_for_each_file(folder_path):
    """Print entity-level evaluation metrics for every CSV in *folder_path*.

    Each CSV must have 'Test Tags' and 'Predicted Tags' columns. Rows missing
    either value are dropped, rows whose gold tag is 'O' are excluded, and a
    classification report, confusion matrix and support-weighted
    precision/recall/F1 are printed per file.
    """
    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.csv'):
            continue

        frame = pd.read_csv(os.path.join(folder_path, file_name))

        # Keep only rows where both a gold and a predicted tag are present.
        frame = frame.dropna(subset=['Predicted Tags', 'Test Tags'])
        gold = frame['Test Tags'].values
        predicted = frame['Predicted Tags'].values

        # Evaluate entity tokens only: drop rows whose gold tag is 'O'.
        entity_rows = gold != 'O'
        gold, predicted = gold[entity_rows], predicted[entity_rows]

        precision, recall, f1, _ = precision_recall_fscore_support(gold, predicted, average='weighted', zero_division=0)
        entity_labels = np.unique(gold)
        conf_matrix = confusion_matrix(gold, predicted, labels=entity_labels)

        # Display metrics for the current file
        print(f"Metrics for {file_name}:")
        print("Classification Report:")
        print(classification_report(gold, predicted, labels=entity_labels, zero_division=0))
        print("\nConfusion Matrix:")
        print(conf_matrix)
        print(f"\nOverall Precision: {precision:.4f}")
        print(f"Overall Recall: {recall:.4f}")
        print(f"Overall F1 Score: {f1:.4f}\n")
        print("=" * 50)  # Divider for readability

# Run the metric computation function on the specified folder
compute_metrics_for_each_file(aligned_folder)


In [None]:
import numpy as np
import os
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from scipy.stats import kruskal

# Accumulators filled by compute_metrics_for_each_file: one entry per CSV file
all_f1_scores = []   # per-tag F1 arrays, one array per file
file_names = []      # file name belonging to each entry of all_f1_scores

# Function to load and process each file, compute metrics
def compute_metrics_for_each_file(folder_path):
    """Collect per-tag F1 scores for every CSV in *folder_path*.

    Each CSV must have 'Test Tags' and 'Predicted Tags' columns. Rows missing
    either value are dropped and rows whose gold tag is 'O' are excluded. For
    each file, the array of F1 scores (one per entity tag present in the gold
    data) is appended to ``all_f1_scores`` and the file name to ``file_names``.
    """
    # Iterate over each CSV file in the folder
    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)

            # Filter out rows with missing 'Predicted Tags' or 'Test Tags'
            df = df.dropna(subset=['Predicted Tags', 'Test Tags'])

            # Extract true and predicted tags for the file
            true_tags = df['Test Tags'].values
            pred_tags = df['Predicted Tags'].values

            # Filter out 'O' tags
            mask = true_tags != 'O'
            true_tags = true_tags[mask]
            pred_tags = pred_tags[mask]

            # Calculate F1 for each unique tag, excluding 'O'.
            # average=None returns one score per label. The previous
            # average='weighted' collapsed them into a single scalar, leaving
            # the Kruskal-Wallis test with one observation per group.
            labels = np.unique(true_tags)
            _, _, f1, _ = precision_recall_fscore_support(true_tags, pred_tags, labels=labels, average=None, zero_division=0)

            # Append the array of scores per tag for each file (multiple values per group for Kruskal-Wallis)
            all_f1_scores.append(f1)
            file_names.append(file_name)

# Run the function on the specified folder
compute_metrics_for_each_file(aligned_folder)

# Check if there are at least two groups for the Kruskal-Wallis test.
# Only F1 scores are collected in this cell; the previous check referenced
# precision/recall lists that are never defined and raised NameError.
if len(all_f1_scores) < 2:
    print("Not enough groups to perform Kruskal-Wallis test. Ensure at least two CSV files are in the specified folder.")
else:
    # Each file's F1 scores form one group of the test
    f1_test = kruskal(*all_f1_scores)

    # Display the results
    print("Kruskal-Wallis Test Results:")
    print(f"F1 Score - p-value: {f1_test.pvalue:.4f}")

    # Interpret results
    if f1_test.pvalue < 0.05:
        print("\nThere is a statistically significant difference in F1 scores among the models.")
    else:
        print("\nNo statistically significant difference in F1 scores among the models.")
