<a href="https://colab.research.google.com/github/dsynderg/479_Final_Project/blob/main/mtevalpreparer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import itertools

def tabmaker(listofFiles, output_filename="combined_output.tsv"):
    """
    Combine several text files line-by-line into one tab-separated file.

    Row i of the output holds line i of every readable input file, in the
    order the files were given, joined by tab characters. When the files have
    different lengths, the shorter ones contribute empty fields so that every
    row has one field per opened file.

    Each field has its leading/trailing whitespace removed. A tab character
    found *inside* a field is replaced by a single space, because writing it
    verbatim would add an extra column to that row.

    Files that do not exist are reported and skipped. Note that a skipped
    file contributes no column at all, so the columns after it shift left.

    Args:
        listofFiles (list): A list of file paths to process.
        output_filename (str): The name of the output TSV file.

    Returns:
        None. Progress and errors are reported with print(); any exception
        raised while combining is caught and printed rather than propagated.
    """
    open_files = []
    try:
        for file_path in listofFiles:
            try:
                open_files.append(open(file_path, 'r', encoding='utf-8'))
            except FileNotFoundError:
                print(f"Warning: File not found at {file_path}. Skipping.")

        if not open_files:
            print("No valid files found to combine.")
            return

        with open(output_filename, 'w', encoding='utf-8') as outfile:
            # zip_longest keeps going until the longest file is exhausted,
            # substituting '' for files that have already ended.
            for lines in itertools.zip_longest(*open_files, fillvalue=''):
                # Trim each field, then neutralise embedded tabs so the tab
                # character only ever appears as the column separator.
                fields = (line.strip().replace('\t', ' ') for line in lines)
                outfile.write('\t'.join(fields) + '\n')
        print(f"Combined content from {len(open_files)} files written to {output_filename}")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Close every input that was successfully opened, even on failure.
        for f in open_files:
            f.close()

In [1]:
def read_two_column_tsv_to_list_of_lists(file_path):
    """
    Reads a two-column TSV file and returns its content as a list of lists.
    Each inner list contains two items, corresponding to the first two
    tab-separated columns of a line; any further columns are ignored.

    Only the line terminator is removed before splitting, so a row whose
    first or second column is empty (e.g. "\\tvalue" or "value\\t") is kept
    with an empty string in that position instead of being dropped. Each
    returned value has its surrounding whitespace stripped.

    Lines that are blank / whitespace-only, or that contain no tab, are
    skipped with a printed warning.

    Args:
        file_path (str): The path to the two-column TSV file.

    Returns:
        list: A list of lists, where each inner list contains two strings (the column values).
              Returns an empty list if the file is not found or no valid data is read.
              If an error occurs part-way through, the rows read so far are returned.
    """
    data = []
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                text = line.strip()
                # Split the raw line (minus its newline) rather than the
                # stripped text: stripping first would swallow a leading or
                # trailing tab and hide an empty column.
                parts = line.rstrip('\r\n').split('\t')
                if text and len(parts) >= 2:
                    data.append([parts[0].strip(), parts[1].strip()])
                else:
                    print(f"Warning: Skipping line '{text}' in '{file_path}' as it does not contain at least two tab-separated columns.")
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'.")
    except Exception as e:
        print(f"An error occurred while reading '{file_path}': {e}")
    return data

In [12]:
# Parse the Spanish human-sentences file into a list of [first_column, second_column] pairs.
# The absolute /content path presumably points at a file uploaded to the Colab session — confirm before running elsewhere.
spanishlines = read_two_column_tsv_to_list_of_lists('/content/30_human_Sentences_spanish.txt')

In [13]:
# Sanity check: show the first parsed row.
# Raises IndexError if the reader returned an empty list (e.g. the input file was not found).
print(spanishlines[0])

["<Spanish> President Henry B. Eyring, 'Hearts Bound Together,' Ensign, May 2005", "Presidente Henry B. Eyring, 'Teniendo entrelazados sus corazones', Liahona, mayo de 2005"]


In [22]:
def write_second_elements_to_file(list_of_lists, output_file_path):
    """
    Write the item at index 1 of every inner sequence to a text file,
    one item per line.

    Inner sequences with fewer than two items are reported with a printed
    warning and produce no line in the output. Any exception raised while
    writing is caught and printed instead of being propagated.

    Args:
        list_of_lists (list): Sequence of sequences; index 1 of each is written.
        output_file_path (str): Destination text file (overwritten if present).
    """
    try:
        with open(output_file_path, 'w', encoding='utf-8') as sink:
            for row in list_of_lists:
                if len(row) < 2:
                    # Nothing at index 1: report it and move on to the next row.
                    print(f"Warning: Skipping item '{row}' as it does not contain a second element.")
                    continue
                sink.write(row[1] + '\n')
        print(f"Second elements written to {output_file_path}")
    except Exception as err:
        print(f"An error occurred while writing to file: {err}")

In [15]:
# Extract the second column of every Spanish row into its own file, one entry per line.
# The file is written relative to the working directory but later read back as
# '/content/englishTranslationsForSpanish.txt', so this assumes the working directory is /content.
write_second_elements_to_file(spanishlines,'englishTranslationsForSpanish.txt')

Second elements written to englishTranslationsForSpanish.txt


In [16]:
# Input files for the Spanish comparison table, in output-column order:
# the second-column extract written earlier, then three output files
# (presumably system outputs named complete / zeroShot / omni — confirm).
listOfSpanishTranslations = [
    '/content/englishTranslationsForSpanish.txt',
    '/content/spanish_output_complete',
    '/content/spanish_output_zeroShot',
    '/content/spa_output_omni'

]

In [17]:
# Join the listed files line-by-line into SpanishMtEval.tsv, one column per file that could be opened.
tabmaker(listOfSpanishTranslations,'SpanishMtEval.tsv')

Combined content from 4 files written to SpanishMtEval.tsv


In [18]:
# Parse the Portuguese human-evaluation file into a list of [first_column, second_column] pairs.
portugueselines = read_two_column_tsv_to_list_of_lists('/content/portuguese_human_evaluation.txt')

In [19]:
# Extract the second column of every Portuguese row into its own file, one entry per line.
# Written relative to the working directory; the next list reads it from /content, so the two must coincide.
write_second_elements_to_file(portugueselines,'englishTranslationsForportu.txt')

Second elements written to englishTranslationsForportu.txt


In [20]:
# Input files for the Portuguese comparison table, in output-column order.
# NOTE(review): the variable is still called listOfSpanishTranslations although it now
# holds Portuguese paths; this rebinding overwrites the Spanish list from the earlier cell.
listOfSpanishTranslations = [
    '/content/englishTranslationsForportu.txt',
    '/content/portuguese_output_complete',
    '/content/portuguese_output_zeroShot',
    '/content/por_output_omni'

]

In [21]:
# Join the listed files line-by-line into PortugueseMtEval.tsv.
# Uses whatever listOfSpanishTranslations currently holds, so the Portuguese list cell must have run last.
tabmaker(listOfSpanishTranslations,'PortugueseMtEval.tsv')

Combined content from 4 files written to PortugueseMtEval.tsv


In [25]:
# Parse the Papiamento human-translation file into a list of [first_column, second_column] pairs.
paplines = read_two_column_tsv_to_list_of_lists('/content/papiamento_human_translation.txt')

In [26]:
# Extract the second column of every Papiamento row into its own file, one entry per line.
# Written relative to the working directory; the next list reads it from /content, so the two must coincide.
write_second_elements_to_file(paplines,'englishTranslationsForpap.txt')

Second elements written to englishTranslationsForpap.txt


In [27]:
# Input files for the Papiamento comparison table, in output-column order.
# NOTE(review): the variable is still called listOfSpanishTranslations although it now
# holds Papiamento paths; this rebinding overwrites the list assigned in the earlier cells.
listOfSpanishTranslations = [
    '/content/englishTranslationsForpap.txt',
    '/content/papiamento_output_complete',
    '/content/papiamento_output_zeroShot',
    '/content/pap_output_omni'

]

In [28]:
# Join the listed files line-by-line into PapiamentoMtEval.tsv.
# Uses whatever listOfSpanishTranslations currently holds, so the Papiamento list cell must have run last.
tabmaker(listOfSpanishTranslations,'PapiamentoMtEval.tsv')

Combined content from 4 files written to PapiamentoMtEval.tsv
