In [1]:
import pandas as pd
from csv import QUOTE_NONE
import re


In [2]:
# Define the paths to your files
english_file_path = "fr-en/europarl-v7.fr-en.en"
french_file_path = "fr-en/europarl-v7.fr-en.fr"

In [3]:
""" Function to load data into a pandas DataFrame without treating any character as quotes"""

def load_data_to_dataframe(file_path):
    # Read the entire file as a single column DataFrame, ignoring any quoting
    return pd.read_csv(file_path, header=None, names=['text'], encoding='utf-8', sep='\t', quoting=QUOTE_NONE, engine='python')

# Load the data
english_data = load_data_to_dataframe(english_file_path)
french_data = load_data_to_dataframe(french_file_path)

In [4]:
## Take 10% fraction of data randomly sampled
english_data = english_data.sample(frac=0.1, random_state=42)
french_data = french_data.sample(frac=0.1, random_state=42)


In [5]:
def preprocess_data(data_en, data_fr):
    # Lowercase the text
    """
    Normalizing case helps reduce the complexity of the language model by treating words like “The” and “the” as the same word, which can be particularly helpful in languages like English where capitalization is more stylistic than semantic.
    """
    data_en['text'] = data_en['text'].str.lower()
    data_fr['text'] = data_fr['text'].str.lower()

    # Remove XML tags
    """
    Lines containing XML-tags are likely not actual conversational or formal text but rather formatting or metadata which is irrelevant for translation purposes
    """
    data_en['text'] = data_en['text'].apply(lambda x: '' if x.strip().startswith('<') else x)
    data_fr['text'] = data_fr['text'].apply(lambda x: '' if x.strip().startswith('<') else x)

    # Strip empty lines and remove their correspondences
    """
     Empty lines or lines that do not contain any meaningful content should be removed because they do not provide valuable information for training the model. It is also important to remove the corresponding line in the other language to maintain alignment.
    """
    mask = (data_en['text'].str.strip().astype(bool) & data_fr['text'].str.strip().astype(bool))
    data_en = data_en[mask]
    data_fr = data_fr[mask]

    return data_en, data_fr



"""
Steps Not Chosen and Why:

- Removing Numbers or Special Characters: Not chosen because numbers and certain punctuation can carry semantic weight in sentences, which can be important for translations, such as dates, quantities, or formatted text.
- Stemming/Lemmatization: Not typically used in machine translation preprocessing because retaining the full form of words is important for accurate translation, especially between languages with different linguistic structures.
- Removing Stopwords: Not recommended for translation tasks because stopwords (common words like “and”, “the”, etc.) are crucial for maintaining the grammatical structure of the sentence in both source and target languages.
"""


preprocessed_en, preprocessed_fr  = preprocess_data(english_data,french_data)


  data_en = data_en[mask]
  data_fr = data_fr[mask]


In [6]:
"""
Remove all characters that are defined as noise from both translations
"""
def remove_noisy_characters(data):
    # Define the characters to remove
    noisy_characters = re.escape('@#$%^&*~<>|\\{}[]+=_/')
    
    # Regex to match any noisy character
    regex_pattern = f'[{noisy_characters}]'
    
    # Remove noisy characters using regex substitution
    data['text'] = data['text'].apply(lambda x: re.sub(regex_pattern, '', x))
    
    return data


preprocessed_en = remove_noisy_characters(english_data)
preprocessed_fr = remove_noisy_characters(french_data)


In [7]:
# TODO FIX BUG

"""
remove special characterers only if a special character appears in one translation but not in the other and vice versa
"""

def synchronize_special_characters(data_en, data_fr):
    counter = 0
    # Define the characters to synchronize
    special_characters = re.escape('@#$%^&*~<>|\\{}[]+=_/')
    
    # Regex to match any special character
    regex_pattern = f'[{special_characters}]'

    # Process each sentence pair
    for idx in range(len(data_en)):
        if idx >= len(data_fr):  # Ensure index is within the bounds for both dataframes
            break
        
        # Extract texts from both dataframes
        text_en = data_en.loc[idx, 'text']
        text_fr = data_fr.loc[idx, 'text']

        # Find special characters in both texts
        found_chars_en = set(re.findall(regex_pattern, text_en))
        found_chars_fr = set(re.findall(regex_pattern, text_fr))

        # Determine characters to remove (those not in both)
        chars_to_remove = found_chars_en.symmetric_difference(found_chars_fr)

        # Remove the special characters that do not appear in both translations
        if chars_to_remove:
            counter += 1
            remove_regex = '[' + re.escape(''.join(chars_to_remove)) + ']'
            data_en.loc[idx, 'text'] = re.sub(remove_regex, '', text_en)
            data_fr.loc[idx, 'text'] = re.sub(remove_regex, '', text_fr)
    print(counter)
    return data_en, data_fr

#data_en_sync, data_fr_sync = synchronize_special_characters(data_en, data_fr)