In [23]:
import pandas as pd

# Both corpora are tab-separated; rows pandas cannot parse are skipped
# (on_bad_lines='skip') instead of aborting the load.
tsv_options = dict(sep='\t', on_bad_lines='skip')
enam = pd.read_csv('../dataset/enam.tsv', **tsv_options)
amen = pd.read_csv('../dataset/amen.tsv', **tsv_options)

# Preview the first rows of each frame, enam first.
for frame in (enam, amen):
    print(frame.head())



  Domain                                             Source  \
0   news  An 11-year-old student opens fire at his schoo...   
1   news  Rock musician Ozzy Osbourne announces that he ...   
2   news  After a 2,000-point drop on Monday, the DJIA f...   
3   news  Italy confirms that an Italian man has contrac...   
4   news  The EF3 tornado that went through Davidson, Wi...   

                               Reference Translation  \
0  አንድ የ11 ዓመት ተማሪ Torreón፣ Coahuila፣ ሜክሲኮ በሚገኝ ት...   
1         የሮክ ሙዚቀኛ ኦዚ ኦስቦርን በፓርክንሰን በሽታ መያዙን አስታዉቋል።   
2  ሰኞ ቀን ላይ 2,000 ነጥብ ከጣለ በኋላ፣ ከNYSE የመክፈቻ ደወል በፊ...   
3  አንድ ጣሊያናዊ የሴት ጓደኛዉን ዉሃን ዉስጥ ከጎበኛት በኋላ በቫይረሱ መያ...   
4  በዴቪድሰን፣ ዊልሰን፣ እና ስሚዝ አዉራጃዎች በኩል ያለፈው EF3 ሀይለኛ ...   

                                    Google Translate  \
0  አንድ የ 11 ዓመት ልጅ በሜክሲኮ ፣ ቶዋሪ ፣ ኮዋይላ ፣ ሜክሲኮ ውስጥ ...   
1  የሮክ ሙዚቃ ባለሙያው ኦዚ ኦስቦርን በፓርኪንሰን በሽታ መያዙን አስታውቋል ፡፡   
2  ሰኞ ከ 2,000 ነጥብ ከወደቀ በኋላ የዲጄሪያ የወደፊት ዕጣ ፈንታ ከ N...   
3  በቻን ውስጥ የሴት ጓደኛዋን ከጎበኘ በኋላ አንድ ጣሊያናዊ ሰው ቫይረሱ እ...   
4  በ

## Preprocess the Data


### Clean the Text

In [24]:
import re

def clean_text(text):
    """Strip markup and unwanted symbols from a single cell value.

    Args:
        text: The cell value. Usually a ``str``; may be a missing value
            (``None``/``NaN``) or another scalar such as a number.

    Returns:
        str: The cleaned text. Missing values yield ``""``.

    Processing order:
        1. Values that ``pd.isnull`` reports as null become ``""``.
        2. Any other non-string value is converted with ``str()`` so that it
           is cleaned rather than making ``re.sub`` raise ``TypeError``.
        3. ``<...>`` spans are removed (non-greedy, single line).
        4. Every character that is not a word character, whitespace, one of
           the listed Ethiopic punctuation marks, a curly/straight quote, or
           one of ``. , ! ? -`` is dropped.
        5. Whitespace runs collapse to one space and both ends are trimmed.
    """
    if pd.isnull(text):
        return ""
    if not isinstance(text, str):
        # re.sub only accepts str here; a numeric cell would otherwise crash.
        text = str(text)
    text = re.sub(r'<.*?>', '', text)  # Remove HTML
    text = re.sub(r'[^\w\s፡።፣፤፥፦፧፨’‘“”".,!?\'\-]', '', text)  # Keep only meaningful characters
    text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
    return text.strip()

# enam.tsv (English source, Amharic targets): run clean_text over the source
# column and each of the three translation columns, in place.
for col in ('Source', 'Reference Translation', 'Google Translate', 'Yandex Translate'):
    enam[col] = enam[col].apply(clean_text)

# amen.tsv (Amharic source, English targets): same treatment; only the name
# of the source column differs.
for col in ('Source Sentence', 'Reference Translation', 'Google Translate', 'Yandex Translate'):
    amen[col] = amen[col].apply(clean_text)


### Normalize


In [25]:
import unicodedata

def normalize_amharic(text):
    """Return ``text`` in Unicode NFKC normal form.

    NFKC folds compatibility characters (ligatures, full-width forms, ...)
    into their canonical equivalents and composes combining sequences.
    """
    normalized = unicodedata.normalize('NFKC', text)
    return normalized

def lowercase_english(text):
    """Return a lowercased copy of ``text`` (via ``text.lower()``)."""
    lowered = text.lower()
    return lowered

# Both files use the same names for their three translation columns.
translation_columns = ['Reference Translation', 'Google Translate', 'Yandex Translate']

# enam.tsv: the source column is lowercased; every translation column goes
# through normalize_amharic.
enam['Source'] = enam['Source'].apply(lowercase_english)
for col in translation_columns:
    enam[col] = enam[col].apply(normalize_amharic)

# amen.tsv: the mirror image — the source column goes through
# normalize_amharic; every translation column is lowercased.
amen['Source Sentence'] = amen['Source Sentence'].apply(normalize_amharic)
for col in translation_columns:
    amen[col] = amen[col].apply(lowercase_english)


### Filter Sentence Pairs

In [26]:
def valid_length(src, tgt, min_len=3, max_len=100):
    """Tell whether both sentences have an acceptable word count.

    Words are whatever ``str.split()`` yields (whitespace-separated tokens).

    Args:
        src: Source sentence.
        tgt: Target sentence; only inspected when ``src`` already passed.
        min_len: Smallest allowed token count (inclusive).
        max_len: Largest allowed token count (inclusive).

    Returns:
        bool: ``True`` only when both counts lie in ``[min_len, max_len]``.
    """
    for sentence in (src, tgt):
        token_count = len(sentence.split())
        if not (min_len <= token_count <= max_len):
            return False
    return True

# enam.tsv, English (src) → Amharic (tgt): build a row-wise boolean mask from
# valid_length on the source / reference pair, then keep the rows that pass.
def _enam_pair_ok(row):
    return valid_length(row['Source'], row['Reference Translation'])

enam_mask = enam.apply(_enam_pair_ok, axis=1)
enam_filtered = enam[enam_mask]

# amen.tsv, Amharic (src) → English (tgt): same check on its own source column.
def _amen_pair_ok(row):
    return valid_length(row['Source Sentence'], row['Reference Translation'])

amen_mask = amen.apply(_amen_pair_ok, axis=1)
amen_filtered = amen[amen_mask]


## Save the Cleaned Dataset

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
import os

os.makedirs('../dataset/cleaned', exist_ok=True)

# Save cleaned English → Amharic: source and reference columns only,
# tab-separated, without index or header row.
enam_filtered[['Source', 'Reference Translation']].to_csv(
    '../dataset/cleaned/enam_cleaned.tsv', sep='\t', index=False, header=False
)

# Save cleaned Amharic → English in the same two-column layout.
amen_filtered[['Source Sentence', 'Reference Translation']].to_csv(
    '../dataset/cleaned/amen_cleaned.tsv', sep='\t', index=False, header=False
)
