In [1]:
import pandas as pd
import re

In [25]:
def is_amharic(text):
    """
    Check if text contains Amharic characters.
    Amharic Unicode range: 0x1200-0x137F (Ethiopic)
    """
    if pd.isna(text):
        return False
    amharic_pattern = re.compile(r'[\u1200-\u137F]')
    return bool(amharic_pattern.search(str(text)))

def filter_non_amharic_rows(csv_path):
    """
    Filter out rows containing Amharic text and save to a new CSV.
    Returns the filtered DataFrame.
    """
    # Read the original CSV
    df = pd.read_csv(csv_path)

    # Replace the path prefix
    df['images'] = df['ocr'].str.replace('/data/local-files/?d=', './')

    # Replace NaN values with empty string in genre column
    df['text'] = df['genre'].fillna('')

    # Create a mask for non-Amharic rows
    non_amharic_mask = ~df['genre'].apply(is_amharic)

    # Filter the DataFrame
    df_english = df[non_amharic_mask].copy()

    df_english = df_english[df_english['text'].notna() & (df_english['text'] != '')]
    df_english = df_english[["text", "images"]]
    # Save to new CSV
    df_english.to_csv('labeled_english.csv', index=False)

    # Print statistics
    total_rows = len(df)
    english_rows = len(df_english)
    amharic_rows = total_rows - english_rows

    print(f"Original number of rows: {total_rows}")
    print(f"Rows with Amharic text: {amharic_rows}")
    print(f"Rows without Amharic text: {english_rows}")
    print("Filtered data saved to 'labeled_english.csv'")


    return df_english

In [26]:
df_english = filter_non_amharic_rows("/home/admin/blessed/HandWritten_Amharic_English_OCR/datasets/Labeled dummy regions/project-1-at-2024-12-19-10-47-bbac5eeb.csv")

Original number of rows: 615
Rows with Amharic text: 274
Rows without Amharic text: 341
Filtered data saved to 'labeled_english.csv'


In [27]:
df_english.head()

Unnamed: 0,text,images
0,52,./regions/1000020569_1.png
1,909121212,./regions/1000020569_10.png
5,04/01/1972,./regions/1000020569_14.png
7,Addis Ababa,./regions/1000020569_3.png
9,Hailemeskel Gebreegziabher kidane,./regions/1000020569_5.png
