In [None]:
import pandas as pd
import os
import re

# --- Configuration ---
# Input CSV read by pd.read_csv in the loading step below. The path is
# relative, so it is resolved against the current working directory.
raw_csv_path = '../data/raw/telegram_messages.csv'

# Name of the column in the raw CSV whose values are passed to
# clean_amharic_text to build the 'Cleaned_Message' column.
original_message_column_name = 'text'

# Output CSV written by DataFrame.to_csv once 'Cleaned_Message' has been added.
processed_scraped_csv_path = '../data/processed/cleaned_telegram_data.csv'

# --- Define the Cleaning Function ---
# Patterns are compiled once so that every call reuses the same objects.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
# Ethiopic wordspace (U+1361) through paragraph separator (U+1368).
_ETHIOPIC_PUNCT_RE = re.compile(r'[\u1361-\u1368]')
# A capital 'O' that is a whole whitespace-delimited token. The lookarounds
# keep the 'O' of real words ("Original", "HELLO") from being removed.
_STANDALONE_O_RE = re.compile(r'(?<!\S)O(?!\S)')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_amharic_text(text: object) -> str:
    """
    Clean a message by stripping noise tokens and normalizing whitespace.

    The following are removed, in this order:

    1. URLs starting with ``http://``, ``https://`` or ``www.``.
    2. Mentions (``@username``).
    3. Hashtags (``#hashtag``).
    4. Ethiopic punctuation marks U+1361-U+1368 (replaced by a space).
    5. A stray capital ``O`` standing alone as its own token.

    Finally, every run of whitespace is collapsed to a single space and the
    result is stripped.

    Args:
        text: The value to clean. ``None`` and float NaN are treated as
            missing and produce an empty string; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise become the literal text 'None' / 'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)

    # Punctuation is turned into spaces before the standalone-'O' step so
    # that an 'O' directly followed by an Ethiopic mark still counts as a
    # standalone token.
    text = _ETHIOPIC_PUNCT_RE.sub(' ', text)
    text = _STANDALONE_O_RE.sub('', text)

    return _WHITESPACE_RE.sub(' ', text).strip()

# --- Load Raw Data ---
# df_scraped stays None whenever loading fails; the later steps check for that.
df_scraped = None

try:
    print(f"Attempting to load raw data from: {raw_csv_path}")
    df_scraped = pd.read_csv(raw_csv_path, encoding='utf-8')
    print(f"Successfully loaded {len(df_scraped)} rows.")

    # Quick summary of what was loaded: column names and the first rows.
    print(
        "\n--- Loaded DataFrame Info ---",
        f"Columns: {df_scraped.columns.tolist()}",
        f"First 5 rows:\n {df_scraped.head()}",
        "-----------------------------\n",
        sep="\n",
    )

except FileNotFoundError:
    print(
        f"Error: Raw data file not found at '{raw_csv_path}'. Please ensure the file exists.",
        "Exiting data processing. No cleaning or saving will be performed.",
        sep="\n",
    )
except pd.errors.EmptyDataError:
    print(f"Error: The file at '{raw_csv_path}' is empty. No data to process.")
except Exception as load_error:
    # Last-resort handler: report the problem and carry on with df_scraped unset.
    print(
        f"An unexpected error occurred while loading raw data: {load_error}",
        "Exiting data processing. No cleaning or saving will be performed.",
        sep="\n",
    )


# --- Perform Data Cleaning and Create 'Cleaned_Message' Column (if df_scraped loaded) ---
if df_scraped is None or df_scraped.empty:
    print("No data loaded or DataFrame is empty. Skipping cleaning steps.")
elif original_message_column_name not in df_scraped.columns:
    print(f"Error: Original message column '{original_message_column_name}' not found in loaded DataFrame.")
    print("Please check the 'original_message_column_name' variable in the script and your CSV file's headers.")
    print("Cannot perform cleaning or save cleaned data.")
    # Drop the frame so the save step below is skipped.
    df_scraped = None
else:
    print(f"Cleaning '{original_message_column_name}' column and creating 'Cleaned_Message'...")
    # Missing messages become '' before cleaning so they do not turn into 'nan'.
    df_scraped['Cleaned_Message'] = (
        df_scraped[original_message_column_name].fillna('').apply(clean_amharic_text)
    )
    # Report the configured column name rather than a hard-coded 'Message'.
    print(f"Cleaned '{original_message_column_name}' column and created 'Cleaned_Message'.")

    print("\nSample of cleaned messages (Original vs. Cleaned):")
    # The frame is known to be non-empty here, so at least one row is sampled.
    sample_rows = df_scraped.sample(min(5, len(df_scraped)))
    for original_text, cleaned_text in zip(
        sample_rows[original_message_column_name], sample_rows['Cleaned_Message']
    ):
        print(f"Original: {original_text}")
        print(f"Cleaned:  {cleaned_text}\n")


# --- Save Cleaned Data (if cleaning was successful) ---
if df_scraped is not None and 'Cleaned_Message' in df_scraped.columns:
    try:
        # Create the target directory only when there is something to save,
        # and inside the try so a failure here is reported like any other
        # save error. A bare file name has no directory part to create.
        output_dir = os.path.dirname(processed_scraped_csv_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        df_scraped.to_csv(processed_scraped_csv_path, index=False, encoding='utf-8')
        print(f"\nCleaned scraped data saved successfully to: {processed_scraped_csv_path}")
    except Exception as e:
        print(f"\nError: Could not save cleaned data to {processed_scraped_csv_path}. Reason: {e}")
else:
    print("\nSkipping saving of cleaned data because 'df_scraped' is not ready or 'Cleaned_Message' column is missing.")

# --- Final Status Update (regardless of data processing success) ---
# NOTE(review): 'labeled_sentences' is not defined anywhere in the code visible
# here, and these lines print even when loading or cleaning failed. Confirm
# that the message still matches the rest of the notebook.
print(
    "\nLabeled data is parsed and ready in 'labeled_sentences' variable. (Note: This refers to another part of your script not shown here.)",
    "This parsed labeled data ('labeled_sentences') is prepared for further analysis.",
    sep="\n",
)

Attempting to load raw data from: ../data/raw/telegram_messages.csv
Successfully loaded 2359 rows.

--- Loaded DataFrame Info ---
Columns: ['channel', 'message_id', 'sender_id', 'timestamp', 'text']
First 5 rows:
                   channel  message_id      sender_id  \
0  ethio_brand_collection        6117 -1001149977975   
1  ethio_brand_collection        6116 -1001149977975   
2  ethio_brand_collection        6115 -1001149977975   
3  ethio_brand_collection        6113 -1001149977975   
4  ethio_brand_collection        6112 -1001149977975   

                   timestamp  \
0  2025-06-22T06:27:39+00:00   
1  2025-06-16T09:01:34+00:00   
2  2025-06-15T09:20:06+00:00   
3  2025-06-14T09:04:17+00:00   
4  2025-06-14T06:40:06+00:00   

                                                text  
0  ‼️ እሁድ ሁሌም ክፍት ነን ‼️  Reebok Club Vintage    s...  
1  Skechers archfit  size 40,41,42,43 Price 3400 ...  
2  ‼️ እሁድ ሁሌም ክፍት ነን ‼️  NB 04 leather   Size 39,...  
3  Nike Air Force Paisley  Size 40,4