In [None]:
# pandas: reading the scraped CSV and writing the cleaned CSV.
import pandas as pd
# re: regex substitutions used by the text-cleaning function.
import re
# os: creating the output directory before the cleaned CSV is saved.
import os
# NOTE(review): defaultdict is not referenced in the cells visible here —
# confirm whether it is still needed before removing it.
from collections import defaultdict

# Simple confirmation that the import cell ran without raising.
print("Libraries imported successfully!")

In [None]:
# Location of the raw scraped CSV, relative to this notebook.
scraped_csv_path = '../data/raw/telegram_data.csv'

# Load the scraped messages and print a quick overview. Failures are reported
# rather than raised, so `df_scraped` stays undefined when loading fails.
try:
    # Use the path variable: the bare, unquoted path that was here before is
    # not valid Python and prevented the whole cell from compiling.
    df_scraped = pd.read_csv(scraped_csv_path, encoding='utf-8')
    print(f"Loaded scraped data from: {scraped_csv_path}")
    print(f"Shape of scraped data: {df_scraped.shape}")
    print("\nFirst 5 rows of scraped data:")
    print(df_scraped.head())
    print("\nColumn information:")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None".
    df_scraped.info()
except FileNotFoundError:
    print(f"Error: The file '{scraped_csv_path}' was not found.")
    print("Please ensure the 'telegram_scraper.py' script was run successfully and created the CSV.")
except Exception as e:
    print(f"An error occurred while loading the scraped CSV: {e}")

In [None]:
def clean_amharic_text(text):
    """Strip noise from a scraped message and normalise its whitespace.

    The following are removed or replaced, in order:
      * URLs (``http...`` / ``www...``), ``@mentions`` and ``#hashtags``;
      * Ethiopic punctuation marks U+1361-U+1368 (replaced by a space);
      * stand-alone ``O`` tokens (an ``O`` surrounded by whitespace or the
        string boundaries). Letters ``O`` that are part of a word are kept.
    Whitespace is collapsed last so that no step can leave double spaces.

    Args:
        text: Raw message text. Any non-string value (e.g. NaN or None) is
            treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""  # Non-string input (e.g. NaN) has nothing to clean.

    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)

    # Punctuation becomes a space first, so an "O" that was only attached to
    # a punctuation mark is recognised as a stand-alone token below.
    text = re.sub(r'[\u1361-\u1368]', ' ', text)

    # Only a whole-token "O" is dropped. The lookarounds require whitespace or
    # a string boundary on both sides, so words such as "Original" or "PRO"
    # are no longer truncated.
    text = re.sub(r'(?<!\S)O(?!\S)', ' ', text)

    # Collapse whitespace last: the substitutions above can introduce runs of
    # spaces that an earlier normalisation would miss.
    return re.sub(r'\s+', ' ', text).strip()

# Clean every scraped message into a new 'Cleaned_Message' column and preview
# a few before/after pairs.
# The existence check keeps this cell from raising NameError when the loading
# cell failed and therefore never bound `df_scraped`.
if 'df_scraped' not in locals():
    print("Error: scraped data was not loaded ('df_scraped' is undefined). Cannot perform cleaning.")
elif 'Message' in df_scraped.columns:
    df_scraped['Cleaned_Message'] = df_scraped['Message'].fillna('').apply(clean_amharic_text)
    print("\nCleaned 'Message' column and created 'Cleaned_Message'.")
    print("\nSample of cleaned messages:")
    # Show at most five randomly chosen rows; the row index is not needed.
    for _, row in df_scraped.sample(min(5, len(df_scraped))).iterrows():
        print(f"Original: {row['Message']}")
        print(f"Cleaned:  {row['Cleaned_Message']}\n")
else:
    print("Error: 'Message' column not found in scraped data DataFrame. Cannot perform cleaning.")

In [None]:
# Location of the labeled CoNLL file, relative to this notebook.
labeled_data_path = '../labeled_telegram_product_price_location.txt'

def parse_conll_file(filepath):
    """Read a CoNLL-style file into a list of labeled sentences.

    Each non-blank line holds a token followed by its label; the label is the
    last whitespace-separated field (spaces or tabs both work) and everything
    before it is the token. Blank or whitespace-only lines end a sentence.

    A line with a single field is kept as ``('', field)``, i.e. an empty
    token carrying that field as its label.

    Args:
        filepath: Path of the file to read. It is decoded as UTF-8; a leading
            byte-order mark, if present, is skipped.

    Returns:
        A list of sentences, each a list of ``(token, label)`` tuples.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist (raised by ``open``).
    """
    sentences = []
    current_sentence = []
    # 'utf-8-sig' reads plain UTF-8 too, but drops a leading BOM that would
    # otherwise be glued to the first token.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Sentence boundary: flush whatever has been collected.
                if current_sentence:
                    sentences.append(current_sentence)
                current_sentence = []
                continue
            # Split once from the right on any whitespace run, so tab- or
            # multi-space-separated columns parse the same as single spaces.
            fields = line.rsplit(None, 1)
            if len(fields) == 2:
                token, label = fields
            else:
                token, label = '', fields[0]
            current_sentence.append((token, label))
    # The file may not end with a blank line; keep the trailing sentence.
    if current_sentence:
        sentences.append(current_sentence)
    return sentences

# Parse the labeled file and preview its first sentence. Any failure is
# reported with a message instead of being raised.
try:
    labeled_sentences = parse_conll_file(labeled_data_path)
    sentence_count = len(labeled_sentences)
    print(f"\nLoaded labeled data from: {labeled_data_path}")
    print(f"Number of labeled sentences: {sentence_count}")
    print("\nFirst labeled sentence example (token, label):")
    if not labeled_sentences:
        print("No sentences found in labeled data.")
    else:
        first_sentence = labeled_sentences[0]
        for token, label in first_sentence:
            print(f"  {token}\t{label}")
except FileNotFoundError:
    print(f"Error: The labeled data file '{labeled_data_path}' was not found.")
    print("Please ensure 'labeled_telegram_product_price_location.txt' is in your project root.")
except Exception as err:
    print(f"An error occurred while parsing the labeled CoNLL file: {err}")

In [None]:
# Destination for the cleaned scraped messages, relative to this notebook.
processed_scraped_csv_path = '../data/processed/cleaned_telegram_data.csv'
os.makedirs(os.path.dirname(processed_scraped_csv_path), exist_ok=True)

# Save only when the earlier cells actually produced a cleaned DataFrame, and
# say so explicitly when they did not.
if 'df_scraped' in locals() and 'Cleaned_Message' in df_scraped.columns:
    df_scraped.to_csv(processed_scraped_csv_path, index=False, encoding='utf-8')
    print(f"\nCleaned scraped data saved to: {processed_scraped_csv_path}")
else:
    print("\nCleaned scraped data was NOT saved: 'df_scraped' with a 'Cleaned_Message' column is not available.")

# Only announce the labeled data as ready when parsing really bound the
# variable; previously this was printed even after a failed parse.
if 'labeled_sentences' in locals():
    print("\nLabeled data is parsed and ready in 'labeled_sentences' variable.")
    print("This parsed labeled data ('labeled_sentences') will be directly used in Task 3 for fine-tuning the NER model.")
else:
    print("\nLabeled data is NOT available: 'labeled_sentences' was not created. Re-run the parsing cell after fixing the reported error.")
