In [None]:
import ast
import re

import pandas as pd
from transformers import AutoTokenizer
from transformers import XLMRobertaTokenizer, XLMRobertaModel

# File locations: the input CSV comes from the previous pipeline step and the
# output CSV is the structured file written by this cell.
input_csv = '../data/telegram_with_images.csv'
output_csv = '../data/preprocessed_telegram_data3.csv'

# Tokenizer used for the Amharic text. An earlier attempt with the
# "Davlan/afro-xlmr-bert-base" id was disabled in favour of this model id.
model_name = "rasyosef/roberta-base-amharic"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Amharic text cleaner
def clean_amharic_text(text):
    """Strip everything except Ethiopic script, ASCII digits and basic punctuation.

    Args:
        text: Raw message text. Any non-``str`` value (for example the float
            NaN pandas uses for a missing cell) is treated as missing.

    Returns:
        The cleaned text, with every run of whitespace collapsed to a single
        space and no leading/trailing whitespace; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Keep the Ethiopic block (U+1200-U+137F, which already contains the
    # Ethiopic digits and punctuation), ASCII digits, ". , ! ? @ /" and
    # whitespace. Every other character (emoji, Latin letters, symbols) is
    # replaced by a space instead of being deleted, so two words that were
    # separated only by such a character are not fused into one token.
    text = re.sub(r"[^\u1200-\u137F0-9.,!?@/\s]", " ", text)
    # Collapse whitespace runs, including the spaces introduced above.
    return re.sub(r"\s+", " ", text).strip()

# Tokenize text using the tokenizer loaded above (model_name: rasyosef/roberta-base-amharic)
def tokenize_amharic_text(text):
    """Tokenize ``text`` with the module-level ``tokenizer``.

    Returns whatever ``tokenizer.tokenize(text)`` returns.
    """
    pieces = tokenizer.tokenize(text)
    return pieces

# Read the messages produced by the previous step
df = pd.read_csv(input_csv)

# Derive the cleaned text first, then tokenize that cleaned text
cleaned_series = df['text'].apply(clean_amharic_text)
df['clean_text'] = cleaned_series
df['tokens'] = cleaned_series.apply(tokenize_amharic_text)

# Project down to the structured columns that get exported
export_columns = ['channel', 'timestamp', 'clean_text', 'tokens', 'image_path']
structured_df = df[export_columns]

# Write the CSV without the index column, encoded as 'utf-8-sig'
structured_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
print(f"✅ Preprocessed and saved structured data to: {output_csv}")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


✅ Preprocessed and saved structured data to: ../data/preprocessed_telegram_data4.csv


In [2]:

# Load the cleaned + tokenized data.
# NOTE(review): this reads "preprocessed_telegram_data.csv", while the
# preprocessing cell writes to `output_csv`
# ('../data/preprocessed_telegram_data3.csv') - confirm this is the intended file.
df = pd.read_csv("../data/preprocessed_telegram_data.csv")

# Sample up to 30 messages with a fixed seed. The size is capped at len(df)
# because DataFrame.sample raises ValueError when n exceeds the row count
# (sampling without replacement).
sampled_df = df.sample(n=min(30, len(df)), random_state=42)

# Output file in CoNLL format
output_path = "amharic_ner_sample.conll"

# Write one "<token> O" line per token and a blank line after each message.
with open(output_path, 'w', encoding='utf-8') as f:
    for _, row in sampled_df.iterrows():
        # The token list is read back from the CSV as a string; literal_eval
        # parses that list literal without executing arbitrary code the way
        # eval() would.
        tokens = ast.literal_eval(row['tokens'])
        for token in tokens:
            f.write(f"{token} O\n")  # Default label "O" (to be manually changed)
        f.write("\n")  # Blank line to separate messages

# Report the number of messages actually sampled rather than a fixed 30.
print(f"✅ Sample file created: {output_path} ({len(sampled_df)} messages, ready for labeling)")


✅ Sample file created: amharic_ner_sample.conll (30 messages, ready for labeling)
