In [None]:
# Notebook 2: Data Labeling (CoNLL Format)
# ========================================

# Install necessary packages
!pip install pandas

import pandas as pd
import os

# ==============================
# 1. Load Processed Data
# ==============================
# Read the processed message snapshot, then report its size and first rows.
snapshot_path = "data/processed/messages_snapshot.csv"
df = pd.read_csv(snapshot_path)
print(f"Total messages: {len(df)}")
print(df.head(5))

# ==============================
# 2. Sample Messages for Labeling
# ==============================
# Label at least 30-50 messages manually for initial training.
# Missing texts are dropped first because a NaN value cannot be tokenized
# with str.split(), and the sample size is capped at the number of available
# rows because Series.sample raises ValueError when asked for more rows than
# exist (sampling is without replacement by default).
SAMPLE_SIZE = 50
labelable_texts = df["clean_text"].dropna()
sample_messages = labelable_texts.sample(
    n=min(SAMPLE_SIZE, len(labelable_texts)),
    random_state=42,  # fixed seed so the same messages are picked on every run
).tolist()

# ==============================
# 3. CoNLL Format Template
# ==============================
# Example entity types:
# B-Product, I-Product, B-LOC, I-LOC, B-PRICE, I-PRICE, O

def generate_conll(message, entities=None):
    """
    Convert a message to CoNLL format.

    The message is tokenized on whitespace and every token is emitted on its
    own line as ``"<token> <label>"``. The record is terminated by a blank
    line so that records written one after another stay separated.

    Args:
        message: The text to convert.
        entities: Optional dict ``{token_index: label}``, e.g.
            ``{0: "B-Product", 1: "I-Product"}``. Tokens whose index is not
            in the dict are labeled ``"O"``. Defaults to no labels.

    Returns:
        The CoNLL-formatted record ending in a blank line, or an empty string
        when the message contains no tokens.
    """
    # Use None as the default instead of a shared mutable dict.
    labels = {} if entities is None else entities
    tokens = message.split()
    if not tokens:
        return ""
    lines = [f"{token} {labels.get(idx, 'O')}" for idx, token in enumerate(tokens)]
    # Two trailing newlines: one ends the last token line, the other forms the
    # blank line that separates this record from the next one.
    return "\n".join(lines) + "\n\n"

# ==============================
# 4. Example Manual Labeling
# ==============================
# Example message: "ሙሉ ቁርስ በ200 ብር በአዲስ አበባ"
message = "ሙሉ ቁርስ በ200 ብር በአዲስ አበባ"
# Token index mapping for labels. The message splits into six tokens
# (indices 0-5), so every index below must stay within that range:
#   0 ሙሉ, 1 ቁርስ   -> product
#   2 በ200, 3 ብር  -> price
#   4 በአዲስ, 5 አበባ -> location
entities = {
    0: "B-Product",
    1: "I-Product",
    2: "B-PRICE",
    3: "I-PRICE",
    4: "B-LOC",
    5: "I-LOC",
}

conll_output = generate_conll(message, entities)
print(conll_output)

# ==============================
# 5. Export Labeled Messages to CoNLL File
# ==============================
os.makedirs("data/labeled", exist_ok=True)
with open("data/labeled/ner_train.conll", "w", encoding="utf-8") as f:
    for msg in sample_messages:
        # Skip entries that cannot be tokenized (e.g. NaN from a missing
        # text) or that contain no tokens at all.
        if not isinstance(msg, str) or not msg.strip():
            continue
        # Initially label everything as "O" (to be corrected manually)
        record = generate_conll(msg)
        # Terminate every record with exactly one blank line so that
        # consecutive messages stay separated in the output file.
        f.write(record.rstrip("\n") + "\n\n")
