In [4]:
# %% [markdown]
"""
# Amharic Text NER Annotation (CoNLL Format)
### Notebook for processing product listings with entity recognition

**File Structure:**
- Data: `../data/preprocessed_data.csv`
- Script: `../scripts/ner_conll.py`
- Output: `../data/annotations/`
"""

# %%
# === SETUP AND CONFIGURATION ===
#%pip install ipywidgets tqdm pandas
import sys
import os
from pathlib import Path
import importlib.util
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import display, HTML

# Set up paths
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir
scripts_dir = project_root / "scripts"
data_dir = project_root / "data"

# Add to Python path
if str(scripts_dir) not in sys.path:
    sys.path.insert(0, str(scripts_dir))

# Verify paths
print("PROJECT PATHS:")
print(f"Root: {project_root}")
print(f"Scripts: {scripts_dir}")
print(f"Data: {data_dir}\n")

# %%
# === MODULE IMPORT ===
try:
    from ner_conll import CoNLLAnnotator, load_data, save_annotations
    print("✅ Successfully imported ner_conll module")
except ImportError as e:
    print(f"❌ Import failed: {e}")
    
    # Fallback: Direct import
    print("Attempting direct import...")
    try:
        spec = importlib.util.spec_from_file_location(
            "ner_conll", 
            scripts_dir / "ner_conll.py"
        )
        ner_conll = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(ner_conll)
        from ner_conll import CoNLLAnnotator, load_data, save_annotations
        print("✅ Fallback import successful")
    except Exception as e:
        print(f"❌ Failed to import module: {e}")
        raise

# %%
# === DATA LOADING ===
data_path = data_dir / "preprocessed_data.csv"

try:
    df = load_data(data_path)
    print(f"✅ Loaded {len(df)} records from {data_path}")
    if len(df) > 0:
        display(df[['message_id', 'Message']].head(3))
    else:
        print("⚠️ Warning: Loaded empty DataFrame")
except Exception as e:
    print(f"❌ Failed to load data: {e}")
    print("Available data files:")
    available_files = list(data_dir.glob("*.csv"))
    if available_files:
        print("\n".join(str(f) for f in available_files))
    else:
        print("No CSV files found in data directory")
    raise

# %%
# === INITIALIZE ANNOTATOR ===
try:
    annotator = CoNLLAnnotator()
    print("✅ Annotator initialized successfully")
    
    # Test annotation
    if len(df) > 0:
        sample_text = df.iloc[0]['Message']
        test_annotation = annotator.annotate(sample_text)
        print("\nSAMPLE ANNOTATION (first 10 tokens):")
        print(test_annotation[:10])
    else:
        print("⚠️ No data available for test annotation")
except Exception as e:
    print(f"❌ Failed to initialize annotator: {e}")
    raise

# %%
# === BATCH PROCESSING ===
# Configuration
SAMPLE_SIZE = min(50, len(df)) if len(df) > 0 else 0  # Handle empty DataFrame
OUTPUT_DIR = data_dir / "annotations"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

if SAMPLE_SIZE == 0:
    print("⚠️ No data available for processing")
else:
    # Select data
    sample_df = df.head(SAMPLE_SIZE).copy()
    print(f"\nProcessing {len(sample_df)} messages...")

    # Process with progress bar
    results = []
    failed_ids = []
    stats = {
        'total_tokens': 0,
        'products': 0,
        'prices': 0,
        'locations': 0
    }

    for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
        try:
            annotated = annotator.annotate(row['Message'])
            conll_text = annotator.to_conll(annotated)
            
            # Update stats
            stats['total_tokens'] += len(annotated)
            for _, label in annotated:
                if label.startswith('B-'):
                    entity_type = label.split('-')[1].lower()
                    if f"{entity_type}s" in stats:
                        stats[f"{entity_type}s"] += 1
            
            results.append({
                'message_id': row['message_id'],
                'original': row['Message'],
                'annotated': annotated,
                'conll': conll_text
            })
        except Exception as e:
            failed_ids.append(row['message_id'])
            print(f"\nError processing message {row['message_id']}: {str(e)}")

    # Processing summary
    print("\nPROCESSING SUMMARY:")
    print(f"- Total messages: {len(sample_df)}")
    print(f"- Successfully processed: {len(results)}")
    print(f"- Failed: {len(failed_ids)}")
    if failed_ids:
        print(f"- Failed message IDs: {failed_ids[:5]}")  # Show first 5 failures

# %%
# === STATISTICS AND VISUALIZATION ===
if len(results) > 0:
    print("\nPROCESSING STATISTICS:")
    print(f"- Total tokens: {stats['total_tokens']}")
    print(f"- Product entities: {stats['products']}")
    print(f"- Price entities: {stats['prices']}")
    print(f"- Location entities: {stats['locations']}")

    # Visualization function
    def show_annotation(annotation):
        html = []
        colors = {
            'B-PRODUCT': '#a6dba0',
            'I-PRODUCT': '#c2e6bd',
            'B-PRICE': '#f4a582',
            'I-PRICE': '#fddbc7',
            'B-LOC': '#92c5de',
            'I-LOC': '#d1e5f0'
        }
        
        for token, label in annotation:
            if label != 'O':
                html.append(f'<span style="background-color:{colors[label]}" title="{label}">{token}</span>')
            else:
                html.append(token)
        
        display(HTML(' '.join(html) + '<br><br>' + 
              '<small>Legend: ' + 
              ' '.join([f'<span style="background-color:{color}">{label}</span>' 
                        for label, color in colors.items()]) + 
              '</small>'))

    # Show multiple samples
    def show_samples(results, num_samples=3):
        print(f"\nSHOWING {min(num_samples, len(results))} SAMPLE ANNOTATIONS:")
        for i in range(min(num_samples, len(results))):
            print(f"\nSample {i+1} (Message ID: {results[i]['message_id']}):")
            show_annotation(results[i]['annotated'])

    show_samples(results)
else:
    print("\n⚠️ No results available for visualization")

# %%
# === SAVE OUTPUTS ===
# %%
# === SAVE OUTPUTS ===
if len(results) > 0:
    # Save CONLL file with updated path and filename
    conll_path = data_dir / "clean_processed.conll"  # Changed path and filename
    try:
        with open(conll_path, 'w', encoding='utf-8') as f:
            for result in results:
                f.write(result['conll'] + '\n\n')
        print(f"\n✅ Saved CONLL annotations to {conll_path}")
    except Exception as e:
        print(f"❌ Failed to save CONLL file: {e}")
        # Create error log with failed entries
        error_log_path = data_dir / "conll_errors.log"
        with open(error_log_path, 'w', encoding='utf-8') as f:
            f.write(f"Error saving {conll_path}: {e}\n")
            f.write("Failed entries:\n")
            for result in results:
                try:
                    f.write(result['conll'] + '\n\n')
                except:
                    f.write(f"Could not write entry {result.get('message_id', 'unknown')}\n")
        print(f"⚠️ Error log saved to {error_log_path}")

    # Rest of your saving code remains the same...
    # Save CSV report
    try:
        report_df = pd.DataFrame([{
            'message_id': r['message_id'],
            'product_entities': sum(1 for _, label in r['annotated'] if 'PRODUCT' in label),
            'price_entities': sum(1 for _, label in r['annotated'] if 'PRICE' in label),
            'location_entities': sum(1 for _, label in r['annotated'] if 'LOC' in label)
        } for r in results])

        csv_path = OUTPUT_DIR / "annotation_report.csv"
        report_df.to_csv(csv_path, index=False)
        print(f"✅ Saved annotation report to {csv_path}")
        
        # Error analysis
        print("\nENTITY DISTRIBUTION:")
        print(report_df.describe())

        # Find messages with no entities
        zero_entity_msgs = report_df[
            (report_df['product_entities'] == 0) & 
            (report_df['price_entities'] == 0) & 
            (report_df['location_entities'] == 0)
        ]
        
        if not zero_entity_msgs.empty:
            print(f"\n⚠️ Found {len(zero_entity_msgs)} messages with no entities:")
            display(sample_df[sample_df['message_id'].isin(zero_entity_msgs['message_id'])])
    except Exception as e:
        print(f"❌ Failed to save report: {e}")
else:
    print("\n⚠️ No results to save")

PROJECT PATHS:
Root: d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce
Scripts: d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\scripts
Data: d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data

✅ Successfully imported ner_conll module
✅ Loaded 796 records from d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data\preprocessed_data.csv


Unnamed: 0,message_id,Message
0,6991,💥💥...................................💥💥\n\n📌Sa...
1,6987,💥💥...................................💥💥\n\n3pc...
2,6986,💥💥...................................💥💥\n\n3pc...


✅ Annotator initialized successfully

SAMPLE ANNOTATION (first 10 tokens):
[('Saachi', 'O'), ('Electric', 'O'), ('Kettle', 'O'), ('Borosilicate', 'O'), ('Glass', 'O'), ('Body', 'O'), ('Overheat', 'O'), ('protection', 'O'), ('Automatic', 'O'), ('switch', 'O')]

Processing 50 messages...


  0%|          | 0/50 [00:00<?, ?it/s]


PROCESSING SUMMARY:
- Total messages: 50
- Successfully processed: 50
- Failed: 0

PROCESSING STATISTICS:
- Total tokens: 2667
- Product entities: 26
- Price entities: 96
- Location entities: 0

SHOWING 3 SAMPLE ANNOTATIONS:

Sample 1 (Message ID: 6991):



Sample 2 (Message ID: 6987):



Sample 3 (Message ID: 6986):



✅ Saved CONLL annotations to d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data\clean_processed.conll
✅ Saved annotation report to d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data\annotations\annotation_report.csv

ENTITY DISTRIBUTION:
        message_id  product_entities  price_entities  location_entities
count    50.000000         50.000000       50.000000               50.0
mean   6925.600000          1.240000        4.500000                3.0
std      40.807662          6.136409        1.035098                0.0
min    6861.000000          0.000000        0.000000                3.0
25%    6893.000000          0.000000        4.000000                3.0
50%    6926.000000          0.000000        5.000000                3.0
75%    6963.750000          0.000000        5.000000                3.0
max    6991.000000         31.000000        5.000000                3.0
