In [4]:
# %% [markdown]
"""
# Amharic Text NER Annotation (CoNLL Format)
### Notebook for processing product listings with entity recognition

**File Structure:**
- Data: `../data/preprocessed_data.csv`
- Script: `../scripts/ner_conll.py`
- Output: `../data/annotations/`
"""

# %%
# === SETUP AND CONFIGURATION ===
#%pip install ipywidgets tqdm pandas
import sys
import os
from pathlib import Path
import importlib.util
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import display, HTML

# Set up paths
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir
scripts_dir = project_root / "scripts"
data_dir = project_root / "data"

# Add to Python path
if str(scripts_dir) not in sys.path:
    sys.path.insert(0, str(scripts_dir))

# Verify paths
print("PROJECT PATHS:")
print(f"Root: {project_root}")
print(f"Scripts: {scripts_dir}")
print(f"Data: {data_dir}\n")

# %%
# === MODULE IMPORT ===
try:
    from ner_conll import CoNLLAnnotator, load_data, save_annotations
    print("✅ Successfully imported ner_conll module")
except ImportError as e:
    print(f"❌ Import failed: {e}")
    
    # Fallback: Direct import
    print("Attempting direct import...")
    try:
        spec = importlib.util.spec_from_file_location(
            "ner_conll", 
            scripts_dir / "ner_conll.py"
        )
        ner_conll = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(ner_conll)
        from ner_conll import CoNLLAnnotator, load_data, save_annotations
        print("✅ Fallback import successful")
    except Exception as e:
        print(f"❌ Failed to import module: {e}")
        raise

# %%
# === DATA LOADING ===
data_path = data_dir / "preprocessed_data.csv"

try:
    df = load_data(data_path)
    print(f"✅ Loaded {len(df)} records from {data_path}")
    if len(df) > 0:
        display(df[['message_id', 'Message']].head(3))
    else:
        print("⚠️ Warning: Loaded empty DataFrame")
except Exception as e:
    print(f"❌ Failed to load data: {e}")
    print("Available data files:")
    available_files = list(data_dir.glob("*.csv"))
    if available_files:
        print("\n".join(str(f) for f in available_files))
    else:
        print("No CSV files found in data directory")
    raise

# %%
# === INITIALIZE ANNOTATOR ===
try:
    annotator = CoNLLAnnotator()
    print("✅ Annotator initialized successfully")
    
    # Test annotation
    if len(df) > 0:
        sample_text = df.iloc[0]['Message']
        test_annotation = annotator.annotate(sample_text)
        print("\nSAMPLE ANNOTATION (first 10 tokens):")
        print(test_annotation[:10])
    else:
        print("⚠️ No data available for test annotation")
except Exception as e:
    print(f"❌ Failed to initialize annotator: {e}")
    raise

# %%
# === BATCH PROCESSING ===
# Configuration
SAMPLE_SIZE = min(50, len(df)) if len(df) > 0 else 0  # Handle empty DataFrame
OUTPUT_DIR = data_dir / "annotations"
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

if SAMPLE_SIZE == 0:
    print("⚠️ No data available for processing")
else:
    # Select data
    sample_df = df.head(SAMPLE_SIZE).copy()
    print(f"\nProcessing {len(sample_df)} messages...")

    # Process with progress bar
    results = []
    failed_ids = []
    stats = {
        'total_tokens': 0,
        'products': 0,
        'prices': 0,
        'locations': 0
    }

    for idx, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
        try:
            annotated = annotator.annotate(row['Message'])
            conll_text = annotator.to_conll(annotated)
            
            # Update stats
            stats['total_tokens'] += len(annotated)
            for _, label in annotated:
                if label.startswith('B-'):
                    entity_type = label.split('-')[1].lower()
                    if f"{entity_type}s" in stats:
                        stats[f"{entity_type}s"] += 1
            
            results.append({
                'message_id': row['message_id'],
                'original': row['Message'],
                'annotated': annotated,
                'conll': conll_text
            })
        except Exception as e:
            failed_ids.append(row['message_id'])
            print(f"\nError processing message {row['message_id']}: {str(e)}")

    # Processing summary
    print("\nPROCESSING SUMMARY:")
    print(f"- Total messages: {len(sample_df)}")
    print(f"- Successfully processed: {len(results)}")
    print(f"- Failed: {len(failed_ids)}")
    if failed_ids:
        print(f"- Failed message IDs: {failed_ids[:5]}")  # Show first 5 failures

# %%
# === STATISTICS AND VISUALIZATION ===
if len(results) > 0:
    print("\nPROCESSING STATISTICS:")
    print(f"- Total tokens: {stats['total_tokens']}")
    print(f"- Product entities: {stats['products']}")
    print(f"- Price entities: {stats['prices']}")
    print(f"- Location entities: {stats['locations']}")

    # Visualization function
    def show_annotation(annotation):
        html = []
        colors = {
            'B-PRODUCT': '#a6dba0',
            'I-PRODUCT': '#c2e6bd',
            'B-PRICE': '#f4a582',
            'I-PRICE': '#fddbc7',
            'B-LOC': '#92c5de',
            'I-LOC': '#d1e5f0'
        }
        
        for token, label in annotation:
            if label != 'O':
                html.append(f'<span style="background-color:{colors[label]}" title="{label}">{token}</span>')
            else:
                html.append(token)
        
        display(HTML(' '.join(html) + '<br><br>' + 
              '<small>Legend: ' + 
              ' '.join([f'<span style="background-color:{color}">{label}</span>' 
                        for label, color in colors.items()]) + 
              '</small>'))

    # Show multiple samples
    def show_samples(results, num_samples=3):
        print(f"\nSHOWING {min(num_samples, len(results))} SAMPLE ANNOTATIONS:")
        for i in range(min(num_samples, len(results))):
            print(f"\nSample {i+1} (Message ID: {results[i]['message_id']}):")
            show_annotation(results[i]['annotated'])

    show_samples(results)
else:
    print("\n⚠️ No results available for visualization")

# %%
# === SAVE OUTPUTS ===
# %%
# === SAVE OUTPUTS ===
if len(results) > 0:
    # Save CONLL file with updated path and filename
    conll_path = data_dir / "clean_processed.conll"  # Changed path and filename
    try:
        with open(conll_path, 'w', encoding='utf-8') as f:
            for result in results:
                f.write(result['conll'] + '\n\n')
        print(f"\n✅ Saved CONLL annotations to {conll_path}")
    except Exception as e:
        print(f"❌ Failed to save CONLL file: {e}")
        # Create error log with failed entries
        error_log_path = data_dir / "conll_errors.log"
        with open(error_log_path, 'w', encoding='utf-8') as f:
            f.write(f"Error saving {conll_path}: {e}\n")
            f.write("Failed entries:\n")
            for result in results:
                try:
                    f.write(result['conll'] + '\n\n')
                except:
                    f.write(f"Could not write entry {result.get('message_id', 'unknown')}\n")
        print(f"⚠️ Error log saved to {error_log_path}")

    # Rest of your saving code remains the same...
    # Save CSV report
    try:
        report_df = pd.DataFrame([{
            'message_id': r['message_id'],
            'product_entities': sum(1 for _, label in r['annotated'] if 'PRODUCT' in label),
            'price_entities': sum(1 for _, label in r['annotated'] if 'PRICE' in label),
            'location_entities': sum(1 for _, label in r['annotated'] if 'LOC' in label)
        } for r in results])

        csv_path = OUTPUT_DIR / "annotation_report.csv"
        report_df.to_csv(csv_path, index=False)
        print(f"✅ Saved annotation report to {csv_path}")
        
        # Error analysis
        print("\nENTITY DISTRIBUTION:")
        print(report_df.describe())

        # Find messages with no entities
        zero_entity_msgs = report_df[
            (report_df['product_entities'] == 0) & 
            (report_df['price_entities'] == 0) & 
            (report_df['location_entities'] == 0)
        ]
        
        if not zero_entity_msgs.empty:
            print(f"\n⚠️ Found {len(zero_entity_msgs)} messages with no entities:")
            display(sample_df[sample_df['message_id'].isin(zero_entity_msgs['message_id'])])
    except Exception as e:
        print(f"❌ Failed to save report: {e}")
else:
    print("\n⚠️ No results to save")

PROJECT PATHS:
Root: d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce
Scripts: d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\scripts
Data: d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data

✅ Successfully imported ner_conll module
✅ Loaded 796 records from d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data\preprocessed_data.csv


Unnamed: 0,message_id,Message
0,6991,💥💥...................................💥💥\n\n📌Sa...
1,6987,💥💥...................................💥💥\n\n3pc...
2,6986,💥💥...................................💥💥\n\n3pc...


✅ Annotator initialized successfully

SAMPLE ANNOTATION (first 10 tokens):
[('Saachi', 'O'), ('Electric', 'O'), ('Kettle', 'O'), ('Borosilicate', 'O'), ('Glass', 'O'), ('Body', 'O'), ('Overheat', 'O'), ('protection', 'O'), ('Automatic', 'O'), ('switch', 'O')]

Processing 50 messages...


  0%|          | 0/50 [00:00<?, ?it/s]


PROCESSING SUMMARY:
- Total messages: 50
- Successfully processed: 50
- Failed: 0

PROCESSING STATISTICS:
- Total tokens: 2667
- Product entities: 26
- Price entities: 96
- Location entities: 0

SHOWING 3 SAMPLE ANNOTATIONS:

Sample 1 (Message ID: 6991):



Sample 2 (Message ID: 6987):



Sample 3 (Message ID: 6986):



✅ Saved CONLL annotations to d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data\clean_processed.conll
✅ Saved annotation report to d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data\annotations\annotation_report.csv

ENTITY DISTRIBUTION:
        message_id  product_entities  price_entities  location_entities
count    50.000000         50.000000       50.000000               50.0
mean   6925.600000          1.240000        4.500000                3.0
std      40.807662          6.136409        1.035098                0.0
min    6861.000000          0.000000        0.000000                3.0
25%    6893.000000          0.000000        4.000000                3.0
50%    6926.000000          0.000000        5.000000                3.0
75%    6963.750000          0.000000        5.000000                3.0
max    6991.000000         31.000000        5.000000                3.0


In [6]:
df = pd.read_csv('../data/preprocessed_data.csv')


In [7]:
df

Unnamed: 0,channel_name,message_id,timestamp,Message,views,has_image,text_cleaned
0,ZemenExpress,6991,2025-06-21 16:35:51+00:00,💥💥...................................💥💥\n\n📌Sa...,1272,True,................................... Saachi Ele...
1,ZemenExpress,6987,2025-06-21 08:07:31+00:00,💥💥...................................💥💥\n\n3pc...,2100,True,................................... 3pcs Bottl...
2,ZemenExpress,6986,2025-06-21 08:07:11+00:00,💥💥...................................💥💥\n\n3pc...,1765,False,................................... 3pcs Bottl...
3,ZemenExpress,6985,2025-06-21 05:42:46+00:00,💥💥...................................💥💥\n\n📌1 ...,1958,True,................................... 1 pairs Sn...
4,ZemenExpress,6983,2025-06-21 05:42:19+00:00,💥💥...................................💥💥\n\n📌1 ...,2094,True,................................... 1 pairs Sn...
...,...,...,...,...,...,...,...
791,modernshoppingcenter,6361,2025-05-22 10:05:57+00:00,**ቴሌግራም****🫐**** **t.me/modernshoppingcenter**...,16083,False,"**ቴሌግራም****🫐**** ** ""በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን"" **..."
792,modernshoppingcenter,6358,2025-05-22 06:05:40+00:00,**ቴሌግራም****⭐️**** ****t.me/modernshoppingcente...,14663,True,"**ቴሌግራም****⭐**** **** ""በአዲስ ነገር ሁሌም ቀዳሚዎች ነን"" ..."
793,modernshoppingcenter,6354,2025-05-21 16:04:32+00:00,**ቴሌግራም****⭐️**** ****t.me/modernshoppingcente...,15947,True,"**ቴሌግራም****⭐**** **** ""በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን"" ..."
794,modernshoppingcenter,6352,2025-05-21 10:04:59+00:00,**ቴሌግራም****🍀**** ****t.me/modernshoppingcenter...,16289,True,"**ቴሌግራም****🍀**** **** ""በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን"" ..."


In [8]:
import re

def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002700-\U000027BF"  # Dingbats
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U00002600-\U000026FF"  # Misc symbols
        "\U00002B50-\U00002B55"
        "\U0000231A-\U0000231B"
        "\U0001FA70-\U0001FAFF"
        "\U0001F700-\U0001F77F"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

# Remove emojis from 'Message' and 'text_cleaned' columns
df['Message_no_emoji'] = df['Message'].apply(remove_emojis)
df['text_cleaned_no_emoji'] = df['text_cleaned'].apply(remove_emojis)

# Show a sample
df[['Message', 'Message_no_emoji', 'text_cleaned', 'text_cleaned_no_emoji']].head()

Unnamed: 0,Message,Message_no_emoji,text_cleaned,text_cleaned_no_emoji
0,💥💥...................................💥💥\n\n📌Sa...,...................................\n\nSaachi ...,................................... Saachi Ele...,................................... Saachi Ele...
1,💥💥...................................💥💥\n\n3pc...,...................................\n\n3pcs Bo...,................................... 3pcs Bottl...,................................... 3pcs Bottl...
2,💥💥...................................💥💥\n\n3pc...,...................................\n\n3pcs Bo...,................................... 3pcs Bottl...,................................... 3pcs Bottl...
3,💥💥...................................💥💥\n\n📌1 ...,...................................\n\n1 pairs...,................................... 1 pairs Sn...,................................... 1 pairs Sn...
4,💥💥...................................💥💥\n\n📌1 ...,...................................\n\n1 pairs...,................................... 1 pairs Sn...,................................... 1 pairs Sn...


In [9]:
# Save the cleaned text to CSV
df[['text_cleaned']].to_csv(data_dir / 'clean_data.csv', index=False, encoding='utf-8')
print(f"✅ Saved cleaned text to {data_dir / 'clean_data.csv'}")

# Manual annotation for CoNLL format (example for 30 messages)
# We'll use the existing annotation logic if available, otherwise, a placeholder for manual annotation

# If you have an annotator (as in previous cells), use it:
if 'annotator' in globals():
    sample_messages = df.head(50)  # Select first 50 messages
    conll_lines = []
    for _, row in sample_messages.iterrows():
        annotated = annotator.annotate(row['Message'])
        for token, label in annotated:
            conll_lines.append(f"{token}\t{label}")
        conll_lines.append("")  # Blank line between messages

    # Save to file
    conll_path = data_dir / "manual_ner_sample.conll"
    with open(conll_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(conll_lines))
    print(f"✅ Saved CoNLL NER sample to {conll_path}")
else:
    print("❌ No annotator found. Please annotate manually or define an annotator.")

✅ Saved cleaned text to d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data\clean_data.csv
✅ Saved CoNLL NER sample to d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data\manual_ner_sample.conll


In [10]:
# Label 30-50 messages from the "Message" column in CoNLL format using the annotator

# Select 50 messages for annotation (or fewer if not enough data)
num_samples = min(50, len(df))
sample_messages = df.head(num_samples)

conll_lines = []

for _, row in sample_messages.iterrows():
    # Use the annotator to label entities in the message
    annotated = annotator.annotate(row['Message'])
    for token, label in annotated:
        conll_lines.append(f"{token}\t{label}")
    conll_lines.append("")  # Blank line between messages

# Save the annotated data in CoNLL format
conll_output_path = data_dir / "ner_labeled_sample.conll"
with open(conll_output_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(conll_lines))

print(f"✅ Labeled {num_samples} messages and saved to {conll_output_path}")

✅ Labeled 50 messages and saved to d:\Kifiya AI Master Training Program 5 6 &7\week-4\EthioMart-Amharic-E-commerce\data\ner_labeled_sample.conll


In [13]:
# Display the contents of the clean_processed.conll file
conll_path = data_dir / "clean_processed.conll"
with open(conll_path, "r", encoding="utf-8") as f:
    content = f.read()
print(content[:2000])  # Display the first 2000 characters for readability

Saachi	O
Electric	O
Kettle	O
Borosilicate	O
Glass	O
Body	O
Overheat	O
protection	O
Automatic	O
switch	O
off	B-PRICE
2200w	I-PRICE
ዋጋ	B-PRICE
2700	I-PRICE
ብር	I-PRICE
ውስን	O
ፍሬ	O
ነው	O
ያለው	O
አድራሻ	B-LOC
መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ	I-LOC
ቢሮ	I-LOC
ቁ	O
S05	O
S06	O
0902660722	O
0928460606	O
በTelegram	O
ለማዘዝ	O
ይጠቀሙ	O
zemencallcenter	O
zemenexpressadmin	O
ለተጨማሪ	O
ማብራሪያ	O
የቴሌግራም	O
ገፃችን	O
https	O
telegram	O
me	O
zemenexpress	O

3pcs	O
Bottle	O
Stopper	O
በማንኛውም	O
ጠርሙስ	O
ጫፍ	O
የሚገጠም	O
ለዘይት	O
እና	O
መሰል	O
ነገሮች	O
መቀነሻ	O
የሚሆን	O
በአግባቡ	O
እየመጠንን	O
ለመጠቀም	B-PRICE
ተመራጭ	I-PRICE
ዋጋ	B-PRICE
3ፍሬ	I-PRICE
400	O
ብር	O
ውስን	O
ፍሬ	O
ነው	O
ያለው	O
አድራሻ	B-LOC
መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ	I-LOC
ቢሮ	I-LOC
ቁ	O
S05	O
S06	O
0902660722	O
0928460606	O
በTelegram	O
ለማዘዝ	O
ይጠቀሙ	O
zemencallcenter	O
zemenexpressadmin	O
ለተጨማሪ	O
ማብራሪያ	O
የቴሌግራም	O
ገፃችን	O
https	O
telegram	O
me	O
zemenexpress	O

3pcs	O
Bottle	O
Stopper	O
በማንኛውም	O
ጠርሙስ	O
ጫፍ	O
የሚገጠም	O
ለዘይት	O
እና	O
መሰል	O
ነገሮች	O
መቀነሻ	O
የሚሆን	O
በአግባቡ	O
እየመጠንን	O
ለመጠቀም	B-PRICE
ተመራጭ	I-PRICE
ዋጋ	B-PRICE
3ፍሬ	I-PRICE


In [15]:
Message_list = df['Message'].tolist()
Message_list

['💥💥...................................💥💥\n\n📌Saachi Electric Kettle\n\n👍Borosilicate Glass Body\n👍Overheat protection\n👍Automatic switch off\n👍2200w\n\nዋጋ፦\xa0 💲🏷 2700\xa0 ብር ✅\n\n♦️ውስን ፍሬ ነው ያለው🔥🔥🔥\n\n🏢 አድራሻ👉\n\n📍♦️#መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ ቢሮ ቁ. S05/S06\n\n\n\xa0\xa0\xa0\xa0 💧💧💧💧\n\n\n\xa0\xa0\xa0 📲 0902660722\n\xa0\xa0\xa0 📲 0928460606 \n\n🔖\n💬በTelegram ለማዘዝ ⤵️ ይጠቀሙ🔽\n\n@zemencallcenter \n\n@zemenexpressadmin\n\nለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵️\nhttps://telegram.me/zemenexpress\n\n💥💥...................................💥💥',
 '💥💥...................................💥💥\n\n3pcs Bottle Stopper \n\n👍በማንኛውም ጠርሙስ ጫፍ የሚገጠም \n👍ለዘይት እና መሰል ነገሮች መቀነሻ የሚሆን\n👍በአግባቡ እየመጠንን ለመጠቀም ተመራጭ\n\nዋጋ፦\xa0 💲🏷\xa0 3ፍሬ➡️ 400 ብር\n\n♦️ውስን ፍሬ ነው ያለው 🔥🔥🔥\n\n🏢 አድራሻ👉\n\n📍♦️#መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ ቢሮ ቁ. S05/S06\n\n\n\xa0\xa0\xa0\xa0 💧💧💧💧\n\n\n\xa0\xa0\xa0 📲 0902660722\n\xa0\xa0\xa0 📲 0928460606 \n\n🔖\n💬በTelegram ለማዘዝ ⤵️ ይጠቀሙ🔽\n\n@zemencallcenter \n\n@zemenexpressadmin\n\nለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵️\nhttps://telegram.me/zemenex

In [18]:
Message_list = df['text_cleaned_no_emoji'].tolist()
Message_list

['................................... Saachi Electric Kettle Borosilicate Glass Body Overheat protection Automatic switch off 2200w ዋጋ፦  2700 ብር ውስን ፍሬ ነው ያለው  አድራሻ  ቢሮ ቁ. S05/S06   0902660722  0928460606  በTelegram ለማዘዝ ⤵ ይጠቀሙ ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵ ...................................',
 '................................... 3pcs Bottle Stopper በማንኛውም ጠርሙስ ጫፍ የሚገጠም ለዘይት እና መሰል ነገሮች መቀነሻ የሚሆን በአግባቡ እየመጠንን ለመጠቀም ተመራጭ ዋጋ፦  3ፍሬ 400 ብር ውስን ፍሬ ነው ያለው  አድራሻ  ቢሮ ቁ. S05/S06   0902660722  0928460606  በTelegram ለማዘዝ ⤵ ይጠቀሙ ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵ ...................................',
 '................................... 3pcs Bottle Stopper በማንኛውም ጠርሙስ ጫፍ የሚገጠም ለዘይት እና መሰል ነገሮች መቀነሻ የሚሆን በአግባቡ እየመጠንን ለመጠቀም ተመራጭ ዋጋ፦  3ፍሬ 400 ብር ውስን ፍሬ ነው ያለው  አድራሻ  ቢሮ ቁ. S05/S06   0902660722  0928460606  በTelegram ለማዘዝ ⤵ ይጠቀሙ ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵ ...................................',
 '................................... 1 pairs Sneaker Crease Protector ዋጋ፦  400 ብር ውስን ፍሬ ነው ያለው  አድራሻ  ቢሮ ቁ. S05/S06   09026

In [19]:
# Load the cleaned data and display the 'Message_list'
clean_data_path = data_dir / 'clean_data.csv'
clean_df = pd.read_csv(clean_data_path, encoding='utf-8')
Message_list = clean_df['text_cleaned'].tolist()
print(Message_list)

['................................... Saachi Electric Kettle 👍Borosilicate Glass Body 👍Overheat protection 👍Automatic switch off 👍2200w ዋጋ፦ 💲🏷 2700 ብር ♦ውስን ፍሬ ነው ያለው 🏢 አድራሻ ♦ ቢሮ ቁ. S05/S06 💧💧💧💧 📲 0902660722 📲 0928460606 🔖 💬በTelegram ለማዘዝ ⤵ ይጠቀሙ🔽 ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵ ...................................', '................................... 3pcs Bottle Stopper 👍በማንኛውም ጠርሙስ ጫፍ የሚገጠም 👍ለዘይት እና መሰል ነገሮች መቀነሻ የሚሆን 👍በአግባቡ እየመጠንን ለመጠቀም ተመራጭ ዋጋ፦ 💲🏷 3ፍሬ➡ 400 ብር ♦ውስን ፍሬ ነው ያለው 🏢 አድራሻ ♦ ቢሮ ቁ. S05/S06 💧💧💧💧 📲 0902660722 📲 0928460606 🔖 💬በTelegram ለማዘዝ ⤵ ይጠቀሙ🔽 ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵ ...................................', '................................... 3pcs Bottle Stopper 👍በማንኛውም ጠርሙስ ጫፍ የሚገጠም 👍ለዘይት እና መሰል ነገሮች መቀነሻ የሚሆን 👍በአግባቡ እየመጠንን ለመጠቀም ተመራጭ ዋጋ፦ 💲🏷 3ፍሬ➡ 400 ብር ♦ውስን ፍሬ ነው ያለው 🏢 አድራሻ ♦ ቢሮ ቁ. S05/S06 💧💧💧💧 📲 0902660722 📲 0928460606 🔖 💬በTelegram ለማዘዝ ⤵ ይጠቀሙ🔽 ለተጨማሪ ማብራሪያ የቴሌግራም ገፃችን⤵ ...................................', '................................... 1 pairs Sneaker Crease Protector ዋጋ፦

In [20]:
clean_df

Unnamed: 0,text_cleaned
0,................................... Saachi Ele...
1,................................... 3pcs Bottl...
2,................................... 3pcs Bottl...
3,................................... 1 pairs Sn...
4,................................... 1 pairs Sn...
...,...
791,"**ቴሌግራም****🫐**** ** ""በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን"" **..."
792,"**ቴሌግራም****⭐**** **** ""በአዲስ ነገር ሁሌም ቀዳሚዎች ነን"" ..."
793,"**ቴሌግራም****⭐**** **** ""በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን"" ..."
794,"**ቴሌግራም****🍀**** **** ""በአዲስ ነገረ ሁሌም ቀዳሚዏች ነን"" ..."
