In [1]:
import pandas as pd
import re
import asyncio
import os
from dotenv import load_dotenv
from telethon.sync import TelegramClient
from telethon.tl.functions.messages import GetHistoryRequest
from telethon.errors.rpcerrorlist import ChannelInvalidError, ChannelPrivateError
import nest_asyncio

# Apply nest_asyncio to allow running asyncio event loops within a Jupyter notebook
nest_asyncio.apply()

In [8]:
print("Loading credentials from .env file...")
load_dotenv()

# Read the three Telegram credentials from the process environment in one pass.
api_id, api_hash, phone = (
    os.getenv(env_key) for env_key in ("API_ID", "API_HASH", "PHONE_NUMBER")
)

# Fail fast if any credential is missing or empty, before touching the network.
if not (api_id and api_hash and phone):
    raise ValueError("API_ID, API_HASH, or PHONE_NUMBER not found. Make sure you have a .env file in the project root with the correct values.")

print("Credentials loaded successfully.")


Loading credentials from .env file...
Credentials loaded successfully.


In [9]:
# Public usernames (without the leading '@') of the Telegram channels to scrape,
# processed in the order listed.
channels_to_scrape = [
    "ZemenExpress",
    "nevacomputer",
    "qnashcom",
    "helloomarketethiopia",
    "modernshoppingcenter",
]


In [10]:
async def fetch_messages(client, channel_identifier, limit=300):
    """
    Asynchronously fetch recent text messages from a single Telegram channel.

    Args:
        client: Client object exposing an awaitable ``get_entity`` and an
            async-iterable ``iter_messages`` (a connected Telethon client).
        channel_identifier: Channel username or integer ID to resolve.
        limit: Maximum number of messages requested from the channel history.
            Messages without text are skipped, so fewer rows may come back.

    Returns:
        A list of dicts with the keys ``channel_name``, ``message_id``,
        ``timestamp``, ``text_original``, ``views`` and ``has_image``.
        If the channel cannot be read for any reason, the error is printed
        (not raised) and an empty list is returned.
    """
    all_messages = []
    # Fallback label used in logs/rows until (and unless) a username is resolved.
    channel_name = str(channel_identifier)

    try:
        # This works whether you provide a username or an integer ID
        channel_entity = await client.get_entity(channel_identifier)

        # getattr's default only applies when the attribute is *missing*; an
        # entity can carry username=None, which would otherwise end up as the
        # channel name in every row. Resolve it once, falling back explicitly.
        channel_name = getattr(channel_entity, 'username', None) or str(channel_identifier)

        async for message in client.iter_messages(channel_entity, limit=limit):
            # We only care about messages with text content
            if not message.text:
                continue
            all_messages.append({
                'channel_name': channel_name,
                'message_id': message.id,
                'timestamp': message.date,
                'text_original': message.text,
                'views': message.views or 0,  # missing/None view counts become 0
                'has_image': message.photo is not None
            })

        print(f"Successfully scraped {len(all_messages)} messages from @{channel_name}.")
        return all_messages

    except (ChannelInvalidError, ValueError):
        print(f"Error: Channel '@{channel_name}' not found or is invalid. Skipping.")
        return []
    except ChannelPrivateError:
        print(f"Error: Channel '@{channel_name}' is private. You must join it first. Skipping.")
        return []
    except Exception as e:
        # Best-effort scraping: one failing channel must not abort the whole run.
        print(f"An unexpected error occurred with @{channel_name}: {e}")
        return []

In [11]:
async def main():
    """Scrape every configured channel and save the combined raw data.

    Opens a Telethon client using the notebook-level ``api_id`` and
    ``api_hash``, calls ``fetch_messages`` for each entry of
    ``channels_to_scrape`` in order, and writes the combined rows to
    ``../data/scraped_data.csv``.

    Returns:
        pandas.DataFrame: one row per scraped message. An empty DataFrame is
        returned (and no file is written) when nothing was collected.
    """
    raw_csv_path = '../data/scraped_data.csv'
    all_scraped_data = []

    # The 'async with' block handles connecting and disconnecting automatically.
    # 'session_name' names the Telethon session used to persist the login.
    async with TelegramClient('session_name', api_id, api_hash) as client:
        print("Client created successfully. Starting to scrape channels...")
        for channel in channels_to_scrape:
            print(f"--- Processing channel: {channel} ---")
            channel_data = await fetch_messages(client, channel, limit=300) # Fetch up to 300 messages per channel
            all_scraped_data.extend(channel_data)

    if not all_scraped_data:
        print("\nScraping finished, but no data was collected. Please check your channel IDs/usernames and network.")
        return pd.DataFrame() # Return an empty dataframe

    df = pd.DataFrame(all_scraped_data)

    # Make sure the output directory exists first: to_csv does not create it,
    # and failing here would throw away everything that was just scraped.
    os.makedirs(os.path.dirname(raw_csv_path), exist_ok=True)

    # Save the raw scraped data before any cleaning
    df.to_csv(raw_csv_path, index=False, encoding='utf-8-sig')
    print(f"\nScraping complete. Collected a total of {len(df)} messages.")
    print("Raw data saved to 'data/scraped_data.csv'")
    return df


In [12]:
# --- Step 6: Run the Scraper ---
def clean_text(text):
    """Clean a raw Telegram message (Amharic text) for NER.

    Strips URLs / Telegram links, @mentions and #hashtags, a fixed set of
    decorative characters and emojis, then collapses every run of whitespace
    (including newlines) into a single space.

    Args:
        text: The raw message text. Any non-string value yields "".

    Returns:
        str: The cleaned, whitespace-normalized text.
    """
    if not isinstance(text, str):
        return ""
    # Remove URLs and Telegram links
    text = re.sub(r'http\S+|www\S+|t\.me/\S+', '', text, flags=re.MULTILINE)
    # Remove user mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Remove specific decorative characters and common emojis
    text = re.sub(r'[💥📌💵✅👉📍📞☎️👇✨✔®©™❤🔥]', '', text)
    # Replace multiple newlines/whitespace with a single space
    text = re.sub(r'[\n\r\s]+', ' ', text).strip()
    return text


print("Starting the scraping process...")
# Note: The first time you run this, Telethon will ask for your phone number,
# a login code sent to your Telegram app, and possibly your 2FA password.
df_scraped = asyncio.run(main())

if not df_scraped.empty:
    print("\n--- Starting Data Preprocessing ---")

    print("Cleaning text data...")
    df_scraped['text_cleaned'] = df_scraped['text_original'].apply(clean_text)

    # Save the final preprocessed data
    df_scraped.to_csv('../data/preprocessed_data.csv', index=False, encoding='utf-8-sig')
    print("Preprocessing complete. Cleaned data saved to 'data/preprocessed_data.csv'")

    print("\n--- Preprocessed DataFrame Sample ---")
    # Display a sample from different parts of the dataframe to see the variety
    preview = df_scraped[['channel_name', 'text_cleaned', 'views']]
    print("Top 5 rows:")
    print(preview.head())
    print("\nRandom 5 rows:")
    # sample(n) raises ValueError when n exceeds the row count, so cap it for
    # small scrapes (e.g. a single channel with only a few text messages).
    print(preview.sample(min(5, len(preview))))
    print("\nLast 5 rows:")
    print(preview.tail())
else:
    print("\nSkipping preprocessing because no data was scraped.")

Starting the scraping process...


Attempt 1 at connecting failed: TimeoutError: 


Signed in successfully as Desta Getaw; remember to not break the ToS or you will risk an account ban!
Client created successfully. Starting to scrape channels...
--- Processing channel: ZemenExpress ---
Successfully scraped 117 messages from @ZemenExpress.
--- Processing channel: nevacomputer ---
Successfully scraped 92 messages from @nevacomputer.
--- Processing channel: qnashcom ---
Successfully scraped 231 messages from @qnashcom.
--- Processing channel: helloomarketethiopia ---
Successfully scraped 276 messages from @helloomarketethiopia.
--- Processing channel: modernshoppingcenter ---
Successfully scraped 80 messages from @modernshoppingcenter.

Scraping complete. Collected a total of 796 messages.
Raw data saved to 'data/scraped_data.csv'

--- Starting Data Preprocessing ---
Cleaning text data...
Preprocessing complete. Cleaned data saved to 'data/preprocessed_data.csv'

--- Preprocessed DataFrame Sample ---
Top 5 rows:
   channel_name                                       text_