In [1]:
# Import required modules
import os
import sys
import asyncio
import nest_asyncio
import logging
import pandas as pd
from dotenv import load_dotenv
from telethon import TelegramClient

# Make the parent of the notebook's working directory importable
# (presumably where the `scripts` package imported below lives).
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Only add the directory once: re-running this cell must not keep
# growing sys.path with duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Import custom scripts
from scripts.photo_scraper import download_images
from scripts.telegram_data_scraping import scrape_messages

In [3]:
# Apply nest_asyncio to handle event loop issues in Jupyter.
# NOTE(review): presumably required because the notebook kernel already runs
# an event loop while the Telethon coroutines below are awaited — confirm.
nest_asyncio.apply()

In [4]:
# Load environment variables from the .env file so that the os.getenv()
# lookups for API_ID, API_HASH and PHONE_NUMBER can find them.
load_dotenv()

True

In [5]:
# Read the Telegram credentials from the process environment in one pass.
# os.getenv yields None for any variable that is not set.
api_id, api_hash, phone_number = (
    os.getenv(variable) for variable in ('API_ID', 'API_HASH', 'PHONE_NUMBER')
)

In [6]:
# Initialize logging: INFO and above go both to scraping.log and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: log messages can carry non-ASCII text (the scraped
        # channels contain Amharic), which would raise UnicodeEncodeError on
        # platforms whose default file encoding is not UTF-8.
        logging.FileHandler("scraping.log", encoding="utf-8"),  # Save logs to a file
        logging.StreamHandler()  # Output logs to console
    ]
)

In [7]:
# Initialize the Telegram client using environment variables.
# 'session_name' is the session identifier handed to Telethon; api_id and
# api_hash are the values read with os.getenv() and are None when the
# corresponding variables are not set.
client = TelegramClient('session_name', api_id, api_hash)

In [8]:
# Function to scrape messages (and, optionally, images) from channels
async def scrape_channels(channel_urls, include_images=False):
    """Scrape every channel in ``channel_urls`` and combine the results.

    Args:
        channel_urls: Iterable of channel identifiers. Each one is passed
            unchanged to ``scrape_messages`` (and to ``scrape_images`` when
            ``include_images`` is true).
        include_images: When true, run ``scrape_images`` for each channel
            before scraping its messages. Defaults to False, which matches
            the previous behaviour where the image step was commented out.

    Returns:
        A single ``pandas.DataFrame`` holding the rows of all per-channel
        DataFrames, concatenated with a fresh 0..n-1 index.

    Raises:
        ValueError: If no channel produced a DataFrame (``channel_urls`` is
            empty or every result was ``None``).
    """
    frames = []

    for channel_url in channel_urls:
        if include_images:
            # Scrape and download images
            await scrape_images(channel_url)

        # Scrape messages and store in DataFrame
        channel_df = await scrape_messages(channel_url, client)
        if channel_df is None:
            # pd.concat would drop a None entry silently; make the gap visible.
            # NOTE(review): whether scrape_messages can return None is not
            # visible from this notebook — confirm against its implementation.
            logging.warning(f"No messages returned for channel: {channel_url}")
            continue
        frames.append(channel_df)

    if not frames:
        # pd.concat would fail here with an opaque "No objects to concatenate";
        # keep the same exception type but say what actually went wrong.
        raise ValueError("No messages were scraped: no channel returned any data")

    # Combine all DataFrames
    return pd.concat(frames, ignore_index=True)

In [9]:
# Function to scrape images from a channel
async def scrape_images(channel_url, limit=None):
    """Walk a channel's messages and hand each one to ``download_images``.

    Args:
        channel_url: Channel identifier, resolved with ``client.get_entity``.
        limit: Maximum number of messages to iterate, forwarded to
            ``client.iter_messages``. ``None`` (the default) applies no
            limit, which is the previous behaviour.

    Returns:
        None. Any exception is logged together with its traceback and then
        swallowed, so a failing channel does not abort the caller.
    """
    try:
        logging.info(f"Scraping images from channel: {channel_url}")
        # Get the channel entity
        channel = await client.get_entity(channel_url)
        # Prefer the username; otherwise fall back to the title with spaces
        # replaced by underscores.
        channel_name = channel.username or channel.title.replace(' ', '_')

        # Scrape messages and download images
        async for message in client.iter_messages(channel, limit=limit):
            await download_images(message, channel_name, client)
        logging.info(f"Finished scraping images from channel: {channel_url}")
    except Exception as e:
        # logging.exception logs at ERROR level and keeps the traceback,
        # which logging.error with only str(e) discarded.
        logging.exception(f"Error scraping images from channel {channel_url}: {e}")

In [13]:
# Run the scraping in the notebook
async def main(channel_urls=None, output_path='scraped_telegram_messages.csv'):
    """Scrape a set of channels and save the combined messages as CSV.

    Note: a later cell in this notebook defines another ``main``; whichever
    cell ran last is the one bound to the name.

    Args:
        channel_urls: Channel identifiers to scrape. When None, the
            notebook's original five channels are used.
        output_path: Destination CSV path. Defaults to the file name the
            notebook has always written.

    Returns:
        The combined DataFrame on success, or None if any step raised (the
        error is logged together with its traceback).
    """
    if channel_urls is None:
        # Default channel set; pass `channel_urls` to scrape other channels.
        channel_urls = [
            't.me/DoctorsET',
            't.me/lobelia4cosmetics',
            't.me/CheMed123',
            't.me/EAHCI',
            'yetenaweg',
        ]

    try:
        # Start the Telegram client
        await client.start(phone=phone_number)

        # Scrape channels and get the final DataFrame
        df = await scrape_channels(channel_urls)

        # Save the DataFrame to a CSV file
        df.to_csv(output_path, index=False)

        logging.info(f"Data successfully saved to '{output_path}'")
        return df
    except Exception as e:
        # Keep the traceback in the log; the caller only sees None.
        logging.exception(f"Error in main process: {e}")
        return None

In [14]:
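# Run the main function. NOTE: the call below is commented out, so `df` in the
# next cell is only defined if this line is uncommented and executed.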
# df = await main()

2024-10-10 20:12:53,448 - INFO - Scraping messages from channel: t.me/DoctorsET
2024-10-10 20:13:02,933 - INFO - Finished scraping messages from channel: t.me/DoctorsET
2024-10-10 20:13:02,937 - INFO - Scraping messages from channel: t.me/lobelia4cosmetics
2024-10-10 20:13:29,127 - INFO - Finished scraping messages from channel: t.me/lobelia4cosmetics
2024-10-10 20:13:29,130 - INFO - Scraping messages from channel: t.me/CheMed123
2024-10-10 20:13:30,300 - INFO - Finished scraping messages from channel: t.me/CheMed123
2024-10-10 20:13:30,301 - INFO - Scraping messages from channel: t.me/EAHCI
2024-10-10 20:14:46,730 - INFO - Finished scraping messages from channel: t.me/EAHCI
2024-10-10 20:14:46,733 - INFO - Scraping messages from channel: yetenaweg
2024-10-10 20:15:00,576 - INFO - Finished scraping messages from channel: yetenaweg
2024-10-10 20:15:00,852 - INFO - Data successfully saved to 'scraped_telegram_messages.csv'


In [15]:
# Display the DataFrame; expects `df` to have been assigned from `await main()`.
df

Unnamed: 0,id,text,sender,channel,date
0,864,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,-1001102021238,DoctorsET,2023-12-18 17:04:02+00:00
1,863,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,-1001102021238,DoctorsET,2023-11-03 16:14:39+00:00
2,862,ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀን...,-1001102021238,DoctorsET,2023-10-02 16:37:39+00:00
3,861,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙ...,-1001102021238,DoctorsET,2023-09-16 07:54:32+00:00
4,860,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,-1001102021238,DoctorsET,2023-09-01 16:16:15+00:00
...,...,...,...,...,...
6364,5,ይህ አዲሱ የኮሮና ቫይረስ በማይክሮስኮፕ ስር ሲታይ ያለው ምስል ነው። ኮ...,-1001447066276,yetenaweg,2020-02-17 20:58:59+00:00
6365,4,አዲሱ የኮሮና ቫይረስ (በአዲስ የሳይንስ ስሙ COVID-19) ፣\nከየት ...,-1001447066276,yetenaweg,2020-02-17 20:55:46+00:00
6366,3,,-1001447066276,yetenaweg,2020-02-17 20:55:05+00:00
6367,2,,-1001447066276,yetenaweg,2020-01-17 01:35:09+00:00


In [17]:
# Scrape a two-channel subset.
# NOTE: this rebinds `main`, replacing the definition from the earlier cell.
async def main():
    """Start the Telegram client and scrape two channels.

    The DataFrame produced by ``scrape_channels`` is discarded and nothing
    is returned. Any exception is logged as an error and swallowed.
    """
    targets = [
        't.me/lobelia4cosmetics',
        't.me/CheMed123',
    ]
    try:
        await client.start(phone=phone_number)  # connect / log in
        await scrape_channels(targets)          # result intentionally unused
    except Exception as exc:
        logging.error(f"Error in main process: {exc}")