## Telegram data scraping
### Import Required Modules

In [1]:
# Import required modules
import os
import sys
import asyncio
import nest_asyncio
import logging
import pandas as pd
from dotenv import load_dotenv
from telethon import TelegramClient

current_dir = os.getcwd()
# Add the parent of the working directory to sys.path so the `scripts`
# package imported further down can be resolved from this notebook.
parent_dir = os.path.dirname(current_dir)
# Guard the append: re-running this cell must not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Ignore warnings: with no category given, this silences every warning
# for the rest of the session so they do not clutter the notebook output
import warnings
warnings.filterwarnings("ignore")

#### Import Custom Scripts

In [2]:
# Importing custom scripts for image downloading and Telegram data scraping
from scripts.photo_scraper import download_images
from scripts.telegram_data_scraping import scrape_messages

In [3]:
# Apply nest_asyncio to handle event loop issues in Jupyter: the kernel
# already has an asyncio loop running, and this patch lets that loop be re-entered
nest_asyncio.apply()

In [4]:
# Load environment variables from the .env file so the os.getenv() calls
# below can read API_ID, API_HASH and PHONE_NUMBER
load_dotenv()

True

#### Fetch API Credentials from .env

In [5]:
# Read the Telegram API credentials from the environment in one pass; each
# value is a string, or None when the corresponding variable is not set
api_id, api_hash, phone_number = (
    os.getenv(key) for key in ('API_ID', 'API_HASH', 'PHONE_NUMBER')
)

#### Configure Logging

In [6]:
# Set up logging to log messages to both a file and the console.
# FileHandler opens its file immediately and does not create missing parent
# directories, so make sure the log directory exists first.
os.makedirs("../logs", exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("../logs/scraping.log"),  # Save logs to a file
        logging.StreamHandler()  # Output logs to console
    ]
)

In [7]:
# Initialize the Telegram client with the API credentials read from the
# environment above. 'session_name' is the session identifier handed to
# Telethon (presumably persisted as a local session file — confirm).
client = TelegramClient('session_name', api_id, api_hash)

#### Function to Scrape Messages from Channels

In [9]:
async def scrape_channels(channel_urls):
    """
    Scrape messages from a list of Telegram channels into one DataFrame.

    Channels are processed one after another with the module-level
    ``client``; the per-channel work is delegated to ``scrape_messages``.

    Args:
        channel_urls (list): List of Telegram channel URLs.
    Returns:
        final_df (DataFrame): Combined DataFrame containing messages from all
            channels, re-indexed from 0. An empty DataFrame is returned when
            no channels are given or no channel produced a result.
    """
    all_messages = []

    for channel_url in channel_urls:
        # Scrape messages and store in a DataFrame
        df = await scrape_messages(channel_url, client)
        # NOTE(review): assumes scrape_messages may hand back None on
        # failure — confirm against scripts.telegram_data_scraping.
        if df is not None:
            all_messages.append(df)

    # pd.concat raises ValueError when given nothing to concatenate, so
    # return an empty frame instead of failing the whole run.
    if not all_messages:
        return pd.DataFrame()

    # Combine all DataFrames
    return pd.concat(all_messages, ignore_index=True)

#### Function to Scrape Images from a Channel

In [8]:
# Function to scrape images from a channel
async def scrape_images(channel_url, client):
    """
    Pass every message of a Telegram channel to the image downloader.

    Any exception raised along the way is logged instead of propagated, so
    the coroutine always completes and returns None.

    Args:
        channel_url: The URL of the Telegram channel.
        client: The Telegram client used to interact with the Telegram API.
    """
    try:
        logging.info(f"Scraping images from channel: {channel_url}")

        # Resolve the URL to its channel entity
        entity = await client.get_entity(channel_url)

        # Label the channel by username, falling back to its title with
        # spaces turned into underscores when no username is set
        label = entity.username
        if not label:
            label = entity.title.replace(' ', '_')

        # Walk the channel's messages and hand each one to download_images
        async for msg in client.iter_messages(entity):
            await download_images(msg, label, client)

        logging.info(f"Finished scraping images from channel: {channel_url}")
    except Exception as err:
        logging.error(f"Error scraping images from channel {channel_url}: {err}")

#### Run the Scraping Process in the Notebook

In [14]:
async def main():
    """
    Asynchronous function to run the scraping process for multiple Telegram channels.

    Starts the module-level Telegram client, scrapes messages from the
    hard-coded channel list, saves the combined data to a CSV file, and
    returns the DataFrame. Errors are logged rather than raised.

    Returns:
        df (DataFrame): DataFrame containing scraped messages from all
            specified Telegram channels, or None when any step failed.
    """
    channel_urls = ['t.me/DoctorsET', 't.me/lobelia4cosmetics', 't.me/CheMed123', 't.me/EAHCI', 'yetenaweg']
    # Single source of truth for the path used by both the write and the log line
    output_path = '../data/scraped_telegram_messages.csv'
    try:
        # Start the Telegram client
        await client.start(phone=phone_number)

        # Scrape channels and get the final DataFrame
        df = await scrape_channels(channel_urls)

        # DataFrame.to_csv does not create missing directories, so make sure
        # the target folder exists before writing
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        # Save the DataFrame to a CSV file
        df.to_csv(output_path, index=False)

        logging.info(f"Data successfully saved to '{output_path}'")
        return df
    except Exception as e:
        logging.error(f"Error in main process: {e}")
        # Make the failure result explicit for callers
        return None

#### Execute the Main Function

In [15]:
# Run the main function; top-level await relies on the notebook kernel's
# running event loop (it is not valid in a plain .py script)
await main()

2024-10-11 23:00:29,014 - INFO - Scraping messages from channel: t.me/DoctorsET
2024-10-11 23:00:44,350 - INFO - Finished scraping messages from channel: t.me/DoctorsET
2024-10-11 23:00:44,353 - INFO - Scraping messages from channel: t.me/lobelia4cosmetics
2024-10-11 23:01:10,577 - INFO - Finished scraping messages from channel: t.me/lobelia4cosmetics
2024-10-11 23:01:10,579 - INFO - Scraping messages from channel: t.me/CheMed123
2024-10-11 23:01:11,396 - INFO - Finished scraping messages from channel: t.me/CheMed123
2024-10-11 23:01:11,397 - INFO - Scraping messages from channel: t.me/EAHCI
2024-10-11 23:01:33,478 - INFO - Finished scraping messages from channel: t.me/EAHCI
2024-10-11 23:01:33,480 - INFO - Scraping messages from channel: yetenaweg
2024-10-11 23:01:45,443 - INFO - Finished scraping messages from channel: yetenaweg
2024-10-11 23:01:46,558 - INFO - Data successfully saved to '../data/scraped_telegram_messages.csv'


Unnamed: 0,message_idid,text,sender,channel,date
0,DoctorsET_864,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,-1001102021238,DoctorsET,2023-12-18 17:04:02+00:00
1,DoctorsET_863,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,-1001102021238,DoctorsET,2023-11-03 16:14:39+00:00
2,DoctorsET_862,ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀን...,-1001102021238,DoctorsET,2023-10-02 16:37:39+00:00
3,DoctorsET_861,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙ...,-1001102021238,DoctorsET,2023-09-16 07:54:32+00:00
4,DoctorsET_860,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,-1001102021238,DoctorsET,2023-09-01 16:16:15+00:00
...,...,...,...,...,...
6355,yetenaweg_5,ይህ አዲሱ የኮሮና ቫይረስ በማይክሮስኮፕ ስር ሲታይ ያለው ምስል ነው። ኮ...,-1001447066276,yetenaweg,2020-02-17 20:58:59+00:00
6356,yetenaweg_4,አዲሱ የኮሮና ቫይረስ (በአዲስ የሳይንስ ስሙ COVID-19) ፣\nከየት ...,-1001447066276,yetenaweg,2020-02-17 20:55:46+00:00
6357,yetenaweg_3,,-1001447066276,yetenaweg,2020-02-17 20:55:05+00:00
6358,yetenaweg_2,,-1001447066276,yetenaweg,2020-01-17 01:35:09+00:00


### Extract Images from Telegram Channels

In [12]:
# Asynchronous main function to run the scraping process
async def main():
    """
    Run the image-scraping process for a fixed pair of Telegram channels.

    Starts the module-level Telegram client and then calls ``scrape_images``
    for each channel in turn. Any exception is logged instead of raised;
    nothing is returned.
    """
    targets = ['t.me/lobelia4cosmetics', 't.me/CheMed123']
    try:
        # Bring the Telegram client online before touching any channel
        await client.start(phone=phone_number)

        # Process the channels sequentially
        for url in targets:
            await scrape_images(url, client)
    except Exception as err:
        logging.error(f"Error in main process: {err}")

In [13]:
# Run the image-scraping main() defined in the previous cell
await main()

2024-10-14 19:38:17,076 - INFO - Connecting to 149.154.167.51:443/TcpFull...
2024-10-14 19:38:19,648 - INFO - Connection to 149.154.167.51:443/TcpFull complete!
2024-10-14 19:38:21,249 - INFO - Phone migrated to 4
2024-10-14 19:38:21,515 - INFO - Reconnecting to new data center 4
2024-10-14 19:38:21,874 - INFO - Disconnecting from 149.154.167.51:443/TcpFull...
2024-10-14 19:38:21,879 - INFO - Disconnection from 149.154.167.51:443/TcpFull complete!
2024-10-14 19:38:21,881 - INFO - Connecting to 149.154.167.92:443/TcpFull...
2024-10-14 19:38:24,462 - INFO - Connection to 149.154.167.92:443/TcpFull complete!
2024-10-14 19:41:18,148 - INFO - Scraping images from channel: t.me/CheMed123


Signed in successfully as £Gech 👫; remember to not break the ToS or you will risk an account ban!


2024-10-14 19:41:19,247 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2024-10-14 19:41:22,072 - INFO - Downloaded image from message 97 in CheMed123
2024-10-14 19:41:22,097 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2024-10-14 19:41:24,671 - INFO - Downloaded image from message 96 in CheMed123
2024-10-14 19:41:24,676 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2024-10-14 19:41:26,121 - INFO - Downloaded image from message 95 in CheMed123
2024-10-14 19:41:26,126 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2024-10-14 19:41:28,183 - INFO - Downloaded image from message 94 in CheMed123
2024-10-14 19:41:28,187 - INFO - Starting direct file download in chunks of 131072 at 0, stride 131072
2024-10-14 19:41:30,116 - INFO - Downloaded image from message 93 in CheMed123
2024-10-14 19:41:30,208 - INFO - Starting direct file download in chunks of 131072 at 0, s

### End of Scraping