# Scraper for Data Acquisition from Mastodon

## https://mastodon.social/explore

## Images

In [None]:
import time
import os
import requests
from selenium import webdriver

# Download the first few images shown on the Mastodon explore page into
# the local "images" directory.

# Imported here so this cell stays self-contained. Selenium 4 removed both
# the `executable_path=` keyword of webdriver.Chrome and the
# `find_elements_by_*` helpers; Service and By are their replacements.
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Set up the Chrome webdriver
chrome_driver_path = "D:\\New\\Data Science\\Assignment_1\\Scrapper_3\\chromedriver.exe"
driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path))

# Define the number of posts to scrape
num_posts_to_scrape = 3
posts_scraped = 0

# Give up after this many scrolls so the cell cannot loop forever when the
# page never yields enough downloadable images.
max_scroll_attempts = 20

# Seconds to wait for a single image download before treating it as failed.
request_timeout = 10

# Every pass re-reads all <img> tags on the page, so remember which URLs
# were already handled to avoid saving the same picture twice.
seen_image_urls = set()

# Create a directory to store downloaded images
os.makedirs("images", exist_ok=True)

try:
    # Navigate to the Mastodon explore page
    driver.get("https://mastodon.social/explore")

    for _ in range(max_scroll_attempts):
        if posts_scraped >= num_posts_to_scrape:
            break

        # Scroll down to load more posts
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Wait for the page to load

        # Find all images on the page
        images = driver.find_elements(By.XPATH, "//img")

        # Loop through each image
        for image in images:
            if posts_scraped >= num_posts_to_scrape:
                break

            try:
                # Get the image source
                image_url = image.get_attribute("src")

                # Skip images without a source, inline data/blob URIs that
                # requests cannot fetch, and URLs handled on an earlier pass.
                if not image_url or not image_url.startswith(("http://", "https://")):
                    continue
                if image_url in seen_image_urls:
                    continue
                seen_image_urls.add(image_url)

                # Fetch before opening the file so a failed request does not
                # leave an empty or truncated image behind.
                response = requests.get(image_url, timeout=request_timeout)
                response.raise_for_status()

                # Download the image
                image_filename = f"images/image_{posts_scraped + 1}.jpg"
                with open(image_filename, "wb") as f:
                    f.write(response.content)
                print(f"Downloaded image: {image_filename}")
                posts_scraped += 1

            except Exception as e:
                # Best effort: report the failure and move on to the next image.
                print(f"Error downloading image: {str(e)}")

    if posts_scraped < num_posts_to_scrape:
        print(f"Stopped after {max_scroll_attempts} scrolls with {posts_scraped} image(s) downloaded")

finally:
    # Close the browser even if something above raised
    driver.quit()


# --------------------------------------------------------------------------------------------------------------

## Text Data

In [None]:
import time
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Save the text of the first few posts shown on the Mastodon explore page
# as individual files inside the local "text" directory.

# Set up the Chrome webdriver
# Selenium 4 takes the driver location through a Service object; the
# `executable_path=` keyword of webdriver.Chrome was removed.
chrome_driver_path = "D:\\New\\Data Science\\Assignment_1\\Scrapper_3\\chromedriver.exe"
driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path))

# Define the number of posts to scrape
num_posts_to_scrape = 3
posts_scraped = 0

# Give up after this many scrolls so the cell cannot loop forever when the
# page never yields enough posts.
max_scroll_attempts = 20

# Each pass re-parses the whole page, so remember the texts already saved
# to avoid writing the same post again under a new number.
seen_post_texts = set()

# The output directory must exist before any file is opened in it;
# otherwise every write fails and no post is ever counted.
os.makedirs("text", exist_ok=True)

try:
    # Navigate to the Mastodon explore page
    driver.get("https://mastodon.social/explore")

    for _ in range(max_scroll_attempts):
        if posts_scraped >= num_posts_to_scrape:
            break

        # Scroll down to load more posts
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # Wait for the page to load

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Find all text content within each post
        posts = soup.find_all("div", class_="status__content")

        for post in posts:
            if posts_scraped >= num_posts_to_scrape:
                break

            # Extract text content from the post
            post_text = post.get_text(separator='\n').strip()

            # Skip posts with no text and posts saved on an earlier pass.
            if not post_text or post_text in seen_post_texts:
                continue
            seen_post_texts.add(post_text)

            # Number files consecutively (post_1, post_2, ...) from the
            # count of posts saved so far.
            text_filename = f"text/post_{posts_scraped + 1}.txt"
            try:
                # Save text content to a text file
                with open(text_filename, "w", encoding="utf-8") as f:
                    f.write(post_text)
            except OSError as e:
                print(f"Error processing post: {str(e)}")
                continue

            print(f"Saved text content to: {text_filename}")
            posts_scraped += 1

    if posts_scraped < num_posts_to_scrape:
        print(f"Stopped after {max_scroll_attempts} scrolls with {posts_scraped} post(s) saved")

finally:
    # Close the browser even if something above raised
    driver.quit()


# --------------------------------------------------------------------------------------------------------------

## Audio Links

In [None]:
import csv
import time
import os
from selenium import webdriver

# Collect links to audio files shown on the Mastodon explore page and
# append them to audio/audio_links.csv.

# Imported here so this cell stays self-contained. Selenium 4 removed both
# the `executable_path=` keyword of webdriver.Chrome and the
# `find_elements_by_*` helpers; Service and By are their replacements.
from urllib.parse import urlparse

from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Set up the Chrome webdriver
chrome_driver_path = "D:\\New\\Data Science\\Assignment_1\\Scrapper_3\\chromedriver.exe"
driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path))

# Define the number of audio files to scrape
num_audio_to_scrape = 1
audio_scraped = 0

# Give up after this many scrolls so the cell terminates even when the
# page contains no matching audio at all.
max_scroll_attempts = 20

# File extensions (compared against the URL path) that count as audio.
audio_extensions = ('.mp3', '.wav', '.ogg')

# Each pass re-reads the whole page, so remember the URLs already checked.
seen_audio_urls = set()

# Create an audio directory if it doesn't exist
audio_directory = "audio"
os.makedirs(audio_directory, exist_ok=True)

# CSV file path
csv_filename = os.path.join(audio_directory, "audio_links.csv")
csv_exists = os.path.exists(csv_filename)

try:
    # Navigate to the Mastodon explore page
    driver.get("https://mastodon.social/explore")

    # Open CSV file in append mode
    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Audio URL']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Write header only if the file doesn't exist
        if not csv_exists:
            writer.writeheader()

        for _ in range(max_scroll_attempts):
            if audio_scraped >= num_audio_to_scrape:
                break

            # Scroll and wait so that dynamically loaded posts are present
            # before the page is queried.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait for the page to load

            # Find all audio elements on the page: the URL may sit on the
            # <audio> tag itself or on a nested <source> tag.
            audio_elements = driver.find_elements(By.XPATH, "//audio | //audio/source")

            # Loop through each audio element
            for audio in audio_elements:
                if audio_scraped >= num_audio_to_scrape:
                    break

                try:
                    # Get the audio source
                    audio_url = audio.get_attribute("src")
                    if not audio_url or audio_url in seen_audio_urls:
                        continue
                    seen_audio_urls.add(audio_url)

                    # Check the extension on the URL path only, so a query
                    # string or fragment after the file name does not hide it.
                    url_path = urlparse(audio_url).path.lower()
                    if url_path.endswith(audio_extensions):
                        # Write audio URL to CSV
                        writer.writerow({'Audio URL': audio_url})
                        print(f"Audio URL added to CSV: {audio_url}")
                        audio_scraped += 1

                except Exception as e:
                    # Best effort: report the failure and move on.
                    print(f"Error processing audio: {str(e)}")

finally:
    # Close the browser even if something above raised
    driver.quit()


# --------------------------------------------------------------------------------------------------------------

## Video Links

In [None]:
import time
import os
from bs4 import BeautifulSoup
from selenium import webdriver

def scroll_page(driver, pause=2, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    After each scroll the function sleeps for ``pause`` seconds and then
    compares ``document.body.scrollHeight`` with the previous value; it
    returns as soon as the height is unchanged.

    Args:
        driver: Object exposing ``execute_script(script)``, e.g. a Selenium
            WebDriver.
        pause: Seconds to wait after each scroll for new content to load.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps scrolling until the height stops changing,
            which may never happen on a feed that keeps loading content.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls_done += 1
        time.sleep(pause)  # Wait for the page to load
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was appended by the last scroll.
            break
        last_height = new_height

def main():
    """Append video links found on the Mastodon explore page to a text file.

    Opens Chrome, loads the explore page and scrolls it, parsing the page
    source after each scroll. Links are written one per line to
    ``videos/video_links.txt`` until ``num_posts_to_scrape`` links have been
    saved or ``max_scroll_attempts`` scrolls have been made. Any error is
    printed rather than raised, and the browser is closed in every case.
    """
    # Imported locally so this function does not rely on another cell.
    # Selenium 4 takes the driver location through a Service object; the
    # `executable_path=` keyword of webdriver.Chrome was removed.
    from selenium.webdriver.chrome.service import Service

    # Define the number of posts to scrape
    num_posts_to_scrape = 1
    posts_scraped = 0

    # Give up after this many scrolls so the function terminates even when
    # the page contains no videos at all.
    max_scroll_attempts = 20

    # Each pass re-parses the whole page, so remember the links already
    # written to avoid recording the same video twice.
    seen_video_urls = set()

    # Stays None until the browser has actually started, so the cleanup in
    # `finally` cannot fail on an unbound name and hide the original error.
    driver = None

    try:
        # Set up the Chrome webdriver
        chrome_driver_path = "D:\\New\\Data Science\\Assignment_1\\Scrapper_3\\chromedriver.exe"
        driver = webdriver.Chrome(service=Service(executable_path=chrome_driver_path))

        # Navigate to the Mastodon explore page
        driver.get("https://mastodon.social/explore/")

        # Create a directory to store downloaded video links
        videos_folder = "videos"
        os.makedirs(videos_folder, exist_ok=True)

        # Text file to store video links
        txt_filename = os.path.join(videos_folder, "video_links.txt")

        with open(txt_filename, "a", encoding="utf-8") as txt_file:
            for _ in range(max_scroll_attempts):
                if posts_scraped >= num_posts_to_scrape:
                    break

                # Scroll down to load more posts
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)  # Wait for the page to load

                # Parse the HTML content using BeautifulSoup
                soup = BeautifulSoup(driver.page_source, "html.parser")

                # Find all video content within each post
                for video in soup.find_all("video"):
                    if posts_scraped >= num_posts_to_scrape:
                        break

                    # The link may sit on the <video> tag itself or on a
                    # nested <source> tag.
                    video_url = video.get("src")
                    if not video_url:
                        source_tag = video.find("source", src=True)
                        video_url = source_tag.get("src") if source_tag else None

                    if not video_url or video_url in seen_video_urls:
                        continue
                    seen_video_urls.add(video_url)

                    try:
                        # Write video links to the text file
                        txt_file.write(video_url + "\n")
                    except OSError as e:
                        print(f"Error processing video link: {str(e)}")
                        continue

                    print(f"Saved video link: {video_url}")
                    posts_scraped += 1

    except Exception as e:
        print("An error occurred:", e)

    finally:
        # Close the browser, if it was started
        if driver is not None:
            driver.quit()


if __name__ == "__main__":
    main()


# --------------------------------------------------------------------------------------------------------------

# Medium Link

## https://medium.com/@waqasdost/scraping-data-from-mastodon-social-explore-page-a8fb00eba5a2

## Citation: Ideas learned from DataCamp and YouTube; code by ChatGPT 3.5