# Reddit Confession Scraper  

## Overview  
The **Reddit Scraper** is a web scraping tool designed to automate the process of logging into Reddit and extracting confessions from the community. Utilizing the Selenium WebDriver, this scraper navigates the Reddit website, interacts with the user interface, and collects relevant data from posts for analysis.  

## How It Works  

### Workflow Overview  

1. **Driver Initialization**:  
   - The scraper initializes a Selenium WebDriver instance that can run in headless mode (without a graphical user interface); in the code below the headless option is commented out, so a browser window is shown by default.  
   - It sets up necessary options for Chrome, including disabling GPU acceleration and configuring the browser window size.  

2. **Login Process**:  
   - The scraper navigates to the Reddit login page and enters the provided username and password.  
   - It handles the login button, which is located within a Shadow DOM, using JavaScript to ensure successful interaction.  
   - After clicking the login button, it checks the current URL to confirm a successful login.  

3. **Scraping Posts**:  
   - The scraper iterates through a list of specified confession communities (subreddits).   

4. **Data Extraction**:  
   - The scraper scrolls through the search results, extracting relevant data from each post, including subreddit, title, content, author, and url. 
    
5. **Data Storage**:  
   - Extracted data is stored in a structured CSV file for further analysis.  

### Example of Extracted Data  

The scraper collects the following types of data for each post:  

| Field          | Description                                    |  
|----------------|------------------------------------------------|  
| subreddit      | Name of the specified confession community     |  
| title          | The title of the post                          |  
| content        | The main body text of the post                 |  
| author         | The author of the post                         |  
| url            | Link to the post                               |  


  

## Important Notes  

- **Temporary Accounts**: It is recommended to use temporary accounts for scraping to avoid potential violations of Reddit's terms of service.  
- **Dynamic Content**: The scraper relies on specific XPath and CSS selectors to extract data. Changes in Reddit's layout may require updates to these selectors.   

## Conclusion  

The Reddit Scraper provides an efficient way to gather and analyze social media data from the Reddit platform. By automating the login and data extraction processes, it enables users to gain valuable insights into trends and discussions in a timely manner.

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.options import Options
from datetime import datetime
import pandas as pd
import time

In [None]:
# Initialize WebDriver with or without Headless Mode
def init_driver(chromedriver_path):
    """
    Create a Chrome WebDriver and an explicit-wait helper bound to it.

    Args:
        chromedriver_path: Filesystem path to the chromedriver executable.

    Returns:
        A ``(driver, wait)`` tuple: the Chrome WebDriver instance and a
        ``WebDriverWait`` for that driver with a 10-second timeout.
    """
    # Local import keeps this function self-contained.
    from selenium.webdriver.chrome.service import Service

    chrome_options = Options()
    # Uncomment the next line to run without a visible browser window.
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--log-level=3")
    # NOTE(review): the next flag and the two certificate/localhost flags
    # weaken browser security while real credentials are typed in; confirm
    # they are actually required.
    chrome_options.add_argument("--disable-web-security")  # Disable web security
    chrome_options.add_argument("--window-size=1280,1024")
    chrome_options.add_argument("--ignore-certificate-errors")  # Ignore SSL errors
    chrome_options.add_argument("--allow-insecure-localhost")  # Allow insecure localhost connections

    # Selenium 4 takes the driver path through a Service object; the
    # ``executable_path`` keyword of webdriver.Chrome was deprecated and
    # later removed, so passing it directly fails on current releases.
    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)
    wait = WebDriverWait(driver, 10)
    print("driver intiated successfuly")
    return driver, wait

In [None]:
# Path to the local chromedriver executable; adjust it for your machine.
chromedriver_path = r"D:\coding\freelancing\stockMarket\chromedriver.exe"
# Start the browser once; `driver` and `wait` are module-level names that the
# login call and scrape_subreddit rely on.
driver, wait = init_driver(chromedriver_path)

In [None]:
# Log In to Reddit
def login_to_reddit(driver, wait, username, password):
    """
    Log in to Reddit with provided credentials.

    Fills in the login form, clicks the login button (first through the
    Shadow DOM path, then through an XPath fallback if that did not work),
    and finally checks that the browser has left the login page.

    Args:
        driver: Selenium WebDriver instance.
        wait: WebDriverWait bound to ``driver``.
        username: Reddit account name typed into the username field.
        password: Reddit account password typed into the password field.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions
        are caught here and not propagated to the caller.
    """
    try:
        driver.get("https://www.reddit.com/login/")
        time.sleep(5)
        # Enter username
        username_input = wait.until(EC.presence_of_element_located((By.ID, "login-username")))
        username_input.send_keys(username)

        # Enter password
        password_input = wait.until(EC.presence_of_element_located((By.ID, "login-password")))
        password_input.send_keys(password)
        time.sleep(2)

        clicked = False
        try:
            # Locate the login button inside the Shadow DOM using JavaScript
            login_button = driver.execute_script("""
                return document
                    .querySelector("body > shreddit-app > shreddit-overlay-display")
                    .shadowRoot.querySelector("shreddit-signup-drawer")
                    .shadowRoot.querySelector("shreddit-drawer > div > shreddit-async-loader > div > shreddit-slotter")
                    .shadowRoot.querySelector("#login > auth-flow-modal > div.w-100 > faceplate-tracker > button");
            """)

            if login_button:
                # Click the login button
                driver.execute_script("arguments[0].click();", login_button)
                print("Login button clicked successfully!")
                clicked = True
        except Exception:
            # The Shadow DOM path did not resolve or the click failed;
            # the XPath fallback below takes over.
            clicked = False

        if not clicked:
            # Fallback: locate the button by XPath and click it via JavaScript.
            # This also covers the case where the last querySelector returned null.
            login_button_js = driver.find_element(By.XPATH, '//*[@id="login"]/auth-flow-modal/div[2]/faceplate-tracker/button/span/span')
            driver.execute_script("arguments[0].click();", login_button_js)
            print("Login button clicked using JavaScript!")

        # Wait for the redirect, then verify the login regardless of which
        # click path was used.
        time.sleep(5)
        if "login" in driver.current_url.lower():
            raise Exception("Login failed. Still on login page.")
        print("Login successful!")

    except Exception as e:
        print(f"Error during Reddit login: {e}")


In [None]:
# Credentials and configuration
# NOTE(review): "uname" / "pass" look like placeholders; substitute real
# account credentials before running, and avoid committing them.
username = "uname"
password = "pass"
# Uses the module-level `driver` and `wait` returned by init_driver.
login_to_reddit(driver, wait, username, password)

In [51]:
def click_all_collapsed_expando_buttons(driver):
    """
    Click every 'expando-button' element that is currently collapsed.

    Each element with the class 'expando-button' is inspected in turn; when
    its class attribute contains 'collapsed' it is clicked through
    JavaScript, followed by a half-second pause. Any exception is printed
    and ends the pass.

    Args:
        driver: Selenium WebDriver instance
    """
    click_js = "arguments[0].click();"
    try:
        for button in driver.find_elements(By.CLASS_NAME, "expando-button"):
            css_classes = button.get_attribute("class")
            if "collapsed" not in css_classes:
                # Not collapsed: nothing to click.
                continue
            driver.execute_script(click_js, button)
            time.sleep(0.5)  # short pause between clicks
    except Exception as err:
        print(f"An error occurred while clicking: {err}")

In [52]:
import re
def extract_post_content(post):
    """Return the cleaned body text of a post element, or "" if none is found.

    The selectors are tried in order and the first one yielding non-empty
    text wins:
      1. ``div.md``                 - direct text
      2. ``div.crosspost-preview``  - cross-posts
      3. ``div.usertext-body``      - usertext body fallback
    A selector that raises is skipped.
    """
    selectors = ("div.md", "div.crosspost-preview", "div.usertext-body")

    content = ""
    for selector in selectors:
        try:
            content = post.find_element(By.CSS_SELECTOR, selector).text
        except Exception:
            # Lookup failed for this selector; move on to the next fallback.
            continue
        if content:
            break

    if not content:
        return ""
    return clean_text(content)

def clean_text(text):
    """Strip Reddit markup from *text* and collapse its whitespace.

    Substitutions are applied in order: "[deleted]"/"[removed]" markers
    (case-insensitive), bold, italic, strikethrough, then whole blockquote
    lines. Finally every whitespace run becomes a single space and the
    result is stripped.
    """
    # (pattern, replacement, flags) applied sequentially; order matters
    # because bold markers must be consumed before the italic rule runs.
    substitutions = (
        (r"\[deleted\]|\[removed\]", "", re.IGNORECASE),
        (r"\*{2}(.*?)\*{2}", r"\1", 0),
        (r"\*(.*?)\*", r"\1", 0),
        (r"~~(.*?)~~", r"\1", 0),
        (r"^>.*$", "", re.MULTILINE),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)

    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

In [None]:
def scrape_subreddit(subreddit, pages=10, min_content_length=20):
    """Scrape the all-time top posts of a subreddit from old.reddit.com.

    Relies on the module-level ``driver`` and ``wait`` objects rather than
    taking them as parameters.

    Args:
        subreddit: Subreddit name without the ``r/`` prefix.
        pages: Maximum number of listing pages to walk through.
        min_content_length: Posts whose cleaned content is shorter than
            this many characters are skipped.

    Returns:
        A list of dicts with the keys ``subreddit``, ``title``, ``content``,
        ``author`` and ``url``.
    """
    print(f"Scraping r/{subreddit}...")
    data = []
    url = f"https://old.reddit.com/r/{subreddit}/top/?sort=top&t=all"
    driver.get(url)

    for _ in range(pages):
        try:
            # Wait for posts to load
            wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "thing"))
            )
            click_all_collapsed_expando_buttons(driver)
        except Exception:
            print("Timeout waiting for posts")
            break
        # Second pass after a pause, for buttons still collapsed after the first pass.
        time.sleep(4)
        click_all_collapsed_expando_buttons(driver)
        time.sleep(2)
        posts = driver.find_elements(By.CLASS_NAME, "thing")

        for post in posts:
            try:
                # Skip promoted ads
                if "promoted" in post.get_attribute("class"):
                    continue
                title = post.find_element(By.CLASS_NAME, "title").text
                # get_attribute returns None for a missing attribute instead
                # of raising, so the fallback has to be applied to the value.
                author = post.get_attribute("data-author") or "N/a"
                post_url = post.get_attribute("data-permalink")

                content = extract_post_content(post)

                # Skip if content is too short
                if len(content) < min_content_length:
                    continue

                # Add to dataset; a missing permalink raises here and the
                # post is skipped by the handler below.
                data.append({
                    "subreddit": subreddit,
                    "title": title,
                    "content": content,
                    "author": author,
                    "url": "https://old.reddit.com" + post_url
                })
            except Exception as e:
                # Best effort: report and move on to the next post.
                print(f"Error processing post: {str(e)[:100]}")

        # Go to next page
        try:
            next_button = driver.find_element(By.LINK_TEXT, "next ›")
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(2)  # Important: let page load
        except Exception:
            print("No more pages")
            break

    return data

In [55]:
# Main scraping process
if __name__ == "__main__":
    # Scrape each listed subreddit, merge the results, apply a final length
    # filter and write what remains to a timestamped CSV file.
    subreddits = ["confession"]
    # subreddits = ["confessions", "offmychest", "TrueOffMyChest", "guilt", "regret"]
    all_data = []

    for sub in subreddits:
        try:
            sub_data = scrape_subreddit(sub)
            all_data.extend(sub_data)
            print(f"Scraped {len(sub_data)} posts from r/{sub}")
        except Exception as e:
            # One failing subreddit must not abort the remaining ones.
            print(f"Failed to scrape r/{sub}: {str(e)}")

    # Create DataFrame and save
    if all_data:
        df = pd.DataFrame(all_data)
        print(f"Total posts scraped: {len(df)}")

        # Final cleaning
        df["content"] = df["content"].apply(clean_text)
        df = df[df["content"].str.len() > 50]  # Final length filter
        print(f"Posts kept after length filter: {len(df)}")

        if df.empty:
            # Avoid writing a header-only CSV when the filter removed everything.
            print("No posts left after length filter")
        else:
            # Save results
            timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
            filename = f"reddit_confessions_{timestamp}.csv"
            # utf-8-sig writes a BOM at the start of the file.
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Saved to {filename}")
    else:
        print("No data scraped")

Scraping r/confession...
Timeout waiting for posts
Scraped 94 posts from r/confession
Total posts scraped: 94
Saved to reddit_confessions_20250622_155344.csv
