In [67]:
import queue
import json
import sys
import csv
import re
import bs4

import util

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
#from selenium.webdriver.common.keys import Keys
#from selenium.webdriver.common.by import By

import requests
import urllib
import time
import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO
import random

In [68]:
# def dict_to_csv(data_iter, index_filename):
#     """
#     Creates a csv file from a dictionary, mapping integers to individual words
#     Inputs:
#         data_iter (iterable): maps course index to a set of words
#         index_file_name (str): name of the csv file
#     Returns: None, creates a csv file
#     """
#     with open(index_filename, 'w', newline='', encoding="utf-8") as csvfile:
#         csv_writer = csv.writer(csvfile, delimiter='|')
#         for course_index, word_set in data_iter.items():
#             for word in word_set:
#                 csv_writer.writerow([course_index, word])

In [None]:
### HELPERS ###
# OVERALL TASK IS TO SCRAPE REDDIT FOR POSTS AND COMMENTS

## SHOULD CHANGE THE ATTRS FUNCS TO ADD SUBREDDIT ##

## SHOULD ALSO ADD WHETHER OR NOT IT IS A COMMENT AND A LINK TO THE ORIGINAL POST

def get_next_page(soup):
    """
    Find the URL of the next listing page of a subreddit.

    Looks up the ``<span class="next-button">`` element and returns the
    ``href`` of the anchor nested inside it.

    Input: soup (bs4 soup): an html soup of a subreddit page
    Returns (str or None): the href of the next page, or None when the page
        has no next button or the button contains no anchor
    """
    next_button = soup.find("span", class_="next-button")
    if not next_button:
        return None

    # A next-button span without an <a> inside would otherwise raise
    # AttributeError on None; treat it the same as "no next page".
    anchor = next_button.find("a")
    if anchor is None:
        return None

    return anchor.get("href")
    

def reddit_crawler(domain_URL, r_headers, wait_times, max_pages, csv_filename,
                   timeout=30):
    """
    Crawls a subreddit listing page by page, following the "next" button.

    Every successfully fetched page is appended to a CSV file (its URL and
    its html), and the comment links found on it are collected.

    Inputs:
        domain_URL (str): the URL of the subreddit
        r_headers (dict): headers for the request
        wait_times (tuple of numbers): min and max time (in seconds) to wait
            between requests
        max_pages (int): limits the number of pages to crawl
        csv_filename (str): the name of the csv file to store page soups
        timeout (number): seconds to wait for each HTTP response before
            giving up on the request (default 30)
    Returns:
        visited_pages (list) of page urls
        visited_pages_soup (list) of page soups (same len as visited_pages)
        urls_to_visit (list) of all comment urls on every page

        also appends the link and soup of every fetched page to the .csv file
        (a header row is written first if the file is empty)

    The crawl stops early, returning what was collected so far, when a
    request fails, a non-200 response is received, or no next page is found.
    """
    visited_pages = []
    urls_to_visit = []
    visited_pages_soup = []
    num_pages_visited = 0
    curr_url = domain_URL  # start from the subreddit's first listing page
    min_time, max_time = wait_times
    comment_link_class = re.compile(r"bylink comments")  # loop-invariant

    # the file is opened in append mode so repeated runs extend the same CSV
    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)

        # a position of 0 means the file is empty, so it still needs a header
        if csvfile.tell() == 0:
            csv_writer.writerow(["link", "soup"])

        while num_pages_visited < max_pages:
            # the counter tracks attempted pages, including a failing one
            num_pages_visited += 1
            print("Pages visited:", num_pages_visited)

            # a timeout keeps one stalled connection from hanging the crawl,
            # and a network error ends the crawl instead of discarding the
            # pages gathered so far
            try:
                request = requests.get(curr_url, headers=r_headers,
                                       timeout=timeout)
            except requests.RequestException as err:
                print(f"Request failed ({err}). Couldn't get URL: {curr_url}.")
                break

            # give the server a break
            time.sleep(random.uniform(min_time, max_time))

            # if unable to get current URL request, return what we have
            if request.status_code != 200:
                print(f"Response {request.status_code}. Couldn't get URL: {curr_url}.")
                break

            soup_page = bs4.BeautifulSoup(request.text, "html.parser")

            # record the page right away (csv stringifies the soup) and flush
            # so an interrupted run keeps the rows written so far
            csv_writer.writerow([curr_url, soup_page])
            csvfile.flush()

            visited_pages_soup.append(soup_page)
            visited_pages.append(curr_url)

            # collect the comment links of every post on this page
            links_html = soup_page.find_all("a", class_=comment_link_class)
            urls_to_visit += [link.get("href") for link in links_html]

            next_page = get_next_page(soup_page)
            if not next_page:
                break
            curr_url = next_page

    return visited_pages, visited_pages_soup, urls_to_visit


def find_post_attrs(post):
    '''
    For posts, find important attributes for a Reddit post.

    Each attribute is looked up independently and on a best-effort basis: an
    attribute whose lookup fails (for example because an element is missing)
    is reported as None instead of raising.

    Inputs:
        post (bs4 soup): a soup of a reddit post
    Returns a list of 9 attributes of the post, in this order:
        [user, user_flair, title, post_text, post_date, post_flair,
         score, n_comments, link]
        (a list of 9 Nones when post is falsy)
    '''
    if not post:
        return [None]*9

    def _attempt(lookup):
        # Only ordinary errors are swallowed. A bare ``except:`` would also
        # swallow KeyboardInterrupt/SystemExit and make a long scrape
        # impossible to interrupt cleanly.
        try:
            return lookup()
        except Exception:
            return None

    flair_class = re.compile("flair")
    title_class = re.compile("title")

    # the author, comment count and timestamp are read from the first <div>
    user = _attempt(lambda: post.find("div").get("data-author"))
    user_flair = _attempt(
        lambda: post.find("p", class_="tagline").find("span", class_=flair_class).text)
    title = _attempt(lambda: post.find("a", class_=title_class).text)
    post_text = _attempt(lambda: post.find("div", class_="md").text)
    post_date = _attempt(lambda: post.find("div").find("time").get("datetime"))
    post_flair = _attempt(
        lambda: post.find("p", class_=title_class).find("span", class_=flair_class).text)
    score = _attempt(lambda: post.find("div", class_="score unvoted").text)
    n_comments = _attempt(lambda: post.find("div").get("data-comments-count"))
    link = _attempt(lambda: post.find("a", class_=re.compile(r"comments")).get("href"))

    return [user, user_flair, title, post_text,
            post_date, post_flair, score, n_comments, link]


def find_com_attrs(comment, link):
    '''
    For comments, find important attributes for a Reddit comment.

    Each attribute is looked up independently and on a best-effort basis: an
    attribute whose lookup fails (missing element, non-numeric score, ...) is
    reported as None instead of raising.

    Inputs:
        comment (bs4 soup): a soup of a reddit comment
        link (str): the link to the original post
    Returns a list of 9 attributes laid out like the post attribute list:
        [user, user_flair, None, com_text, com_date, None, com_score,
         None, link]
        The None slots are the post-only fields (title, post flair, number of
        comments). A list of 9 Nones is returned when comment is falsy.
    '''
    if not comment:
        return [None]*9

    def _attempt(lookup):
        # Only ordinary errors are swallowed. A bare ``except:`` would also
        # swallow KeyboardInterrupt/SystemExit and make a long scrape
        # impossible to interrupt cleanly.
        try:
            return lookup()
        except Exception:
            return None

    user = _attempt(lambda: comment.find("a", class_=re.compile("author")).text)
    user_flair = _attempt(
        lambda: comment.find("span", class_=re.compile(r"flair")).get("title"))
    com_text = _attempt(lambda: comment.find("div", class_="md").text)
    com_date = _attempt(lambda: comment.find("time").get("datetime"))
    # the score is read from the span's "title" attribute and cast to int
    com_score = _attempt(
        lambda: int(comment.find("span", class_="score unvoted").get("title", 0)))

    return [user, user_flair, None, com_text,
            com_date, None, com_score, None, link]


def reddit_scraper(page_soups, r_headers, wait_times, csv_filename, max_com,
                   timeout=30):
    '''
    Scrape the post and comment attributes for every post linked from the
    given subreddit listing pages, and return the soups of the fetched posts.

    For each listing page, every post's comments link is requested; the post
    attributes and the attributes of up to max_com comments are appended to
    the CSV file as they are scraped.

    Inputs:
        page_soups (list): list of soups for each page
        r_headers (dict): headers for the request
        wait_times (tuple of numbers): min and max time (in seconds) to wait
            between requests
        csv_filename (str): the name of the csv file to store post and comment data
        max_com (int): limits the number of comments per post to scrape
        timeout (number): seconds to wait for each HTTP response before
            giving up on the request (default 30)
    Returns
        all_post_soups (list): list of soups for each post that was fetched

    Pages without a listing table, listing entries without a comments link,
    posts without a comment section, and posts whose request fails are
    skipped rather than aborting the whole scrape.
    '''
    min_time, max_time = wait_times
    all_post_soups = []

    # loop-invariant patterns
    thing_class = re.compile(r"thing")
    bylink_class = re.compile("bylink")
    sitetable_class = re.compile(r"sitetable")

    # the file is opened in append mode so repeated runs extend the same CSV
    with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        # a position of 0 means the file is empty, so it still needs a header
        if csvfile.tell() == 0:
            csv_writer.writerow(["user", "user_flair", "title", "post_text",
                                 "post_date", "post_flair", "score",
                                 "n_comments", "link", "is_comment"])

        for page_soup in page_soups:
            site_table = page_soup.find("div", id="siteTable")
            if site_table is None:
                # without this guard a page lacking the listing table would
                # raise AttributeError and end the whole run
                print("Skipping a page without a siteTable listing.")
                continue

            posts = site_table.find_all("div", class_=thing_class)

            # a "thing" div without a bylink anchor would raise
            # AttributeError on None, so such entries are skipped
            posts_links = []
            for entry in posts:
                anchor = entry.find("a", class_=bylink_class)
                href = anchor.get("href") if anchor is not None else None
                if href:
                    posts_links.append(href)

            for post_link in posts_links:
                # give the server a break
                time.sleep(random.uniform(min_time, max_time))

                # a timeout keeps one stalled connection from hanging the
                # scrape; a network error only costs this one post
                try:
                    request = requests.get(post_link, headers=r_headers,
                                           timeout=timeout)
                except requests.RequestException as err:
                    print(f"!!!! FAILURE !!!!\nRequest failed ({err}). Couldn't get URL: {post_link}.")
                    continue

                if request.status_code != 200:
                    # if we fail to get request, find the link that did it
                    print(f"!!!! FAILURE !!!!\nResponse {request.status_code}. Couldn't get URL: {post_link}.")
                    continue

                # find the post and record it (is_comment = False)
                post_soup = bs4.BeautifulSoup(request.text, "html.parser")
                post = post_soup.find("div", class_=sitetable_class)
                post_attrs = find_post_attrs(post)
                all_post_soups.append(post_soup)
                csv_writer.writerow(post_attrs + [False])

                # find the comment section; a post page without one has no
                # comment rows to write
                comment_section = post_soup.find("div", class_="sitetable nestedlisting")
                if comment_section is None:
                    continue
                comments = comment_section.find_all("div", class_="entry unvoted")

                # record at most max_com comments (is_comment = True)
                for comment in comments[:max_com]:
                    csv_writer.writerow(find_com_attrs(comment, post_link) + [True])

    return all_post_soups


def get_data(domain_URL, r_headers, wait_times, max_pages, max_com, pages_csv, posts_csv):
    """
    Run the full pipeline for one subreddit: crawl its listing pages, then
    scrape the posts and comments linked from those pages.

    inputs:
        domain_URL (str): the URL of the subreddit
        r_headers (dict): headers for the request
        wait_times (tuple of ints): min and max time to wait between requests
        max_pages (int): limits the number of pages to crawl
        max_com (int): limits the number of comments per post to scrape
        pages_csv (str): the name of the csv file to store page soups
        posts_csv (str): the name of the csv file to store post and comment data

    Returns a 2-tuple (page soups, post soups), kept for debugging purposes.
    Two CSV files are written along the way: pages_csv by the crawler (page
    soups) and posts_csv by the scraper (the actual post and comment data).
    """
    # stage 1: crawl the listing pages; only the soups (middle element of
    # the crawler's result) are needed here
    crawl_result = reddit_crawler(domain_URL, r_headers, wait_times, max_pages, pages_csv)
    page_soups = crawl_result[1]
    print("##### DONE CRAWLING #####\nMOVING ON TO SCRAPING FOR POSTS AND COMMENTS")

    # stage 2: visit every post found on those pages
    scraped_post_soups = reddit_scraper(page_soups, r_headers, wait_times, posts_csv, max_com)
    print("##### DONE SCRAPING #####")

    return page_soups, scraped_post_soups

In [92]:
# RUN FOR REAL
# get_data returns a (page_soups, post_soups) tuple; unpack it so the post
# soups are what gets saved, instead of a 2-element Series holding both lists.
soups, adopted_post_soups = get_data("https://old.reddit.com/r/Adopted/", {"User-Agent": "Ethan K."}, (1, 2), 1000, 150, 'adopt_pages.csv', "adopt_posts.csv")

# store one row per post, with the soup serialized to its html string
pd.Series([str(post_soup) for post_soup in adopted_post_soups]).to_csv("all_adopted_posts_soups.csv")

Pages visited: 1
Pages visited: 2
Pages visited: 3
Pages visited: 4
Pages visited: 5
Pages visited: 6
Pages visited: 7
Pages visited: 8
Pages visited: 9
Pages visited: 10
Pages visited: 11
Pages visited: 12
Pages visited: 13
Pages visited: 14
Pages visited: 15
Pages visited: 16
Pages visited: 17
Pages visited: 18
Pages visited: 19
Pages visited: 20
Pages visited: 21
Pages visited: 22
Pages visited: 23
Pages visited: 24
Pages visited: 25
Pages visited: 26
Pages visited: 27
Pages visited: 28
Pages visited: 29
Pages visited: 30
Pages visited: 31
Pages visited: 32
Pages visited: 33
Pages visited: 34
Pages visited: 35
Pages visited: 36
Pages visited: 37
Pages visited: 38
Pages visited: 39
Pages visited: 40
##### DONE CRAWLING #####
MOVING ON TO SCRAPING FOR POSTS AND COMMENTS
##### DONE SCRAPING #####


In [93]:
# TEST
# get_data returns a (page_soups, post_soups) tuple; unpack it so the post
# soups are what gets saved, instead of a 2-element Series holding both lists.
soups2, adoption_post_soups = get_data("https://old.reddit.com/r/Adoption/", {"User-Agent": "Ethan Koz"}, (1.1, 2.1), 1000, 1500, 'adoption_pages.csv', "adoption_posts.csv")
# store one row per post, with the soup serialized to its html string
pd.Series([str(post_soup) for post_soup in adoption_post_soups]).to_csv("all_adoption_posts_soups.csv")

Pages visited: 1
Pages visited: 2
Pages visited: 3
Pages visited: 4
Pages visited: 5
Pages visited: 6
Pages visited: 7
Pages visited: 8
Pages visited: 9
Pages visited: 10
Pages visited: 11
Pages visited: 12
Pages visited: 13
Pages visited: 14
Pages visited: 15
Pages visited: 16
Pages visited: 17
Pages visited: 18
Pages visited: 19
Pages visited: 20
Pages visited: 21
Pages visited: 22
Pages visited: 23
Pages visited: 24
Pages visited: 25
Pages visited: 26
Pages visited: 27
Pages visited: 28
Pages visited: 29
Pages visited: 30
Pages visited: 31
Pages visited: 32
Pages visited: 33
Pages visited: 34
Pages visited: 35
Pages visited: 36
Pages visited: 37
Pages visited: 38
Pages visited: 39
Pages visited: 40
##### DONE CRAWLING #####
MOVING ON TO SCRAPING FOR POSTS AND COMMENTS
##### DONE SCRAPING #####


In [140]:
# URL stored in the row labelled 39 of the crawled-pages CSV
pd.read_csv("adoption_pages.csv").at[39, "link"]

'https://old.reddit.com/r/Adoption/?count=975&after=t3_15u1ekj'

In [None]:
# NOTE: the crawler and scraper open their CSV files in append mode, so
# re-running this against the same file names adds rows to the existing files.
# get_data returns a (page_soups, post_soups) tuple; unpack it so the post
# soups are what gets saved, instead of a 2-element Series holding both lists.
soups3, adoption_post_soups3 = get_data("https://old.reddit.com/r/Adoption/", {"User-Agent": "Ethan Koz"}, (1.1, 2.1), 1000, 1500, 'adoption_pages.csv', "adoption_posts.csv")
# store one row per post, with the soup serialized to its html string
pd.Series([str(post_soup) for post_soup in adoption_post_soups3]).to_csv("all_adoption_posts_soups.csv")

In [None]:
# # finding all posts on second website
# soups[1]
# site_table = soups[1].find("div", id="siteTable")
# posts = site_table.find_all("div", class_=re.compile(r"thing"))

In [None]:
# links = [post.find("a", class_= re.compile("bylink")).get("href") for post in posts]

In [None]:
### OLD ###

# headers = {"User-Agent": "For school practice, ethanjkozlowski@uchicago.edu"}
# skipped = []
# all_post_soups = []

# with open("all_posts_new_1_30.csv", 'a', newline='', encoding='utf-8') as csvfile:
#     csv_writer = csv.writer(csvfile)
#     if csvfile.tell() == 0:
#         csv_writer.writerow(["user", "user_flair", "title", "post_text", 
#                   "post_date", "post_flair", "score", "n_comments", "link"])
    
#     for soup in soups:
#         site_table = soup.find("div", id="siteTable")
#         posts = site_table.find_all("div", class_=re.compile(r"thing"))
#         links = [post.find("a", class_= re.compile("bylink")).get("href") for post in posts]

#         for link in links:
#             time.sleep(1.8)
#             request = requests.get(link, headers = headers)
            
#             if request.status_code == 200:
#                 soup_2 = bs4.BeautifulSoup(request.text, "html.parser")
#                 post = soup_2.find("div", class_=re.compile(r"sitetable"))
#                 post_attrs = find_post_attrs(post)
#                 all_post_soups.append(soup_2)
#                 csv_writer.writerow(post_attrs)
#             else:
#                 skipped.append(link)

In [None]:
# #### OLD ####
# def reddit_crawler(domain_URL, r_headers, wait_time, max_pages, csv_filename):
#     """
#     Crawls a subreddit for all posts, and comments
#     """
    
#     all_posts = []
#     visited_urls = set()

#     # add the starting URL to the queue
#     curr_url = domain_URL
    
#     # code improved using Chat GPT3.5
#     # prompt: how to dynamically update a csv while you scrape a website
#     with open(csv_filename, 'a', newline='', encoding='utf-8') as csvfile:
#         # Create a CSV writer object
#         csv_writer = csv.writer(csvfile)
#         # Check if the file is empty (write header if needed)
#         if csvfile.tell() == 0:
#             csv_writer.writerow(['user',
#                                  'user_flair',
#                                  'title',
#                                  "post_link"
#                                  'post_text'
#                                  'post_date'
#                                  'post_flair'
#                                  'score',
#                                  "n_comments",
#                                  ])
        
#         while len(visited_urls) < max_pages:
#             time.sleep(wait_time)
#             request = requests.get(curr_url, headers = r_headers)
#             links = []
            
#             if request.status_code == 200:
#                 soup = bs4.BeautifulSoup(request.text, "html.parser")
                

#                 site_table = soup.find("div", id="siteTable")
#                 posts = site_table.find_all("div", class_=re.compile(r"thing"))
#                 links = [post.find("a", class_= re.compile("bylink")).get("href") for post in posts]

#                 for link in links:
#                     time.sleep(wait_time)
#                     request = requests.get(link, headers = r_headers)
                    
#                     if request.status_code == 200:
#                         soup = bs4.BeautifulSoup(request.text, "html.parser")
                        
#                         post = find_post_attrs(soup)
#                         # Write the scraped data to the CSV file
#                         csv_writer.writerow(post)
#                         all_posts.append(post)

#                 curr_url = get_next_page(soup)
                                    
#             else:
#                 print(f"Request could not be retrieved: {request}")
#     return 

In [None]:
# # testing function
# headers = {"User-Agent": "student practice"}
# a = reddit_crawler("https://old.reddit.com/r/Adopted/", headers, 10, 2, "test.csv")

In [None]:
# for link in problem_urls:
#     request = requests.get(link, headers = headers)

#     if request.status_code == 200:
#         soup = bs4.BeautifulSoup(request.text, "html.parser")
#         time.sleep(2)
#         print(find_post_attrs(soup))
# # 0 user, 1 user_flair, 2 title, 3 post_text, 4 post_date, 5 post_flair, 6 score, 7 n_comments

In [None]:
# soup.find("div", class_=re.compile(r"sitetable"))


In [None]:
# pd.read_csv("all_posts_with_href.csv")