## Import libraries and set up html request function for scraping

In [25]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

In [26]:
#html request
def html_data(url):
    '''Download ``url`` and parse it for the reddit listing container.

    Parameters:
        url: address of the page to request.

    Returns:
        (threads, soup) where ``soup`` is the parsed page and ``threads`` is
        every ``<div>`` whose id is ``siteTable`` and whose class is
        ``sitetable linklisting``.

    Raises:
        Whatever ``requests.get`` raises, including a timeout error when the
        server does not respond within 30 seconds.
    '''
    # Without a timeout requests waits indefinitely, so one stalled
    # connection would freeze a long multi-page scrape.
    html = requests.get(url, headers={'User-Agent': 'whatever'}, timeout=30)
    soup = BeautifulSoup(html.text, 'lxml')
    threads = soup.find_all(name='div', attrs={'id': 'siteTable', 'class': 'sitetable linklisting'})

    return threads, soup

## Create functions for scraping reddit front page

In [76]:
def get_titles(threads, titles):
    '''Collect the link text of every ``<p class="title">`` in each listing.

    The texts are appended to ``titles`` in place, and the same list is
    returned for convenience.
    '''
    for listing in threads:
        title_tags = listing.find_all(name='p', attrs={'class': 'title'})
        titles.extend(tag.a.text for tag in title_tags)
    return titles
            
def get_subreddits(threads, subreddit):
    '''Collect the subreddit link text from every ``<p class="tagline">``.

    One value is appended to ``subreddit`` per tagline, in place, and the same
    list is returned. When a tagline has no
    ``<a class="subreddit hover may-blank">`` link the string ``'NaN'`` is
    recorded instead, so the column keeps one entry per tagline rather than
    aborting the whole scrape with an AttributeError.
    '''
    for listing in threads:
        for tagline in listing.find_all(name='p', attrs={'class': 'tagline'}):
            link = tagline.find('a', attrs={'class': 'subreddit hover may-blank'})
            # find() returns None when nothing matches; use the same 'NaN'
            # placeholder get_likes uses for missing values.
            subreddit.append('NaN' if link is None else link.text)
    return subreddit

def get_dates(threads, dates):
    '''Collect the ``datetime`` attribute of the ``<time>`` tag in every tagline.

    One value is appended to ``dates`` per ``<p class="tagline">``, in place,
    and the same list is returned. If a tagline has no ``<time>`` tag, or the
    tag has no ``datetime`` attribute, the string ``'NaN'`` is recorded so the
    column stays the same length as the others instead of raising.
    '''
    for listing in threads:
        for tagline in listing.find_all(name='p', attrs={'class': 'tagline'}):
            time_tag = tagline.find(name='time')
            if time_tag is None:
                # find() found no <time> element in this tagline.
                dates.append('NaN')
            else:
                dates.append(time_tag.get('datetime', 'NaN'))
    return dates

def get_vids(threads, vids):
    '''Flag which entries contain a video expando button.

    For every ``<div class="top-matter">`` in each listing, 1 is appended to
    ``vids`` when it contains a
    ``<div class="expando-button collapsed hide-when-pinned video">`` and 0
    otherwise. ``vids`` is modified in place and returned.
    '''
    button_attrs = {'class': 'expando-button collapsed hide-when-pinned video'}
    for listing in threads:
        for entry in listing.find_all(name='div', attrs={'class': 'top-matter'}):
            button = entry.find(name='div', attrs=button_attrs)
            # Identity check: find() signals "no match" with None.
            vids.append(0 if button is None else 1)
    return vids
    
def get_thread_url(threads, threadlink):
    '''Collect the ``href`` of the link inside every ``<p class="title">``.

    The hrefs are appended to ``threadlink`` in place, and the same list is
    returned.
    '''
    for listing in threads:
        title_tags = listing.find_all(name='p', attrs={'class': 'title'})
        threadlink.extend(tag.a['href'] for tag in title_tags)
    return threadlink
    
def get_comments(threads, comments):
    '''Collect the first word of the link text in every ``<li class="first">``.

    The words are appended to ``comments`` in place, and the same list is
    returned. The values stay strings; no numeric conversion is done here.
    '''
    for listing in threads:
        first_items = listing.find_all(name='li', attrs={'class': 'first'})
        comments.extend(
            item.find(name='a').text.split()[0] for item in first_items
        )
    return comments

#if no likes return NaN, process this with Pandas later(likely remove)
def get_likes(threads, likes):
    '''Collect the ``title`` attribute of every ``<div class="score unvoted">``.

    One value is appended to ``likes`` per score div, in place, and the same
    list is returned. A div without a ``title`` attribute contributes the
    string ``'NaN'`` so it can be cleaned up later in pandas.
    '''
    for listing in threads:
        for score in listing.find_all(name='div', attrs={'class': 'score unvoted'}):
            try:
                likes.append(score['title'])
            except KeyError:
                # Only a missing attribute is expected here; anything else is
                # a real bug and should not be silently turned into 'NaN'.
                likes.append('NaN')
    return likes

## Function to scrape and consolidate data into pandas dataframe

In [None]:
#scrape the data
def _scrape_page(threads):
    '''Run every field scraper on one page and return ``{column: values}``.

    Each scraper receives a fresh list, so a failure part-way through a page
    cannot leave the caller's accumulated columns with unequal lengths.

    Raises:
        ValueError: if the scrapers return different numbers of values, since
            such a page cannot be stored as rows of a dataframe.
    '''
    page_data = {
        'title': get_titles(threads, []),
        'subreddit': get_subreddits(threads, []),
        'submitted': get_dates(threads, []),
        'threadlink': get_thread_url(threads, []),
        'comments': get_comments(threads, []),
        'expands': get_vids(threads, []),
        'likes': get_likes(threads, []),
    }
    if len({len(values) for values in page_data.values()}) > 1:
        raise ValueError('scraped columns have different lengths')
    return page_data


def _next_page_anchor(threads):
    '''Return the value for the ``after`` query parameter, or None.

    For each listing, takes the third class name of its first ``<div>`` and
    keeps the part after the first ``-``; the last listing wins. Returns None
    when ``threads`` is empty.
    '''
    anchor = None
    for listing in threads:
        anchor = listing.find(name='div')['class'][2].split('-')[1]
    return anchor


def getFrontPageData(pages = 5, url = 'https://www.reddit.com/'):
    '''Web scrapes reddit for front page data, you can input how many pages you want to return.
       If this number exceeds the total number of pages available the function will return all results.

       If downloading or parsing a page fails, or the link to the next page cannot be built,
       scraping stops and the results gathered so far are returned. A page is only added to the
       results once all of its columns were scraped successfully, so the columns always line up.
       If you need to pick up at a later time you can input the url you want to start at by
       using the url kwarg.

       Parameters:
           pages: maximum number of pages to request.
           url: address of the first page to request.

       Returns:
           pandas DataFrame with the columns title, subreddit, submitted, threadlink,
           comments, expands and likes.'''

    #one list per dataframe column, in output order
    red_dict = {column: [] for column in
                ('title', 'subreddit', 'submitted', 'threadlink', 'comments', 'expands', 'likes')}

    #running value for the "count" query parameter of the next page url
    page_cnt = 0
    #pages whose data actually made it into red_dict
    pages_scraped = 0

    for page in range(pages):
        #download and parse one page; stop but keep earlier pages on any failure
        try:
            threads, _ = html_data(url)
            page_data = _scrape_page(threads)
        except Exception as err:
            print("Getting Data Error!", err)
            break

        for column, values in page_data.items():
            red_dict[column].extend(values)
        pages_scraped += 1

        #get hyperlink for next page, if not available this is the end of the front page, end script
        #by breaking out of for loop and creating the dataframe, increase page count by 25
        try:
            anchor = _next_page_anchor(threads)
        except (AttributeError, IndexError, KeyError, TypeError):
            anchor = None
        if anchor is None:
            #never reuse an anchor from an earlier page: that would re-request the same page
            print("Next Page Error!")
            break
        page_cnt += 25
        url = 'https://www.reddit.com/?count=' + str(page_cnt) + '&after=' + str(anchor)

        #wait 1 sec, get next page data
        time.sleep(1)

        #progress report every 1000 pages (prints the zero-based page index)
        if (page % 1000) == 0:
            print("Current Results", page)

    #printing information where scrape ends
    print("Scraped results for", pages_scraped, "reddit page(s)")
    print("Returned", len(red_dict['title']), "thread results")
    print("Last scraped Reddit page:", url)

    #return df to begin analysis
    return pd.DataFrame(red_dict)

## Scrape Reddit and return results as a pandas dataframe

In [None]:
%%time
# Run the scraper for up to 10,000 pages and keep the resulting dataframe in `red`.
# At 25 threads per page (the step the scraper uses for its page counter) this
# should return 250,000 results if no errors occur.
red = getFrontPageData(pages= 10000)

In [None]:
# Save the dataframe to 'reddit_threads.csv' (a relative path, so it lands in the
# current working directory) so the scrape does not have to be rerun.
red.to_csv('reddit_threads.csv')

## Start analysis