## Imports

In [1]:
import requests
import pandas as pd
import time

## Scraping subreddit posts using the Pushshift API

In [2]:
# Scrape posts from r/theonion and r/nottheonion through the Pushshift submission search endpoint.
# For each subreddit we page backwards in time until `posts_to_scrape` non-repost posts are collected
# (or the API has nothing more to return), then pool everything into `posts_master`.
subreddits = ["theonion", "nottheonion"]
url = "https://api.pushshift.io/reddit/search/submission"
# number of unique (non-repost) posts to collect from each subreddit
posts_to_scrape = 2000
# only posts from before 01 Jan 2022 12:00:00 AM GMT are scraped
# 01 Jan 2022 12:00:00 AM GMT is 1640995200 in unix time
scrape_before_utc = 1640995200
posts_master = []

for subreddit in subreddits:
    # build a fresh params dict per subreddit so no paging state leaks from the previous subreddit
    params = {"size": 100, "subreddit": subreddit, "before": scrape_before_utc}
    posts = []
    # titles and urls already kept for this subreddit, used for repost detection
    url_titles_set = set()

    while len(posts) < posts_to_scrape:
        # saving the length of posts at the start of the iteration to print progress
        posts_start_len = len(posts)

        # PushShift's request limit is 60 requests per minute as of Jan 2022
        # time.sleep(1) keeps the fastest possible request rate at 1 request per second
        time.sleep(1)

        # a timeout stops a stalled connection from hanging the notebook indefinitely
        res = requests.get(url, params=params, timeout=30)
        # fail loudly on an HTTP error (e.g. rate limiting) instead of failing later while parsing the body
        res.raise_for_status()
        batch = res.json().get("data", [])

        # an empty page means there is nothing older left to fetch; without this guard the loop
        # would either spin forever or fail when indexing into an empty list
        if not batch:
            print(f"No more posts available from r/{subreddit}, stopping at {len(posts)} posts")
            break

        for post in batch:
            # posts could be reposts for karma farming, we exclude reposts so that there are no repeated data points
            # reposts can take the form of same title same url, same title different url or different title same url
            # only append to posts if it is not a repost
            if post['title'] not in url_titles_set and post['url'] not in url_titles_set:
                posts.append(post)
                # remember the title and url for repost searching
                url_titles_set.add(post['title'])
                url_titles_set.add(post['url'])

            # exit loop once posts_to_scrape posts have been added
            if len(posts) >= posts_to_scrape:
                break

        # print progress of scraping and request status code
        print(f"Posts #{posts_start_len + 1} to #{len(posts)} from r/{subreddit} request status code: {res.status_code}")
        # Move the cursor to the last post of this *response*, not the last post we kept: if a whole
        # page consists of reposts, nothing is appended and a cursor based on `posts` would never advance.
        # NOTE(review): assumes the API returns results newest-first, so the last item is the earliest - confirm
        params["before"] = batch[-1]['created_utc']

    # extend() instead of append() as we don't want list of lists
    posts_master.extend(posts)
    # separator for progress printing
    print("=====")

Posts #1 to #98 from r/theonion request status code: 200
Posts #99 to #193 from r/theonion request status code: 200
Posts #194 to #287 from r/theonion request status code: 200
Posts #288 to #382 from r/theonion request status code: 200
Posts #383 to #476 from r/theonion request status code: 200
Posts #477 to #571 from r/theonion request status code: 200
Posts #572 to #666 from r/theonion request status code: 200
Posts #667 to #763 from r/theonion request status code: 200
Posts #764 to #845 from r/theonion request status code: 200
Posts #846 to #939 from r/theonion request status code: 200
Posts #940 to #1033 from r/theonion request status code: 200
Posts #1034 to #1128 from r/theonion request status code: 200
Posts #1129 to #1220 from r/theonion request status code: 200
Posts #1221 to #1311 from r/theonion request status code: 200
Posts #1312 to #1404 from r/theonion request status code: 200
Posts #1405 to #1494 from r/theonion request status code: 200
Posts #1495 to #1585 from r/theon

In [3]:
# Build a single DataFrame from the pooled post records and preview it
df = pd.DataFrame(data=posts_master)
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(n=5)

(4000, 77)


Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,media,media_embed,secure_media,secure_media_embed,gallery_data,is_gallery,media_metadata,author_flair_background_color,author_flair_text_color,author_cakeday
0,[],False,mothershipq,,[],,text,t2_4negm,False,False,...,,,,,,,,,,
1,[],False,-ImYourHuckleberry-,,[],,text,t2_g3p2c,False,False,...,,,,,,,,,,
2,[],False,dwaxe,,[],,text,t2_3jamc,False,False,...,,,,,,,,,,
3,[],False,dwaxe,,[],,text,t2_3jamc,False,False,...,,,,,,,,,,
4,[],False,dwaxe,,[],,text,t2_3jamc,False,False,...,,,,,,,,,,


In [4]:
# Verifying that the target number of posts (posts_to_scrape) was scraped from each subreddit
subreddit_counts = df['subreddit'].value_counts()
# fail the run if any subreddit fell short, rather than relying on reading the output by eye
assert (subreddit_counts == posts_to_scrape).all(), "a subreddit does not have the expected number of posts"
subreddit_counts

nottheonion    2000
TheOnion       2000
Name: subreddit, dtype: int64

In [5]:
# Verifying that every post scraped from each subreddit has a distinct id i.e. no duplicate posts
unique_ids = df.groupby('subreddit')['id'].nunique()
# the number of distinct ids must match the per-subreddit scrape target
assert (unique_ids == posts_to_scrape).all(), "duplicate or missing post ids found"
unique_ids

subreddit
TheOnion       2000
nottheonion    2000
Name: id, dtype: int64

In [6]:
# Verifying that every post scraped from each subreddit has a distinct title i.e. no reposts based on post title
unique_titles = df.groupby('subreddit')['title'].nunique()
# the number of distinct titles must match the per-subreddit scrape target
assert (unique_titles == posts_to_scrape).all(), "reposts with a repeated title found"
unique_titles

subreddit
TheOnion       2000
nottheonion    2000
Name: title, dtype: int64

In [7]:
# Verifying that every post scraped from each subreddit has a distinct url i.e. no reposts based on post url
unique_urls = df.groupby('subreddit')['url'].nunique()
# the number of distinct urls must match the per-subreddit scrape target
assert (unique_urls == posts_to_scrape).all(), "reposts with a repeated url found"
unique_urls

subreddit
TheOnion       2000
nottheonion    2000
Name: url, dtype: int64

In [8]:
# Summarise every column's non-null count and dtype to see which columns contain null values.
# df.info() prints its report itself, so no print() wrapper is needed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 77 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   all_awardings                  4000 non-null   object 
 1   allow_live_comments            4000 non-null   bool   
 2   author                         4000 non-null   object 
 3   author_flair_css_class         3 non-null      object 
 4   author_flair_richtext          3949 non-null   object 
 5   author_flair_text              1 non-null      object 
 6   author_flair_type              3949 non-null   object 
 7   author_fullname                3949 non-null   object 
 8   author_is_blocked              2564 non-null   object 
 9   author_patreon_flair           3949 non-null   object 
 10  author_premium                 3949 non-null   object 
 11  awarders                       4000 non-null   object 
 12  can_mod_post                   4000 non-null   b

In [9]:
# Restrict the frame to the handful of columns we want to keep
columns_to_keep = ['author', 'created_utc', 'domain', 'full_link', 'id', 'subreddit', 'title', 'url']
df = df[columns_to_keep]

In [10]:
# count the missing values in each of the retained columns
null_counts = df.isna().sum()
null_counts

author         0
created_utc    0
domain         0
full_link      0
id             0
subreddit      0
title          0
url            0
dtype: int64

No null values were found in any of the retained columns.

## Export to CSV

In [11]:
# write the cleaned posts to disk without the index column
output_path = '../data/theonion_nottheonion_posts.csv'
df.to_csv(output_path, index=False, encoding='utf-8-sig')