In [1]:
import os
import re
import time

import pandas as pd
import praw

### Form Subreddit Index
First, let's go through a comprehensive list of subreddits, and find meme subreddits from which we can pull images in bulk.  
According to this subreddit list I've pulled online (https://frontpagemetrics.com/list-all-subreddits),  
there are 3 million subreddits, so we need to filter them down quite a bit!!  
  
#### Index Parameters
* Subreddit must have 10,000+ subscribers
* Subreddit must contain the word "meme" in its name or description
* No subreddits about fandoms, YouTube/IG/TikTok personalities, specific professions or hobbies.    

That leaves us with political subreddits, subreddits devoted to specific memes, and country/sexuality/religion/region-based meme/shitposting/circlejerk subreddits.

In [2]:
# Build the candidate list of subreddits to sample from, starting from the
# full subreddit dump.
subreddits = pd.read_csv('subreddits-2021-07-23.csv')

# Automatic filter: more than MIN_SUBSCRIBERS subscribers AND "meme" somewhere
# in the description or the name. The result is then reviewed by hand to remove
# pages about pop culture/intellectual products, hobbies, etc., so that only
# dedicated meme pages and political pages remain.
MIN_SUBSCRIBERS = 10000

# na=False: a missing description/name counts as "no match" instead of putting
#           NaN into the boolean mask.
# case=False: "Meme" / "MEMES" are matched as well as lowercase "meme".
mentions_meme = (
    subreddits['desc'].str.contains('meme', case=False, na=False)
    | subreddits['real_name'].str.contains('meme', case=False, na=False)
)
meme_subs = subreddits[(subreddits['subs'] > MIN_SUBSCRIBERS) & mentions_meme]

# Save the filtered dataset here, then review it manually.
meme_subs.to_csv('meme-subreddits-2021.csv')

### Scraping images from subreddits

In [2]:
# Initialize our Reddit API client.
#
# Credentials are read from environment variables so that no secret is stored
# in the notebook. Set these before starting the kernel:
#   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD
# A missing variable raises KeyError here, which is preferable to a confusing
# authentication failure later in the scrape.
#
# SECURITY: any client secret or account password that was ever committed in
# plain text in this cell must be treated as leaked and rotated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='reddit-meme-analysis 0.1.1 by /u/inkoh',
    username=os.environ['REDDIT_USERNAME'],
    password=os.environ['REDDIT_PASSWORD'],
)
# The notebook only reads listings, so keep the client in read-only mode.
reddit.read_only = True

In [3]:
# Load the subreddit index (presumably the manually reviewed version of the
# filtered list -- TODO confirm) and keep only the column of subreddit names.
# Note: this rebinds `subreddits`, which held the full DataFrame in the
# filtering cell, to a Series of names.
subreddit_index = pd.read_csv('subreddit-index.csv')
subreddits = subreddit_index.loc[:, "real_name"]

In [4]:
# mass post scraping method
def scrape_subreddits(subreddits, output_path='scraped-posts-bulk.csv', delay_seconds=15):
    """Scrape image posts from every subreddit in ``subreddits``.

    After each subreddit is processed, all posts collected so far are written
    to ``output_path`` as a checkpoint, so an interrupted run still leaves a
    usable CSV on disk.

    Args:
        subreddits: iterable of subreddit names.
        output_path: CSV file that receives the cumulative results after each
            subreddit.
        delay_seconds: pause between two consecutive subreddits. No pause is
            made after the last one. Use 0 to disable.

    Returns:
        list of post dicts as produced by ``scrape_posts_from_subreddit``,
        in scraping order.
    """
    posts = []
    for position, sub in enumerate(subreddits):
        # Pause *between* subreddits only: nothing to wait for before the
        # first one or after the last one.
        if position and delay_seconds > 0:
            time.sleep(delay_seconds)
        posts.extend(scrape_posts_from_subreddit(sub))
        # Checkpoint the cumulative result so far.
        pd.DataFrame(posts).to_csv(output_path)
    return posts


# Matches URLs that end in a common image file extension, ignoring case.
# The case-insensitive flag is passed to re.compile instead of being written
# inline as "(?i)" in the middle of the pattern: Python 3.11+ rejects a global
# inline flag that is not at the start of the expression.
_IMAGE_URL_PATTERN = re.compile(r'\S+\.(?:jpe?g|png|gif|bmp|webp)$', re.IGNORECASE)


def scrape_posts_from_subreddit(sub_name):
    """Collect the past year's top image posts of one subreddit.

    Iterates over ``reddit.subreddit(sub_name).top(time_filter="year")`` using
    the module-level ``reddit`` client and keeps only submissions whose URL
    ends in an image extension (jpg/jpeg, png, gif, bmp, webp).

    Args:
        sub_name: name of the subreddit, without the "r/" prefix.

    Returns:
        list of dicts, one per image post, with the keys ``reddit_id``,
        ``image_url``, ``title``, ``author``, ``subreddit``, ``permalink``,
        ``num_comments``, ``num_upvotes`` and ``upvote_ratio``. If an error
        occurs part-way through, the posts gathered up to that point are
        returned and the error is printed (best-effort scraping).

    Only ``Exception`` subclasses are caught, so KeyboardInterrupt and
    SystemExit propagate and a long scrape can still be interrupted.
    """
    posts = []
    try:
        query = reddit.subreddit(sub_name).top(time_filter="year", limit=None)
        for submission in query:
            if not _IMAGE_URL_PATTERN.search(submission.url):
                continue
            posts.append({
                "reddit_id": submission.id,
                "image_url": submission.url,
                "title": submission.title,
                # NOTE(review): presumably a PRAW Redditor object (or None for
                # deleted accounts) rather than a plain string -- confirm.
                "author": submission.author,
                "subreddit": sub_name,
                "permalink": "https://reddit.com/r/" + sub_name + "/comments/" + submission.id,
                "num_comments": submission.num_comments,
                "num_upvotes": submission.score,
                "upvote_ratio": submission.upvote_ratio
            })
    except Exception as exc:
        # Best effort: report which subreddit failed and why, keep what we have.
        print("an error occurred while scraping r/{}: {!r}".format(sub_name, exc))
    return posts


In [None]:
# Run the bulk scrape over every subreddit name in `subreddits`.
# WARNING: long-running cell -- a full pass takes roughly 2.5 hours.
posts = scrape_subreddits(subreddits=subreddits)

In [6]:
# Shuffle the scraped posts into a random order and save them.
# A fixed seed keeps the shuffle reproducible across Restart & Run All.
SHUFFLE_SEED = 42

raw_posts = pd.read_csv('scraped-posts-bulk.csv')
shuffled_posts = raw_posts.sample(frac=1, random_state=SHUFFLE_SEED)
# NOTE(review): both CSVs are written with pandas' default index, so the bulk
# file's index comes back from read_csv as an "Unnamed: 0" column and is
# carried into raw-posts.csv. Left as-is in case downstream code expects it.
shuffled_posts.to_csv('raw-posts.csv')