In [76]:
import praw
import pandas as pd
import datetime

import xlsxwriter

import requests
import urllib
from tqdm import tqdm
import time
import os
from pmaw import PushshiftAPI


In [77]:
# Adds posts from "post_list" into "posts_dict"

def _post_field(post, name):
    """Return field ``name`` of ``post``, whether it is a mapping or an object.

    ``read_subreddit`` indexes its posts like dictionaries, while
    ``simple_read`` passes objects whose fields are attributes, so both
    access styles have to be supported. A missing field still raises
    (``KeyError`` / ``AttributeError``) instead of being silently defaulted.
    """
    if isinstance(post, dict):
        return post[name]
    return getattr(post, name)


def read_posts(post_list, posts_dict):
    """Append every image post found in ``post_list`` to ``posts_dict``.

    Parameters
    ----------
    post_list : iterable
        Posts to inspect. Each post is either a dict or an object exposing
        the fields ``url``, ``title``, ``author``, ``created_utc``, ``id``,
        ``score``, ``num_comments`` and ``subreddit``.
    posts_dict : dict
        Column name -> list of values. Must already contain the list columns
        "ID", "Source", "Page", "Title", "Author", "Score", "Comments",
        "Date", "Time" and "Image URL". It is mutated in place.

    Returns
    -------
    None

    Notes
    -----
    A post counts as an image post when its URL contains one of ".jpg",
    ".png", ".tiff" or ".gif" (case-sensitive substring match). Other posts
    are skipped but still advance the progress bar.
    """
    image_markers = (".jpg", ".png", ".tiff", ".gif")

    t = tqdm(bar_format='{l_bar}{bar:50}{r_bar}{bar:-1b}')
    with t as progress_bar:
        for post in post_list:
            url = str(_post_field(post, "url"))

            if any(marker in url for marker in image_markers):
                # NOTE: fromtimestamp() renders the epoch value in the local
                # timezone of the machine running the notebook.
                created = datetime.datetime.fromtimestamp(_post_field(post, "created_utc"))

                # For object posts the subreddit exposes ``display_name``;
                # for dict posts the value is assumed to already be the
                # subreddit name - TODO confirm against the Pushshift payload.
                subreddit = _post_field(post, "subreddit")
                page = getattr(subreddit, "display_name", subreddit)

                # Collect the complete row before touching posts_dict so that
                # a failing field access cannot leave the columns with
                # different lengths.
                row = {
                    "Image URL": url,
                    "Title": _post_field(post, "title"),
                    "Author": str(_post_field(post, "author")),
                    "Date": created.strftime("%m/%d/%Y"),
                    "Time": created.strftime("%H:%M:%S"),
                    "ID": _post_field(post, "id"),
                    "Score": _post_field(post, "score"),
                    "Comments": str(_post_field(post, "num_comments")),
                    # Source is the same for every post collected this way,
                    # but it is appended per row so the column stays a list
                    # aligned with the others.
                    "Source": "Reddit",
                    "Page": page,
                }
                for column, value in row.items():
                    posts_dict[column].append(value)

            progress_bar.update()
        

In [78]:
# Reads memes from a single subreddit and writes data to "posts_dict"

def read_subreddit(subreddit, posts_dict, chunk_size, reps, use_keywords):
    """Page backwards through a subreddit via Pushshift and record its posts.

    Up to ``reps`` chunks of at most ``chunk_size`` submissions are requested.
    The first request is bounded by the current time; each later request is
    bounded by the ``created_utc`` of the last post in the previous chunk.
    Paging stops early once a chunk comes back empty. Every chunk is handed
    to ``read_posts``, which fills ``posts_dict``.

    When ``use_keywords`` is truthy, the search is restricted to submissions
    matching the keyword query below.
    """
    keyword_query = "russia|lugansk|luhansk|donetsk|ukraine|pushilin|kyiv|kiev|donbass|kharkiv|odesa|dnipro|zaporizhzhia|lviv|crimea|kryvyi|mykolaiv|sebastopol|sevastopol|mariupol|luhansk|vinnytsia|makiivka|simferopol|chernihiv|kherson|poltava|lutsk|oblast|dnipropetrovsk|zhitomir|rivne|sumy|ternopil|volin|zaporizhzhya|azov|zelensky|shmyhal|kuleba|reznikov|rada|pasechnik|yanukovych|poroshenko|yushchenko|dnieper|dniester|chernobyl|dpr|lpr|nato|dnr|lnr|maidan"

    api = PushshiftAPI()

    # None means "no chunk fetched yet"; afterwards it holds the latest chunk
    post_list = None

    for _ in range(reps):
        if post_list is None:
            last_time = int(time.time())
        elif not post_list:
            # The previous chunk was empty: nothing older is left to fetch
            break
        else:
            last_time = post_list[-1]["created_utc"]

        # Both search variants share everything except the keyword filter
        search_args = {
            "subreddit": subreddit,
            "limit": chunk_size,
            "before": last_time,
            "sort": "desc",
        }
        if (use_keywords):
            search_args["q"] = keyword_query
        search_args["sort_type"] = "created_utc"

        posts = api.search_submissions(**search_args)
        if (use_keywords):
            print("using keywords")
        else:
            print("not using keywords")

        post_list = list(posts)
        print(len(post_list))

        # Writing data to "posts_dict"
        read_posts(post_list, posts_dict)


    

In [79]:
# Converts a dictionary into a Dataframe

def get_dataframe(posts_dict):
    """Build a DataFrame from ``posts_dict`` (column name -> values).

    As a side effect this also changes two global pandas display options:
    up to 2000 rows are shown and column contents are never truncated.
    """
    display_options = (
        ('display.max_rows', 2000),
        ('max_colwidth', None),
    )
    for option_name, option_value in display_options:
        pd.set_option(option_name, option_value)

    return pd.DataFrame(posts_dict)

In [80]:
# Reads all the memes in a subreddit using PRAW (limited to 1000 memes per subreddit)

def simple_read(subreddit_name, posts_dict, limit=1000):
    """Read the top posts of one subreddit with PRAW into ``posts_dict``.

    Parameters
    ----------
    subreddit_name : str
        Name of the subreddit to read.
    posts_dict : dict
        Column name -> list of values; filled in place by ``read_posts``.
    limit : int, optional
        Maximum number of top posts to request. Defaults to 1000, which the
        original notebook notes as the maximum.

    Raises
    ------
    RuntimeError
        If the Reddit API credentials are not available in the environment.

    Notes
    -----
    The API credentials are read from the environment variables
    ``REDDIT_CLIENT_ID`` and ``REDDIT_CLIENT_SECRET`` rather than being
    stored in the notebook, so the file can be shared without leaking them.
    """
    credential_vars = ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET")
    missing = [name for name in credential_vars if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            "Reddit API credentials are not configured; set the environment variable(s): "
            + ", ".join(missing)
        )

    # Login for Reddit PRAW
    reddit_read_only = praw.Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent='Ukraine Scraper'
    )

    subreddit = reddit_read_only.subreddit(subreddit_name)

    posts = subreddit.top(limit=limit)
    read_posts(posts, posts_dict)
            
    


In [82]:
# MAIN Program

# One empty list per output column; the key order fixes the column order
posts_dict = {
    column: []
    for column in (
        "ID", "Source", "Page", "Title", "Author",
        "Score", "Comments", "Date", "Time", "Image URL",
    )
}

# Subreddits read through PRAW
simple_subreddits = [
    "ukrainememes",
    "ukraine22memes",
    "RussiaUkraineWarMemes",
    "UkrainianMemes",
    "uamemesforces",
    "UkraineMem",
    "ukrmemes",
]

# Collect the posts of every subreddit into the shared "posts_dict"
for subreddit in simple_subreddits:
    simple_read(subreddit, posts_dict)

# Build the dataframe, then drop duplicate memes: first by title, then by
# image URL, keeping the first occurrence each time
dataframe = (
    get_dataframe(posts_dict)
    .drop_duplicates(subset=["Title"], keep="first")
    .drop_duplicates(subset=["Image URL"], keep="first")
)

# From here on "posts_dict" holds the de-duplicated data in DataFrame.to_dict() form
posts_dict = dataframe.to_dict()
display(dataframe)

# Export the de-duplicated data to an Excel workbook
dataframe.to_excel("simple_subreddits.xlsx")

# ukrainememes & UkrainianMemes seem to have more than 1000 entries -> PushShift
# SIMPLE DATA: source (reddit, subreddit) + more detailed data





|                                                  | 891/? [01:10<00:00, 12.72it/s]                                     
|                                                  | 358/? [00:36<00:00,  9.87it/s]                                     
|                                                  | 175/? [00:11<00:00, 14.70it/s]                                     
|                                                  | 993/? [01:41<00:00,  9.79it/s]                                     
|                                                  | 499/? [00:45<00:00, 11.08it/s]                                     
|                                                  | 35/? [00:00<00:00, 59.79it/s]                                      
|                                                  | 246/? [00:23<00:00, 10.58it/s]                                     


Unnamed: 0,ID,Source,Page,Title,Author,Score,Comments,Date,Time,Image URL
0,tabt2e,Reddit,ukrainememes,rubles,silitbang6000,170,1,03/09/2022,11:39:52,https://i.redd.it/vxpbivt3zdm81.png
1,t9tkjm,Reddit,ukrainememes,You're Grounded,purplewhiteblack,127,2,03/08/2022,18:11:33,https://i.redd.it/jwc6d7r3s8m81.png
2,taav8h,Reddit,ukrainememes,"'Z' is for Zelensky - The Times (London), Peter Brookes (3/9/2022)",,103,0,03/09/2022,10:57:44,https://i.redd.it/rsrdgz9sqdm81.jpg
3,t7dog3,Reddit,ukrainememes,"Impact crater of Zelensky's balls, including visitors center, Cir: 2022",Tripledtities,102,0,03/05/2022,11:57:37,https://i.redd.it/r11f2gjoill81.jpg
4,xv93ln,Reddit,ukrainememes,Are we NATO yet?,arcade_b1t,92,3,10/04/2022,04:05:56,https://i.redd.it/gio0seawxqr91.png
...,...,...,...,...,...,...,...,...,...,...
2623,cw9eer,Reddit,ukrmemes,Комуняки йобані,kate_yefim,0,1,08/27/2019,15:39:22,https://i.redd.it/64m4jrs0o1j31.jpg
2624,bcdgjb,Reddit,ukrmemes,за Порошенка блядь,valeg,0,6,04/12/2019,09:17:29,https://i.redd.it/4amtwrav2ur21.jpg
2626,bbt3dh,Reddit,ukrmemes,🤛,valeg,0,0,04/10/2019,19:36:41,https://i.redd.it/g8veepk9vir21.jpg
2627,bfg0bd,Reddit,ukrmemes,сірники — не іграшка,valeg,0,0,04/20/2019,15:24:20,https://i.redd.it/ecbudzz3ygt21.jpg


In [None]:
#https://www.osrsbox.com/blog/2019/03/18/watercooler-scraping-an-entire-subreddit-2007scape/
# Get all submissions from a subreddit with PushShift! You can go beyond the 1000 submissions limit.