In [140]:
import praw
import pandas as pd
import datetime

import xlsxwriter

import requests
import urllib
from tqdm import tqdm
import time
import os
from pmaw import PushshiftAPI


In [141]:
# Adds image posts (URL containing .jpg/.png/.tiff/.gif) from "post_list" into "posts_dict"; all other posts are skipped

def _post_field(post, name, default=None):
    """Return field ``name`` of ``post`` for both dict records and objects.

    ``read_subreddit`` indexes its records like dicts (``post["created_utc"]``),
    while ``simple_read`` passes objects whose fields are attributes, so both
    access styles have to be supported here.
    """
    if isinstance(post, dict):
        return post.get(name, default)
    return getattr(post, name, default)


def read_posts(post_list, posts_dict):
    """Append every image post from ``post_list`` to the columns of ``posts_dict``.

    A post counts as an image post when its URL contains ".jpg", ".png",
    ".tiff" or ".gif"; every other post is skipped.

    Parameters
    ----------
    post_list : iterable
        Posts, either dict records or objects exposing the fields ``url``,
        ``title``, ``author``, ``created_utc``, ``selftext``, ``id``,
        ``score`` and ``num_comments``.
    posts_dict : dict
        Mapping with the list-valued keys "Title", "Author", "Timestamp",
        "Post Text", "ID", "Score", "Total Comments" and "Image URL".
        It is mutated in place; nothing is returned.

    Notes
    -----
    * "Timestamp" is formatted in UTC, so the output does not depend on the
      timezone of the machine running the notebook.
    * A row is fully built before anything is appended, so an exception on a
      malformed post cannot leave the columns with different lengths.
    """
    image_markers = (".jpg", ".png", ".tiff", ".gif")

    with tqdm(bar_format='{l_bar}{bar:50}{r_bar}{bar:-1b}') as progress_bar:
        for post in post_list:
            url = str(_post_field(post, "url", ""))

            if any(marker in url for marker in image_markers):
                created = datetime.datetime.fromtimestamp(
                    _post_field(post, "created_utc"), tz=datetime.timezone.utc
                )
                row = {
                    "Image URL": url,
                    "Title": _post_field(post, "title"),
                    "Author": str(_post_field(post, "author")),
                    "Timestamp": created.strftime("%m/%d/%Y, %H:%M:%S"),
                    # dict records may omit the body text entirely
                    "Post Text": _post_field(post, "selftext", ""),
                    "ID": _post_field(post, "id"),
                    "Score": _post_field(post, "score"),
                    "Total Comments": str(_post_field(post, "num_comments")),
                }
                for column, value in row.items():
                    posts_dict[column].append(value)

            # One tick per inspected post, whether it was kept or skipped
            progress_bar.update()
        

In [142]:
# Reads memes from a single subreddit and writes data to "posts_dict"

# Pushshift full-text query ("|" separates the alternatives) applied when
# read_subreddit is called with use_keywords=True.
_KEYWORD_QUERY = "russia|lugansk|luhansk|donetsk|ukraine|pushilin|kyiv|kiev|donbass|kharkiv|odesa|dnipro|zaporizhzhia|lviv|crimea|kryvyi|mykolaiv|sebastopol|sevastopol|mariupol|luhansk|vinnytsia|makiivka|simferopol|chernihiv|kherson|poltava|lutsk|oblast|dnipropetrovsk|zhitomir|rivne|sumy|ternopil|volin|zaporizhzhya|azov|zelensky|shmyhal|kuleba|reznikov|rada|pasechnik|yanukovych|poroshenko|yushchenko|dnieper|dniester|chernobyl|dpr|lpr|nato|dnr|lnr|maidan"


def read_subreddit(subreddit, posts_dict, chunk_size, reps, use_keywords):
    """Page backwards through a subreddit via Pushshift and collect its image posts.

    Up to ``reps`` requests of at most ``chunk_size`` submissions are made.
    The first request asks for submissions created before "now"; each later
    request asks for submissions created before the oldest one seen in the
    previous chunk. Paging stops early once a request returns nothing.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search.
    posts_dict : dict
        Column dictionary handed to ``read_posts``, which mutates it in place.
    chunk_size : int
        ``limit`` passed to each ``search_submissions`` call.
    reps : int
        Maximum number of requests.
    use_keywords : bool
        When true, restrict results with the ``_KEYWORD_QUERY`` full-text filter.
    """
    api = PushshiftAPI()
    last_time = int(time.time())

    for _ in range(reps):
        search_kwargs = {
            "subreddit": subreddit,
            "limit": chunk_size,
            "before": last_time,
            "sort": "desc",
            "sort_type": "created_utc",
        }
        # Deciding whether to apply keyword filters
        if use_keywords:
            search_kwargs["q"] = _KEYWORD_QUERY

        posts = api.search_submissions(**search_kwargs)
        print("using keywords" if use_keywords else "not using keywords")

        post_list = list(posts)
        print(len(post_list))

        # Writing data to "posts_dict"
        read_posts(post_list, posts_dict)

        if not post_list:
            break

        # Use the oldest timestamp of the chunk as the next cursor. Equal to the
        # last element when results arrive newest-first, and still correct if
        # the client hands them back in a different order.
        last_time = min(post["created_utc"] for post in post_list)


    

In [143]:
# Converts a dictionary into a Dataframe

def get_dataframe(posts_dict):
    """Build a pandas DataFrame from the column dictionary ``posts_dict``.

    As a side effect this sets two global pandas display options: at most
    20 rows are shown, and column contents are never truncated.
    """
    display_options = {
        "display.max_rows": 20,
        "display.max_colwidth": None,
    }
    for option_name, option_value in display_options.items():
        pd.set_option(option_name, option_value)

    return pd.DataFrame(posts_dict)

In [144]:
# Reads all the memes in a subreddit using PRAW (limited to 1000 memes per subreddit)

def simple_read(subreddit_name, posts_dict, limit=1000):
    """Read the top posts of one subreddit through PRAW into ``posts_dict``.

    Credentials are taken from the environment variables
    ``REDDIT_CLIENT_ID`` and ``REDDIT_CLIENT_SECRET`` instead of being stored
    in the notebook. NOTE(review): the key pair that used to be hardcoded here
    has been exposed in the notebook and should be revoked and regenerated.

    Parameters
    ----------
    subreddit_name : str
        Name of the subreddit to read.
    posts_dict : dict
        Column dictionary handed to ``read_posts``, which mutates it in place.
    limit : int, optional
        Maximum number of posts requested from the "top" listing. Defaults to
        1000, the maximum this notebook assumes for a single listing.

    Raises
    ------
    RuntimeError
        If either credential environment variable is not set.
    """
    try:
        client_id = os.environ["REDDIT_CLIENT_ID"]
        client_secret = os.environ["REDDIT_CLIENT_SECRET"]
    except KeyError as missing:
        raise RuntimeError(
            "Reddit API credentials not found: set the REDDIT_CLIENT_ID and "
            "REDDIT_CLIENT_SECRET environment variables before running."
        ) from missing

    # Login for Reddit PRAW
    reddit_read_only = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent='Ukraine Scraper'
    )

    subreddit = reddit_read_only.subreddit(subreddit_name)
    posts = subreddit.top(limit=limit)
    read_posts(posts, posts_dict)
            
    


In [145]:
# MAIN Program

# One empty list per output column; the reader functions append to these in place
posts_dict = {
    column: []
    for column in (
        "Title", "Author", "Timestamp", "Post Text",
        "ID", "Score", "Total Comments", "Image URL",
    )
}

simple_subreddits = [
    "ukrainememes",
    "ukraine22memes",
    "RussiaUkraineWarMemes",
    "UkrainianMemes",
    "uamemesforces",
    "UkraineMem",
    "ukrmemes",
]

# Collect posts from each subreddit into the shared column dictionary
for subreddit in simple_subreddits:
    simple_read(subreddit, posts_dict)

# Build the frame, then drop repeated memes: first by title, then by image URL
# (the first occurrence is kept each time)
dataframe = (
    get_dataframe(posts_dict)
    .drop_duplicates(subset=["Title"], keep="first")
    .drop_duplicates(subset=["Image URL"], keep="first")
)

# Rebind posts_dict to the de-duplicated data
posts_dict = dataframe.to_dict()
display(dataframe)

# Save the result as an Excel workbook
dataframe.to_excel("simple_subreddits.xlsx")

# TODO: ukrainememes & UkrainianMemes seem to have more than 1000 posts (the PRAW listing limit) -> fetch them via Pushshift instead
# SIMPLE DATA: source (reddit, subreddit) + more detailed data





|                                                  | 892/? [01:29<00:00,  9.93it/s]                                     
|                                                  | 358/? [00:36<00:00,  9.93it/s]                                     
|                                                  | 174/? [00:12<00:00, 14.27it/s]                                     
|                                                  | 993/? [00:41<00:00, 23.66it/s]                                     
|                                                  | 499/? [00:05<00:00, 97.45it/s]                                     
|                                                  | 35/? [00:00<00:00, 65.34it/s]                                      
|                                                  | 246/? [00:02<00:00, 93.66it/s]                                     


Unnamed: 0,Title,Author,Timestamp,Post Text,ID,Score,Total Comments,Image URL
0,rubles,silitbang6000,"03/09/2022, 11:39:52",,tabt2e,171,1,https://i.redd.it/vxpbivt3zdm81.png
1,You're Grounded,purplewhiteblack,"03/08/2022, 18:11:33",,t9tkjm,124,2,https://i.redd.it/jwc6d7r3s8m81.png
2,"'Z' is for Zelensky - The Times (London), Peter Brookes (3/9/2022)",,"03/09/2022, 10:57:44",,taav8h,103,0,https://i.redd.it/rsrdgz9sqdm81.jpg
3,"Impact crater of Zelensky's balls, including visitors center, Cir: 2022",Tripledtities,"03/05/2022, 11:57:37",,t7dog3,104,0,https://i.redd.it/r11f2gjoill81.jpg
4,Are we NATO yet?,arcade_b1t,"10/04/2022, 04:05:56",,xv93ln,93,3,https://i.redd.it/gio0seawxqr91.png
...,...,...,...,...,...,...,...,...
2624,Комуняки йобані,kate_yefim,"08/27/2019, 15:39:22",,cw9eer,0,1,https://i.redd.it/64m4jrs0o1j31.jpg
2625,за Порошенка блядь,valeg,"04/12/2019, 09:17:29",,bcdgjb,0,6,https://i.redd.it/4amtwrav2ur21.jpg
2627,🤛,valeg,"04/10/2019, 19:36:41",,bbt3dh,0,0,https://i.redd.it/g8veepk9vir21.jpg
2628,сірники — не іграшка,valeg,"04/20/2019, 15:24:20",,bfg0bd,0,0,https://i.redd.it/ecbudzz3ygt21.jpg


In [None]:
#https://www.osrsbox.com/blog/2019/03/18/watercooler-scraping-an-entire-subreddit-2007scape/
# Get all submissions from a subreddit with PushShift! You can go beyond the 1000 submissions limit.