In [35]:
import praw
import pandas as pd
import datetime

import xlsxwriter

import requests
import urllib
from tqdm import tqdm
import time
import os
from pmaw import PushshiftAPI


In [1]:
# Writes "posts_dict" to an Excel file called "name"

def write_data(posts_dict, name):
    """Write the collected posts to an Excel workbook.

    Sheet row 0 holds the keys of ``posts_dict`` as column headers. Every
    following row holds one post: the values stored at the same list index in
    each column.

    Args:
        posts_dict: Dict mapping a column name to a list of values. It must
            contain a "Title" column; the length of that list decides how many
            data rows are written.
        name: Path of the workbook file to create.
    """
    # Creating the worksheet
    workbook = xlsxwriter.Workbook(name)
    worksheet = workbook.add_worksheet()

    # Writing the table headers
    for col, key in enumerate(posts_dict):
        worksheet.write(0, col, key)

    # Writing the table data.
    # List index 0 is the first post and belongs in sheet row 1 (row 0 is the
    # header), so the sheet row is always "index + 1". Starting the index at 1
    # instead would silently drop the first post.
    for index in range(len(posts_dict["Title"])):
        sheet_row = index + 1
        for col, key in enumerate(posts_dict):
            value = posts_dict[key][index]
            if key == "Image URL":
                # Stored as a clickable hyperlink rather than plain text
                worksheet.write_url(sheet_row, col, value)
            else:
                worksheet.write(sheet_row, col, value)

    # Saving the document
    workbook.close()

In [56]:
# Adds posts from "post_list" into "posts_dict"

def _post_field(post, name):
    """Read one field from a post given as a dict (by key) or as an object (by attribute)."""
    if isinstance(post, dict):
        return post.get(name)
    return getattr(post, name)


def read_posts(post_list, posts_dict):
    """Append every image post found in ``post_list`` to ``posts_dict``.

    A post counts as an image post when its URL contains ".jpg", ".png",
    ".tiff" or ".gif"; every other post is skipped.

    Args:
        post_list: Iterable of posts. Each post is either a dict
            (``post["url"]``) or an object exposing the same fields as
            attributes (``post.url``). The fields read are ``url``, ``title``,
            ``author``, ``created_utc``, ``selftext``, ``id``, ``score`` and
            ``num_comments``.
        posts_dict: Dict of column name -> list, mutated in place. It must
            already contain the columns "Image URL", "Title", "Author",
            "Timestamp", "Post Text", "ID", "Score" and "Total Comments".
    """
    image_markers = (".jpg", ".png", ".tiff", ".gif")

    # Wrapping the iterable lets the bar pick up a total whenever the input
    # has a length, and advances it once per post without manual update() calls.
    with tqdm(post_list, bar_format='{l_bar}{bar:50}{r_bar}{bar:-1b}') as progress_bar:
        for post in progress_bar:
            url = str(_post_field(post, "url") or "")
            if not any(marker in url for marker in image_markers):
                continue

            # The whole row is computed before anything is appended, so a
            # failure on one field cannot leave the columns with different
            # lengths.
            row = {
                "Image URL": url,
                # Title of each post
                "Title": _post_field(post, "title"),
                # Author of each post
                "Author": str(_post_field(post, "author")),
                # Time post was created. fromtimestamp() without a tz argument
                # converts to the local time zone of the machine running this.
                "Timestamp": datetime.datetime.fromtimestamp(
                    _post_field(post, "created_utc")
                ).strftime("%m/%d/%Y, %H:%M:%S"),
                # Text inside a post
                "Post Text": _post_field(post, "selftext"),
                # Unique ID of each post
                "ID": _post_field(post, "id"),
                # The score of a post
                "Score": _post_field(post, "score"),
                # Total number of comments inside the post (stored as text)
                "Total Comments": str(_post_field(post, "num_comments")),
            }
            for column, value in row.items():
                posts_dict[column].append(value)
        

In [38]:
# Reads memes from a single subreddit and writes data to "posts_dict"

def read_subreddit(subreddit, posts_dict, chunk_size, reps, use_keywords):
    """Page backwards in time through a subreddit and collect its posts.

    Up to ``reps`` searches are issued. The first one asks for submissions
    created before "now"; each later one asks for submissions created before
    the oldest submission of the previous page. Paging stops early as soon as a
    page comes back empty. Every page is handed to ``read_posts``.

    Args:
        subreddit: Name of the subreddit to search.
        posts_dict: Dict of column name -> list that ``read_posts`` fills.
        chunk_size: Value passed as ``limit`` to every search.
        reps: Maximum number of searches (pages).
        use_keywords: When truthy, restrict the search with the keyword query
            below; when falsy, search without a query.
    """
    keyword_query = "russia|lugansk|luhansk|donetsk|ukraine|pushilin|kyiv|kiev|donbass|kharkiv|odesa|dnipro|zaporizhzhia|lviv|crimea|kryvyi|mykolaiv|sebastopol|sevastopol|mariupol|luhansk|vinnytsia|makiivka|simferopol|chernihiv|kherson|poltava|lutsk|oblast|dnipropetrovsk|zhitomir|rivne|sumy|ternopil|volin|zaporizhzhya|azov|zelensky|shmyhal|kuleba|reznikov|rada|pasechnik|yanukovych|poroshenko|yushchenko|dnieper|dniester|chernobyl|dpr|lpr|nato|dnr|lnr|maidan"

    api = PushshiftAPI()

    # Arguments shared by every page; only "before" moves between requests.
    search_args = {
        "subreddit": subreddit,
        "limit": chunk_size,
        "sort": "desc",
        "sort_type": "created_utc",
    }
    # Deciding whether to apply keyword filters
    if use_keywords:
        search_args["q"] = keyword_query

    last_time = int(time.time())
    for _ in range(reps):
        posts = api.search_submissions(before=last_time, **search_args)
        print("using keywords" if use_keywords else "not using keywords")
        post_list = [post for post in posts]
        print(len(post_list))

        # Writing data to "posts_dict"
        read_posts(post_list, posts_dict)

        if not post_list:
            # Nothing older is left to fetch
            break

        # Continue from the oldest submission of this page. Taking the minimum
        # (rather than the last element) does not rely on the order in which
        # the results were returned.
        last_time = min(post["created_utc"] for post in post_list)



    

In [39]:
def display_posts_dict(posts_dict):
    """Render ``posts_dict`` as a DataFrame in the notebook output.

    Before displaying, the pandas display options are changed for the rest of
    the session: up to 1000 rows are shown and column text is never truncated.

    Args:
        posts_dict: Dict of column name -> list of values, one entry per post.
    """
    display_options = (
        ('display.max_rows', 1000),
        ('max_colwidth', None),
    )
    for option_name, option_value in display_options:
        pd.set_option(option_name, option_value)

    display(pd.DataFrame(posts_dict))

In [46]:
# Reads all the memes in a subreddit using PRAW (limited to 1000 memes)
def simple_read(subreddit_name, posts_dict, limit=1000):
    """Read the top posts of one subreddit through PRAW into ``posts_dict``.

    The Reddit API credentials are taken from the environment variables
    ``REDDIT_CLIENT_ID`` and ``REDDIT_CLIENT_SECRET`` so that no secret is
    stored in the notebook. Never commit these values; any secret that was
    ever committed should be rotated.

    Args:
        subreddit_name: Name of the subreddit to read.
        posts_dict: Dict of column name -> list that ``read_posts`` fills.
        limit: Maximum number of posts to request (limited to 1000).

    Raises:
        RuntimeError: If either credential environment variable is missing
            or empty.
    """
    client_id = os.environ.get("REDDIT_CLIENT_ID")
    client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Reddit credentials not found: set the REDDIT_CLIENT_ID and "
            "REDDIT_CLIENT_SECRET environment variables before calling simple_read()."
        )

    # Login for Reddit PRAW
    reddit_read_only = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent='Ukraine Scraper'
    )

    subreddit = reddit_read_only.subreddit(subreddit_name)
    posts = subreddit.top(limit=limit)

    read_posts(posts, posts_dict)
            
    


In [62]:


# Column name -> list of values; every list receives one entry per collected post.
posts_dict = {
    column: []
    for column in (
        "Title", "Author", "Timestamp", "Post Text",
        "ID", "Score", "Total Comments", "Image URL",
    )
}

# Subreddits that are read through simple_read()
simple_subreddits = [
    "ukrainememes",
    "ukraine22memes",
    "RussiaUkraineWarMemes",
    "UkrainianMemes",
    "uamemesforces",
    "UkraineMem",
    "ukrmemes",
]

for subreddit in simple_subreddits:
    simple_read(subreddit, posts_dict)

display_posts_dict(posts_dict)

# ukrainememes & UkrainianMemes seem to have more than 1000 entries -> PushShift
# (e.g. read_subreddit("ukrainememes", posts_dict, 100, 3, []))






|                                                  | 888/? [00:10<00:00, 81.50it/s]                                     
|                                                  | 358/? [00:06<00:00, 51.72it/s]                                     
|                                                  | 174/? [00:01<00:00, 96.50it/s]                                     
|                                                  | 993/? [00:12<00:00, 82.15it/s]                                     
|                                                  | 499/? [00:05<00:00, 97.58it/s]                                     
|                                                  | 35/? [00:00<00:00, 70.26it/s]                                      


Unnamed: 0,Title,Author,Timestamp,Post Text,ID,Score,Total Comments,Image URL
0,rubles,silitbang6000,"03/09/2022, 11:39:52",,tabt2e,172,1,https://i.redd.it/vxpbivt3zdm81.png
1,You're Grounded,purplewhiteblack,"03/08/2022, 18:11:33",,t9tkjm,123,2,https://i.redd.it/jwc6d7r3s8m81.png
2,"'Z' is for Zelensky - The Times (London), Peter Brookes (3/9/2022)",,"03/09/2022, 10:57:44",,taav8h,103,0,https://i.redd.it/rsrdgz9sqdm81.jpg
3,"Impact crater of Zelensky's balls, including visitors center, Cir: 2022",Tripledtities,"03/05/2022, 11:57:37",,t7dog3,101,0,https://i.redd.it/r11f2gjoill81.jpg
4,Are we NATO yet?,arcade_b1t,"10/04/2022, 04:05:56",,xv93ln,95,3,https://i.redd.it/gio0seawxqr91.png
...,...,...,...,...,...,...,...,...
2400,eeee,,"10/21/2022, 18:12:00",,ya6b7n,1,0,https://i.redd.it/ysb8cxxfg8v91.jpg
2401,UK,,"10/21/2022, 18:09:18",,ya690c,1,0,https://i.redd.it/573s2tmxf8v91.jpg
2402,Slava Ukraine,,"10/20/2022, 12:25:09",,y92r6x,2,0,https://i.redd.it/am7n5dkmlzu91.jpg
2403,Русские иди в Ад,Saunders8726,"09/21/2022, 08:05:40",,xk3248,1,0,https://i.redd.it/qn8jza9tc7p91.jpg
