In [4]:
import praw
import pandas as pd
import datetime

import xlsxwriter

import requests
import urllib
from tqdm import tqdm
import time
import os
from pmaw import PushshiftAPI


In [5]:
# Adds posts from "post_list" into "posts_dict"

def read_posts(post_list, posts_dict):
    t = tqdm(bar_format='{l_bar}{bar:50}{r_bar}{bar:-1b}')
    with t as progress_bar:
        for post in post_list:

            if (".jpg" in post.url or ".png" in post.url or ".tiff" in post.url or ".gif" in post.url):
                posts_dict["Image URL"].append(str(post.url))
            else:
                progress_bar.update()
                continue 

            # Title of each post
            posts_dict["Title"].append(post.title)

            # Author of each post
            posts_dict["Author"].append(str(post.author))

            # Time and Date the post was created
            posts_dict["Date"].append(datetime.datetime.fromtimestamp(post.created_utc).strftime("%d-%m-%Y"))
            posts_dict["Time"].append(datetime.datetime.fromtimestamp(post.created_utc).strftime("%H:%M:%S"))
            posts_dict["Unix Time"].append(post.created_utc)

            # Unique ID of each post
            posts_dict["ID"].append(post.id)

            # The score of a post
            posts_dict["Score"].append(post.score)

            # Total number of comments inside the post
            posts_dict["Comments"].append(str(post.num_comments))
            
            # Source data, same for all posts using this method
            posts_dict["Source"] = "Reddit"; 
            posts_dict["Page"].append(post.subreddit.display_name)
            
            progress_bar.update()
        

In [6]:
# Converts a dictionary into a Dataframe

def get_dataframe(posts_dict):
    pd.set_option('display.max_rows', 2000)
    pd.set_option('max_colwidth', None)

    top_posts = pd.DataFrame(posts_dict)

    return top_posts

In [7]:
# Reads all the memes in a subreddit using PRAW (limited to 1000 memes per subreddit)

def simple_read(subreddit_name, posts_dict):
    
    # Login for Reddit PRAW
    reddit_read_only = praw.Reddit(
        client_id='JzbofoB1-Xrh3gbv--LVTQ',
        client_secret='v3R81WmJnSJJ8WWdjn4sZvRcerIVGg',
        user_agent='Ukraine Scraper'
    )
        
    meme_number = 1000 # max 1000
    
    subreddit = reddit_read_only.subreddit(subreddit_name)
    
#     posts = subreddit.top(time_filter='year', limit=meme_number)
#     posts = subreddit.top(limit=meme_number)
    posts = subreddit.new(limit=meme_number)
    read_posts(posts, posts_dict)
            
    


In [8]:
# MAIN Program - [PRAW]

posts_dict = {
    "ID" : [], "Source" : [], "Page" : [], "Title": [], "Author" : [], 
     "Score": [], "Comments": [], "Date" : [], "Time" : [], "Image URL": [], "Unix Time": []
}

simple_subreddits = ["ukrainememes", "ukraine22memes", "RussiaUkraineWarMemes", 
                     "UkrainianMemes", "uamemesforces", "UkraineMem", "ukrmemes"]


# Reading data from every subreddit
for subreddit in simple_subreddits:
    simple_read(subreddit, posts_dict)


# The returned dataframe
dataframe = get_dataframe(posts_dict) 

# Removing duplicate memes
dataframe = dataframe.drop_duplicates(subset=["Title"], keep="first") 
dataframe = dataframe.drop_duplicates(subset=["Image URL"], keep="first") 

posts_dict = dataframe.to_dict()
display(dataframe)

# Writing the data to Excel
dataframe.to_excel("simple_subreddits3.xlsx")

# ukrainememes & UkrainianMemes seem to have more than 1000 entries -> PushShift
# SIMPLE DATA: source (reddit, subreddit) + more detailed data





|                                                  | 899/? [00:09<00:00, 94.86it/s]                                     
|                                                  | 357/? [00:06<00:00, 55.68it/s]                                     
|                                                  | 175/? [00:01<00:00, 99.04it/s]                                     
|                                                  | 976/? [00:13<00:00, 72.12it/s]                                     
|                                                  | 514/? [00:05<00:00, 100.25it/s]                                    
|                                                  | 35/? [00:00<00:00, 70.67it/s]                                      
|                                                  | 246/? [00:02<00:00, 95.69it/s]                                     


Unnamed: 0,ID,Source,Page,Title,Author,Score,Comments,Date,Time,Image URL,Unix Time
0,10fu1a7,Reddit,ukrainememes,The most effective anti-tank weapons for Ukraine and Russia:,Whole_Speed3426,32,0,19-01-2023,00:54:49,https://i.redd.it/zg5jqweodzca1.jpg,1.674108e+09
1,10f3knf,Reddit,ukrainememes,"In 2022 Fischer Random World Chess Championship, Iranian arbiter Shohreh Bayat wore a t-shirt that wrote “women, life, freedom” and was asked by FIDE President Arkady Dvorkovich (a Russian politician) to change it and not to mix politics. The next day she changed it to Ukrainian flag styled clothes.",Blakut,26,0,18-01-2023,04:55:27,https://i.redd.it/28y4j49fdnca1.jpg,1.674036e+09
2,10eoedj,Reddit,ukrainememes,What if the US actually delivered A-10's to Ukraine,Loki16082,27,2,17-01-2023,16:39:26,https://i.redd.it/7gshktndspca1.jpg,1.673992e+09
3,10duy45,Reddit,ukrainememes,That look...,m0ebel,31,1,16-01-2023,17:43:16,https://i.redd.it/zwdovksahhca1.jpg,1.673909e+09
4,10d9vk5,Reddit,ukrainememes,"Sorry, I'm dead inside, no fun captions today...",arcade_b1t,44,3,16-01-2023,04:05:18,https://i.redd.it/alykq0aafdca1.png,1.673860e+09
...,...,...,...,...,...,...,...,...,...,...,...
2575,aqo1mz,Reddit,ukrmemes,SADO-MAN,valeg,4,0,14-02-2019,15:12:56,https://i.redd.it/pfjao8m3dlg21.jpg,1.550175e+09
2576,aqo0xu,Reddit,ukrmemes,Так!!1,valeg,2,1,14-02-2019,15:11:13,https://i.redd.it/u30wj6ptclg21.jpg,1.550175e+09
2577,aqo03u,Reddit,ukrmemes,Валентинка,valeg,14,0,14-02-2019,15:09:06,https://i.redd.it/j22p02nfclg21.png,1.550175e+09
2578,aqnyyt,Reddit,ukrmemes,Здорове харчування,valeg,14,0,14-02-2019,15:06:01,https://i.redd.it/emyqvxzvblg21.png,1.550175e+09


In [10]:
def pmaw_post_to_dict(posts_dict, post):
    
        if (".jpg" in post["url"] or ".png" in post["url"] or ".tiff" in post["url"] or ".gif" in post["url"]):
            posts_dict["Image URL"].append(str(post["url"]))
        else:
            return

        posts_dict["Title"].append(post["title"])

        # Author of each post
        posts_dict["Author"].append(str(post["author"]))

        # Time and Date the post was created
        posts_dict["Date"].append(datetime.datetime.fromtimestamp(post["created_utc"]).strftime("%d/%m/%Y"))
        posts_dict["Time"].append(datetime.datetime.fromtimestamp(post["created_utc"]).strftime("%H:%M:%S"))
        posts_dict["Unix Time"].append(post["created_utc"])

        # Unique ID of each post
        posts_dict["ID"].append(post["id"])

        # The score of a post
        posts_dict["Score"].append(post["score"])

        # Total number of comments inside the post
        posts_dict["Comments"].append(str(post["num_comments"]))

        # Source data, same for all posts using this method
        posts_dict["Source"] = "Reddit"; 
#         posts_dict["Page"].append(post["subreddit"])
        posts_dict["Page"].append("undecided")
        # ERROR ABOVE PRAW GET SUBREDDIT FROM PAGE (CHECK MAIN.IPYNB)
    

In [11]:
def read_subreddit_pmaw(subreddit):
    api = PushshiftAPI()

    posts_dict = {
        "ID" : [], "Source" : [], "Page" : [], "Title": [], "Author" : [], 
         "Score": [], "Comments": [], "Date" : [], "Time" : [], "Image URL": [], "Unix Time" : []
    }

    posts = api.search_submissions(
        subreddit=subreddit, 
        sort="desc",
        sort_type="created_utc",
        limit=100
    )

    post_list = list(posts)

    # Requests
    for post in post_list:
        pmaw_post_to_dict(posts_dict, post) # fix image filtering


    dataframe = get_dataframe(posts_dict) 
    dataframe.sort_values(by=["Unix Time"])

    while (len(post_list) > 0):
        last_time = int(dataframe["Unix Time"].iloc[-1])

        posts = api.search_submissions(
            subreddit=subreddit, 
            sort="desc",
            sort_type="created_utc",
            before=last_time,
            limit=100
        )

        post_list = list(posts)

        # Requests
        for post in post_list:
            pmaw_post_to_dict(posts_dict, post) # fix image filtering


        dataframe = get_dataframe(posts_dict) 
        dataframe.sort_values(by=["Unix Time"])

    display(dataframe)




In [12]:
read_subreddit_pmaw("ukrainememes")


Unnamed: 0,ID,Source,Page,Title,Author,Score,Comments,Date,Time,Image URL,Unix Time
0,yp01pj,Reddit,undecided,Putin beneath the sheets,GeorgesJokes,1,1,07/11/2022,15:54:48,https://i.redd.it/n2n2nj7ydly91.jpg,1667854488
1,yoyhzy,Reddit,undecided,Maybe they'll get a Model with Airbags,Ok_ok6160,1,0,07/11/2022,15:01:28,https://i.redd.it/lhd7ngm2tky91.gif,1667851288
2,yogesb,Reddit,undecided,Saving this for when Ukraine wins ...,Paixdieu,1,0,07/11/2022,03:02:29,https://i.redd.it/9wggn9makhy91.png,1667808149
3,yo4od4,Reddit,undecided,Is this how i pull?,hollowmartin,1,0,06/11/2022,17:18:31,https://i.redd.it/qwld6317oey91.jpg,1667773111
4,yo3efk,Reddit,undecided,"Think twice, Satan",Ok_ok6160,1,0,06/11/2022,16:28:19,https://i.redd.it/yv2kl874dey91.png,1667770099
5,yn21al,Reddit,undecided,"orc, non-binary",Ok_ok6160,1,0,05/11/2022,14:26:02,https://i.redd.it/4lrcdysfd6y91.png,1667672762
6,ymxlcf,Reddit,undecided,Never I would have imagined to one day agree with him in anything,emdivi_pt,1,0,05/11/2022,11:43:56,https://i.redd.it/op69ngue27y91.jpg,1667663036
7,ym8poc,Reddit,undecided,... as long as they can carry,Ok_ok6160,1,0,04/11/2022,15:58:23,https://i.redd.it/1g7xuen4pzx91.png,1667591903
8,ym8kg8,Reddit,undecided,No Machine left Behind,Ok_ok6160,1,0,04/11/2022,15:52:22,https://i.redd.it/cvsb2m0aozx91.png,1667591542
9,yldlv0,Reddit,undecided,😁,pastebluepaste,1,0,03/11/2022,16:19:02,https://i.redd.it/9vjsame5osx91.jpg,1667506742


In [13]:
#https://www.osrsbox.com/blog/2019/03/18/watercooler-scraping-an-entire-subreddit-2007scape/
# Get all submissions from a subreddit with PushShift! You can go beyond the 1000 submissions limit.

In [14]:
# It would be good to find one large source of memes (2k+) instead of gluing many sources together...
# Twitter / Instagram / Know your meme...?

In [16]:
# create a simple request and compare it to the pushshift website...