In [1]:
import praw
import pandas as pd
from datetime import datetime
import hashlib
import os
from praw.models import MoreComments
import time

In [2]:
reddit = praw.Reddit(
    client_id=os.environ["CLIENT_ID"],
    client_secret=os.environ["CLIENT_SECRET"],
    password=os.environ["PASSWORD"],
    user_agent="Live Thread Scraper by UkraineNewsBot",
    username="UkraineNewsBot",
)

In [6]:
threads = pd.read_csv("threads.csv")
threads.head()

Unnamed: 0,id,name,author,title,created_utc,created_at,num_comments,score,upvote_ratio,permalink
0,ssn7ic,t3_ssn7ic,09cb3c204828ede2196452cf1fe87c59,r/WorldNews Live Thread: Ukraine-Russia Tensions,1644878000.0,2022-02-14 22:35:01,6827,2211,0.92,/r/worldnews/comments/ssn7ic/rworldnews_live_t...
1,st8lq0,t3_st8lq0,6fb4eee62d702c106897cbed2f56feea,r/worldnews Live Thread: Ukraine-Russia Tensions,1644947000.0,2022-02-15 17:43:15,8958,3599,0.92,/r/worldnews/comments/st8lq0/rworldnews_live_t...
2,stgev6,t3_stgev6,be542cdc9cfecd8bf1d2320a8639c765,r/worldnews Live Thread: Ukraine-Russia Tensions,1644967000.0,2022-02-15 23:17:31,10880,4510,0.91,/r/worldnews/comments/stgev6/rworldnews_live_t...
3,stmu2a,t3_stmu2a,6fb4eee62d702c106897cbed2f56feea,r/worldnews Live Thread: Ukraine-Russia Tensions,1644986000.0,2022-02-16 04:25:27,6554,1644,0.87,/r/worldnews/comments/stmu2a/rworldnews_live_t...
4,su6qp2,t3_su6qp2,1af0f19d29e41c3c9ef40e57f14c5dd9,r/Worldnews Live Thread: Ukraine-Russia Tensions,1645046000.0,2022-02-16 21:19:58,5446,1044,0.88,/r/worldnews/comments/su6qp2/rworldnews_live_t...


In [7]:
comment_props = [
    "id", "body", "edited", "created_utc", 
    "link_id", "parent_id", "distinguished", 
    "depth", "ups", "downs", "score",
    "total_awards_received", "gilded",
]

def hash_string(content):
    return hashlib.md5(content.encode()).hexdigest()
    
def extract_comment(comment, submission_id):
    if comment.author:
        cmmt = [hash_string(comment.author.name), submission_id]
    else:
        cmmt = [None, submission_id]
        
    cmmt.extend([getattr(comment, prop) for prop in comment_props])

    
    if comment.gildings:
        gildings = str(comment.gildings)
    else:
        gildings = None

    cmmt.append(gildings)

    return cmmt

In [8]:
# raw_cmt = []
for idx, row in threads.iterrows():
    comments = []
    submission_id = row["id"]
    file_name = f'data/comments/comments__{submission_id}.csv'
    if os.path.exists(file_name):
        # print(f"Skipping {submission_id}")
        continue
    
    t0 = time.time()
    
    dt_object = datetime.fromtimestamp(t0)
    date_date = dt_object.strftime("%m/%d/%Y, %H:%M:%S")
    
    print(f"Processing {submission_id} – {date_date}")

    submission = reddit.submission(id=submission_id)
    submission.comments.replace_more(limit=LIMIT) 
    comment_queue = submission.comments[:]
    while comment_queue:
        comment = comment_queue.pop(0)
        comments.append(extract_comment(comment, submission_id))
        comment_queue.extend(comment.replies)
        
    frame = pd.DataFrame(comments, columns=["author", "submission_id"] + comment_props + ["gildings"])
    frame.to_csv(file_name, index=False)
    
    t1 = time.time()
    print(f"Done processing {len(comments)} comments from {submission_id}: {t1-t0}")

Processing t1lgcc – 04/24/2022, 23:12:08
Done processing 11935 comments from t1lgcc: 1817.3498861789703
Processing t1n65b – 04/24/2022, 23:42:25
Done processing 10542 comments from t1n65b: 1810.985468864441
Processing t1oqrc – 04/25/2022, 00:12:36
Done processing 11391 comments from t1oqrc: 1878.4541108608246
Processing t28zy8 – 04/25/2022, 00:43:54
Done processing 14737 comments from t28zy8: 2330.19105386734
Processing t2sxht – 04/25/2022, 01:22:45
Done processing 10807 comments from t2sxht: 1635.9001648426056
Processing t2z5rd – 04/25/2022, 01:50:01
Done processing 10608 comments from t2z5rd: 1663.67813205719
Processing t3105o – 04/25/2022, 02:17:44
Done processing 9799 comments from t3105o: 1525.0486159324646
