In [0]:
# %pip install praw
# %pip install python-dotenv

In [0]:
from dotenv import load_dotenv
import os
import praw
from praw.models import MoreComments
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType


# Populate os.environ from a local .env file (if any), then read the
# three Reddit API settings. A missing variable yields None.
load_dotenv()
client_id = os.environ.get("REDDIT_CLIENT_ID")
client_secret = os.environ.get("REDDIT_SECRET")
user_agent = os.environ.get("REDDIT_USER_AGENT")


In [0]:
# Reddit client built from app credentials only (no username/password).
reddit_read_only = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent,
    check_for_async=False,
)


In [0]:
# Search expression: generic food-delivery terms plus the names of specific
# delivery services, in English and Korean.
query = '''
(food AND delivery) OR (delivery AND order) OR "delivery app" OR 배달앱 OR 배달어플 OR 배달
OR
"Coupang Eats" OR 쿠팡이츠 OR "쿠팡 이츠" OR "CoupangEats"
OR
Baemin OR 배민 OR "배달의 민족" OR "Baedal Minjok"
OR
Yogiyo OR "요기요"
OR
(Shuttle AND food) OR (Shuttle AND order) OR (Shuttle AND delivery) OR "Shuttle Delivery"
'''

# Each subreddit is searched on its own instead of through the combined
# name "korea+koreatravel+living_in_korea": per the original author's
# observation, the combined search only returned recent posts (back to
# 2023) because of Reddit's search restrictions.
subreddit_to_search = ["korea", "koreatravel", "living_in_korea"]

search_results = []

for subreddit_name in subreddit_to_search:
    submissions = reddit_read_only.subreddit(subreddit_name).search(
        query=query,
        sort="new",
        limit=1000,
    )
    print(f"-- {subreddit_name} --")
    # Keep the URL of self (text) posts only; link posts point to
    # non-reddit URLs and are skipped.
    search_results.extend(
        submission.url for submission in submissions if submission.is_self
    )


In [0]:
contents = []
# Build one (url, text, created_utc) row for every collected post and for
# each comment yielded by iterating post.comments (top-level comments, per
# the PRAW documentation).
for url in search_results:
    post = reddit_read_only.submission(url=url)
    # created_utc is a numeric Unix timestamp, but the DataFrame schema stores
    # it in a StringType column. Convert it explicitly so the stored text does
    # not depend on how the Spark runtime coerces a number into a string.
    contents.append((url, post.selftext, str(post.created_utc)))
    for comment in post.comments:
        # MoreComments placeholders have no body attribute, so skip them.
        if isinstance(comment, MoreComments):
            continue
        contents.append((url, comment.body, str(comment.created_utc)))



In [0]:
# Reuse the active Spark session if one exists, otherwise start one.
spark = (
    SparkSession.builder
    .appName("bronze_reddit_ingest")
    .getOrCreate()
)

# All three columns are non-nullable strings.
contents_schema = StructType(
    [
        StructField(column_name, StringType(), False)
        for column_name in ("url", "content", "created_datetime")
    ]
)

# One row per collected (url, content, created_datetime) tuple.
df = spark.createDataFrame(contents, contents_schema)


In [0]:
# Replace the contents of the bronze table with this run's rows.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.growth_poc.bronze_reddit")
)
