In [0]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType, BooleanType, DoubleType

# Explicit schema applied when reading the posts parquet data. Every row carries the
# Reddit post columns followed by the columns of the linked article (prefixed `article_`).
# All fields use the default nullability (nullable=True).
post_schema = (
    StructType()
    # --- Reddit post columns ---
    .add("id", StringType())
    .add("reddit_id", StringType())
    .add("subreddit", StringType())
    .add("author", StringType())
    .add("title", StringType())
    .add("body_text", StringType())
    .add("score", LongType())
    .add("number_of_comments", LongType())
    .add("is_text_post", BooleanType())
    .add("subreddit_category", StringType())
    .add("upvote_ratio", DoubleType())
    .add("reddit_post_url", StringType())
    .add("published_at", TimestampType())
    # --- Linked article columns ---
    .add("article_id", StringType())
    .add("article_url", StringType())
    .add("article_headline", StringType())
    .add("article_author", StringType())
    .add("article_publisher", StringType())
    .add("article_content", StringType())
    .add("article_published_at", TimestampType())
    .add("article_category", StringType())
    .add("article_headline_cleaned", StringType())
    .add("article_content_cleaned", StringType())
)

In [0]:
# Location of the posts parquet data for this run, supplied through the "s3_path" widget.
# Example value:
#   s3://amzn-s3-stock-prediction/posts_data/ingestion_date=2025-11-17/posts_2025-11-17.parquet
s3_path = dbutils.widgets.get("s3_path").strip()
if not s3_path:
    # Fail early with a readable message instead of an opaque path error from the reader.
    raise ValueError('The "s3_path" widget is empty; set it to the parquet location to load.')

# Read with the explicit schema so column names and types do not depend on inference.
# Only the path is passed: Spark's parquet reader has no `index` option (that is an
# argument of the pandas parquet writer).
data_frame = spark.read.schema(post_schema).parquet(s3_path)


In [0]:
# Register the loaded posts as a session-scoped temp view so the %sql cells can
# reference them as `source_posts_temp`.
data_frame.createOrReplaceTempView("source_posts_temp")

# NOTE(review): these two names are not referenced by the cells visible here (the MERGE
# statements spell out their target tables) -- confirm whether anything still needs them.
path_table = "stock_prediction.default"
table_name = "posts"

In [0]:
%sql
-- Upsert the article columns of the loaded posts into the articles table, keyed by URL.
--
-- The source is first reduced to one row per non-NULL article_url:
--   * Several posts can carry the same article_url. When more than one source row matches
--     a target row, the WHEN MATCHED update makes the MERGE fail; unmatched duplicates
--     would each be inserted as a separate article.
--   * A NULL article_url can never satisfy the ON condition, so such rows would be
--     inserted as new articles on every run.
-- When an article_url occurs more than once, the row with the latest article_published_at
-- is kept (article_id breaks ties so the choice is deterministic).
--
-- NOTE(review): the matched branch overwrites the stored article id with the incoming one;
-- confirm that nothing else still references the previous id.
-- NOTE(review): sentiment_strategy is filled from article_category -- confirm this mapping.
MERGE INTO stock_prediction.default.articles AS target
USING (
  SELECT
    article_id,
    article_author,
    article_headline,
    article_content,
    article_headline_cleaned,
    article_content_cleaned,
    article_category,
    article_published_at,
    article_publisher,
    article_url
  FROM (
    SELECT
      *,
      ROW_NUMBER() OVER (
        PARTITION BY article_url
        ORDER BY article_published_at DESC NULLS LAST, article_id
      ) AS article_rank
    FROM source_posts_temp
    WHERE article_url IS NOT NULL
  ) AS ranked_articles
  WHERE article_rank = 1
) AS source
ON target.url = source.article_url

WHEN MATCHED THEN
  UPDATE SET
    target.id = source.article_id

WHEN NOT MATCHED THEN
  INSERT (
    `id`,
    `author`,
    `title`,
    `content`,
    `title_cleaned`,
    `content_cleaned`,
    `sentiment_strategy`,
    `published_at`,
    `source_name`,
    `url`
  )
  VALUES (
    source.article_id,
    source.article_author,
    source.article_headline,
    source.article_content,
    source.article_headline_cleaned,
    source.article_content_cleaned,
    source.article_category,
    source.article_published_at,
    source.article_publisher,
    source.article_url
  )

In [0]:
%sql
-- Insert Reddit posts that are not yet in reddit_posts, keyed by reddit_post_url.
-- Posts whose URL already exists in the target are left untouched (there is no
-- WHEN MATCHED branch).
--
-- The source is first reduced to one row per non-NULL reddit_post_url:
--   * An insert-only MERGE inserts every source row without a match, so the same URL
--     appearing twice in one load would create duplicate posts.
--   * A NULL reddit_post_url can never satisfy the ON condition, so such rows would be
--     inserted again on every run.
-- When a URL occurs more than once, the row with the latest published_at is kept
-- (id breaks ties so the choice is deterministic).
MERGE INTO stock_prediction.default.reddit_posts AS target
USING (
    SELECT
        id,
        article_id,
        reddit_id,
        subreddit,
        author,
        title,
        body_text,
        score,
        number_of_comments,
        is_text_post,
        subreddit_category,
        upvote_ratio,
        published_at,
        reddit_post_url
    FROM (
        SELECT
            *,
            ROW_NUMBER() OVER (
                PARTITION BY reddit_post_url
                ORDER BY published_at DESC NULLS LAST, id
            ) AS post_rank
        FROM source_posts_temp
        WHERE reddit_post_url IS NOT NULL
    ) AS ranked_posts
    WHERE post_rank = 1
) AS source
ON target.reddit_post_url = source.reddit_post_url

WHEN NOT MATCHED THEN
    INSERT (
        `id`,
        `article_id`,
        `reddit_id`,
        `subreddit`,
        `author`,
        `title`,
        `body_text`,
        `score`,
        `number_of_comments`,
        `is_text_post`,
        `subreddit_category`,
        `upvote_ratio`,
        `published_at`,
        `reddit_post_url`
    )
    VALUES (
        source.id,
        source.article_id,
        source.reddit_id,
        source.subreddit,
        source.author,
        source.title,
        source.body_text,
        source.score,
        source.number_of_comments,
        source.is_text_post,
        source.subreddit_category,
        source.upvote_ratio,
        source.published_at,
        source.reddit_post_url
    )