##### 1. Installing Required Dependencies



In [1]:
!pip install praw

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 3, Finished, Available, Finished)

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


##### 2. Importing Required Libraries

In [2]:
import praw
import pandas as pd
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, BooleanType

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 4, Finished, Available, Finished)

##### 3. Initialize the Spark Session

In [3]:
def init_fabric_spark():
    """
    Build (or fetch) the SparkSession used by the Reddit data pipeline.

    The session is requested with the Delta Lake SQL extension and catalog, the
    OneLake default filesystem, and fixed driver/executor sizing.

    Returns:
        The SparkSession produced by ``SparkSession.builder.getOrCreate()``.
    """
    session_conf = {
        "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
        "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
        "spark.hadoop.fs.defaultFS": "abfss://APIDataLakehouse@onelake.dfs.fabric.microsoft.com",
        "spark.driver.memory": "8g",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2",
        # "spark.num.executors" is not a Spark property and was silently ignored;
        # the documented key for the executor count is "spark.executor.instances".
        "spark.executor.instances": "2",
    }

    builder = SparkSession.builder.appName("Reddit Data Pipeline")
    for key, value in session_conf.items():
        builder = builder.config(key, value)

    # NOTE(review): getOrCreate() hands back an already-running session when one exists,
    # in which case launch-time settings such as memory sizing may not take effect — confirm.
    return builder.getOrCreate()

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 5, Finished, Available, Finished)


##### 4. Data Ingestion - Bronze Layer

In [4]:
def define_schema():
    """
    Build the Spark schema used for the Reddit submissions DataFrame.

    Returns:
        StructType with one nullable StructField per submission column,
        in the order listed below.
    """
    # (column name, Spark type class) pairs, in output column order.
    # NOTE(review): FloatType is single precision; epoch-second values in created_utc
    # may lose resolution. DoubleType looks safer, but switching would also require
    # rewriting the schema of any Delta table already created from this — TODO confirm.
    column_types = (
        ("Submission_Fct_id", StringType),
        ("title", StringType),
        ("created_utc", FloatType),
        ("author_name", StringType),
        ("score", IntegerType),
        ("upvote_ratio", FloatType),
        ("num_comments", IntegerType),
        ("over_18", BooleanType),
        ("spoiler", BooleanType),
        ("link_flair_text", StringType),
        ("stickied", BooleanType),
        ("Total_Awards_Received", IntegerType),
        ("Gilded_Count", IntegerType),
        ("Number_of_Crossposts", IntegerType),
        ("url", StringType),
    )

    # Every column is declared nullable (third StructField argument is True).
    fields = [StructField(name, spark_type(), True) for name, spark_type in column_types]
    return StructType(fields)

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 6, Finished, Available, Finished)

In [5]:
# Output column names, in the same order as the values built by _submission_to_row.
_REDDIT_COLUMNS = [
    "Submission_Fct_id", "title", "created_utc", "author_name", "score", "upvote_ratio", "num_comments",
    "over_18", "spoiler", "link_flair_text", "stickied", "Total_Awards_Received",
    "Gilded_Count", "Number_of_Crossposts", "url",
]

# pandas dtype enforced on each output column.
_REDDIT_DTYPES = {
    "Submission_Fct_id": "string", "title": "string", "created_utc": "float", "author_name": "string",
    "score": "int", "upvote_ratio": "float", "num_comments": "int", "over_18": "bool",
    "spoiler": "bool", "link_flair_text": "string", "stickied": "bool",
    "Total_Awards_Received": "int", "Gilded_Count": "int", "Number_of_Crossposts": "int", "url": "string",
}


def _submission_to_row(submission):
    """Flatten one submission object into a list ordered like _REDDIT_COLUMNS."""
    return [
        submission.id, submission.title, submission.created_utc,
        # str() keeps the column textual; a None author becomes the literal string "None".
        str(submission.author), submission.score, submission.upvote_ratio,
        submission.num_comments, submission.over_18, submission.spoiler,
        submission.link_flair_text, submission.stickied,
        submission.total_awards_received, submission.gilded,
        submission.num_crossposts, submission.url,
    ]


def extract_reddit_data(client_id, client_secret, user_agent, subreddits, limit=100000):
    """
    Extracts data from Reddit API and returns a pandas DataFrame.

    The hot, new and controversial (all-time) listings of ``subreddits`` are read
    in that order and merged into one row per submission id.

    Args:
        client_id: Reddit API client id passed to ``praw.Reddit``.
        client_secret: Reddit API client secret passed to ``praw.Reddit``.
        user_agent: User-agent string passed to ``praw.Reddit``.
        subreddits: Subreddit name(s) passed to ``reddit.subreddit`` (the notebook
            passes several names joined with "+").
        limit: ``limit`` argument forwarded to each listing call.

    Returns:
        pandas.DataFrame with the columns in ``_REDDIT_COLUMNS`` cast to ``_REDDIT_DTYPES``.
    """
    reddit = praw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=user_agent,
        ratelimit_seconds=10
    )
    subreddit = reddit.subreddit(subreddits)

    # Each entry is (listing method, extra keyword arguments); order matters for de-duplication.
    listing_calls = (
        (subreddit.hot, {}),
        (subreddit.new, {}),
        (subreddit.controversial, {"time_filter": "all"}),
    )

    # De-duplicate on submission id while reading: a repeated id keeps its first
    # position but takes the values from its most recent occurrence.
    rows_by_id = {}
    for fetch_listing, extra_kwargs in listing_calls:
        for submission in fetch_listing(limit=limit, **extra_kwargs):
            row = _submission_to_row(submission)
            rows_by_id[row[0]] = row

    return pd.DataFrame(list(rows_by_id.values()), columns=_REDDIT_COLUMNS).astype(_REDDIT_DTYPES)

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 7, Finished, Available, Finished)

##### 5. Ingest Data into the Lakehouse

In [6]:
def upload_to_lakehouse(pdf, directory_path):
    """
    Save pandas DataFrame as CSV file to Fabric Lakehouse

    Relies on the ``spark`` session and ``mssparkutils`` helper being available as
    notebook globals.

    Args:
        pdf: pandas DataFrame to save
        directory_path: Destination path within the Files folder (e.g., "reddit_data/reddit_extract")

    Returns:
        Path of the first ``part-*`` file found in the output directory after the write.

    Raises:
        FileNotFoundError: if the output directory contains no ``part-*`` file after the write.
        Exception: any other failure is printed with troubleshooting tips and re-raised.
    """
    try:
        # Convert pandas DataFrame to Spark DataFrame
        spark_df = spark.createDataFrame(pdf)

        # ===== PATH CONSTRUCTION =====
        # Absolute OneLake URI of the Lakehouse "Files" area. The same root is used for
        # directory creation, the write and the verification listing, so all three
        # always address the same location.
        files_root = "abfss://e466565d-d48f-47c5-a36c-129e9706433f@onelake.dfs.fabric.microsoft.com/05029953-93c2-40a0-b222-5656dc677253/Files"
        full_path = f"{files_root}/{directory_path}"

        # ===== DIRECTORY CREATION =====
        parent_dir = "/".join(directory_path.split("/")[:-1])
        if parent_dir:
            mssparkutils.fs.mkdirs(f"{files_root}/{parent_dir}")

        # ===== SAVE OPERATION =====
        # quote/escape are both '"' so embedded quotes are written doubled, matching the
        # quote/escape options this notebook uses when it reads the CSV back.
        spark_df.write \
            .format("csv") \
            .option("header", "true") \
            .option("quote", '"') \
            .option("escape", '"') \
            .mode("overwrite") \
            .save(full_path)

        # ===== VERIFY OUTPUT =====
        csv_files = mssparkutils.fs.ls(full_path)
        csv_file = next((f for f in csv_files if f.name.startswith("part-")), None)
        if csv_file is None:
            # A bare next() would raise StopIteration with an empty message here.
            raise FileNotFoundError(f"No part-* CSV file found in {full_path} after write")

        print(f"✅ CSV successfully saved to: {full_path}")
        print(f"📄 Actual file: {csv_file.path}")
        return csv_file.path

    except Exception as e:
        print(f"❌ Error saving to Lakehouse: {str(e)}")
        print("Troubleshooting Tips:")
        print("1. Verify Lakehouse is attached to notebook (top-right dropdown)")
        print("2. Check if directory_path exists: mssparkutils.fs.ls('Files/')")
        print("3. Test manual save: df.write.csv('Files/test_output')")
        raise

def Load_to_delta_table_metastore(spark_df, database_name, table_name):
    """
    Write a Spark DataFrame to ``<database_name>.<table_name>`` as a Delta table.

    The database is created when missing. The table is written in "overwrite"
    mode whether or not it already exists; only the progress message differs.

    Args:
        spark_df: Spark DataFrame to persist.
        database_name: Metastore database that holds the table.
        table_name: Name of the table inside that database.

    Returns:
        The fully qualified table name, "<database_name>.<table_name>".
    """
    # Make sure the target database is present before touching the table
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")

    full_table_name = f"{database_name}.{table_name}"

    # Existence only decides which message is shown
    already_present = spark.catalog.tableExists(full_table_name)
    if already_present:
        print(f"Loading to existing table: {full_table_name}")
    else:
        print(f"Creating new table: {full_table_name}")

    # NOTE(review): an existing table is replaced, not appended to — confirm this is intended.
    delta_writer = spark_df.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(full_table_name)

    print(f"Data successfully written to {full_table_name}")
    return full_table_name

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 8, Finished, Available, Finished)

##### 6. Defining a Helper Function to Write the Audit Trail

In [7]:
def log_audit_entry(action, status, details):
    """
    Logs an audit entry into a Delta table.

    One row (action, status, details, UTC timestamp string) is written to
    ``audit_layer.etl_logs``. The row is appended when the table exists; otherwise
    the table is created by the first write. Relies on the ``spark`` session being
    available as a notebook global.

    Args:
        action: Name of the pipeline step being logged.
        status: Outcome label for the step.
        details: Free-text description stored with the entry.

    Returns:
        True when the entry was written.

    Raises:
        Exception: any failure while building or writing the entry is printed and re-raised.
    """
    # Imported locally because the notebook's import cell only imports `datetime` itself.
    from datetime import timezone

    try:
        # datetime.utcnow() is deprecated as of Python 3.12; an aware UTC "now"
        # formats to the same "YYYY-MM-DD HH:MM:SS" text.
        logged_at = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

        # Create a DataFrame for the audit log entry
        audit_data = [(action, status, details, logged_at)]
        audit_schema = StructType([
            StructField("action", StringType(), True),
            StructField("status", StringType(), True),
            StructField("details", StringType(), True),
            StructField("timestamp", StringType(), True)
        ])

        audit_entry = spark.createDataFrame(audit_data, schema=audit_schema)

        # Database and table details
        database_name = "audit_layer"
        table_name = "etl_logs"
        full_table_name = f"{database_name}.{table_name}"

        # Ensure the database exists
        spark.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")

        # Save the audit log: append to an existing table, otherwise create it
        write_mode = "append" if spark.catalog.tableExists(full_table_name) else "overwrite"
        audit_entry.write.format("delta").mode(write_mode).saveAsTable(full_table_name)

        print(f"Audit log entry added successfully to {full_table_name}.")
        return True

    except Exception as e:
        print(f"Failed to log audit entry: {e}")
        raise

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 9, Finished, Available, Finished)

##### 7. Execute Complete Bronze Layer 

###### **Extraction:**

In [8]:
# Extraction step: pull submissions from the Reddit API into the pandas DataFrame `pdf`
# and record the outcome in the audit log.
try:

    # Extraction parameters
    # NOTE(review): credentials are hard-coded here; load them from a secret store or
    # environment variables before sharing or committing this notebook.
    client_id = "ABC"
    client_secret = "DEF"
    user_agent = "FahadHassan"
    subreddits = "funny+AskReddit+gaming+worldnews+todayilearned+aww+Music+memes+science+pics+Jokes+news+videos+space+askscience+DIY+books+food+mildlyinteresting+GetMotivated+explainlikeimfive+LifeProTips"

    # Extract data from Reddit API
    print("Extracting data from Reddit API...")
    pdf = extract_reddit_data(client_id, client_secret, user_agent, subreddits)

    # len() gives the row count. pandas DataFrame.count() returns per-column non-null
    # counts (a Series), which is not what the audit message should contain.
    record_count = len(pdf)
    print(f"Successfully extracted {record_count} records from Reddit")

    # Log extraction action
    log_audit_entry("Extract", "Success", f"Extracted {record_count} records from Reddit API")

except Exception as e:

    # Record the failure in the audit log, then let the error stop the notebook
    error_msg = f"Error in Bronze layer processing: {str(e)}"
    print(error_msg)
    log_audit_entry("BronzeError", "Failed", error_msg)
    raise

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 10, Finished, Available, Finished)

Extracting data from Reddit API...
Successfully extracted 38482 records from Reddit
Audit log entry added successfully to audit_layer.etl_logs.


###### **Ingesting into the Lakehouse as .CSV:**

In [11]:
# Upload step: persist the extracted pandas DataFrame `pdf` as CSV in the Lakehouse
# Files area and record the outcome in the audit log.
try:

    print("Uploading data to Lakehouse...")

    # Folder under Files/ that receives the CSV, and the absolute URI of that same folder.
    # `lakehouse_path` is kept as a notebook-level name because the read step loads from it.
    directory_path = "reddit_data/reddit_extract"
    lakehouse_path = f"abfss://e466565d-d48f-47c5-a36c-129e9706433f@onelake.dfs.fabric.microsoft.com/05029953-93c2-40a0-b222-5656dc677253/Files/{directory_path}"

    # Returns the path of the part file that was written
    csv_file_path = upload_to_lakehouse(pdf, directory_path)

    # Audit the successful upload
    log_audit_entry(
        "Upload",
        "Success",
        f"Uploaded data to {lakehouse_path}",
    )

except Exception as exc:

    # Audit the failure, then re-raise so the notebook run stops here
    error_msg = f"Error in Bronze layer processing: {exc!s}"
    print(error_msg)
    log_audit_entry("BronzeError", "Failed", error_msg)
    raise

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 13, Finished, Available, Finished)

Uploading data to Lakehouse...
✅ CSV successfully saved to: abfss://e466565d-d48f-47c5-a36c-129e9706433f@onelake.dfs.fabric.microsoft.com/05029953-93c2-40a0-b222-5656dc677253/Files/reddit_data/reddit_extract
📄 Actual file: abfss://e466565d-d48f-47c5-a36c-129e9706433f@onelake.dfs.fabric.microsoft.com/05029953-93c2-40a0-b222-5656dc677253/Files/reddit_data/reddit_extract/part-00000-b454b279-0b86-4225-961b-19df95eed92f-c000.csv
Audit log entry added successfully to audit_layer.etl_logs.


###### **Ingesting Unique Data into Delta Lake:**

In [12]:
# Load step: read the uploaded CSV back with the explicit schema, keep one row per
# submission id, and write the result to the Bronze Delta table.
try:

    print("Reading data from Lakehouse...")
    schema = define_schema()

    # Reader configured for a header row, quoted fields with '"' as both quote and
    # escape character, and values that may span several lines.
    csv_reader = (
        spark.read.format("csv")
        .option("header", "true")
        .schema(schema)
        .option("quote", '\"')
        .option("escape", '\"')
        .option("multiline", "true")
    )
    spark_df = csv_reader.load(lakehouse_path)

    # 'Submission_Fct_id' is the primary key: keep a single row per id
    spark_df = spark_df.dropDuplicates(["Submission_Fct_id"])

    # Persist to the Bronze layer Delta table
    print("Saving to Bronze layer...")
    bronze_table = Load_to_delta_table_metastore(
        spark_df, "bronze_layer", "reddit_extracted_data"
    )

    # Audit the load and report the outcome
    log_audit_entry("Bronze Load", "Success", f"Loaded data to {bronze_table}")
    print(f"Successfully completed Bronze layer processing. Data saved to {bronze_table}")
    print(f"Successfully loaded {spark_df.count()} records into Bronze layer")

except Exception as exc:
    # Audit the failure, then re-raise so the notebook run stops here
    error_msg = f"Error in Bronze layer processing: {exc!s}"
    print(error_msg)
    log_audit_entry("BronzeError", "Failed", error_msg)
    raise

StatementMeta(, 375d0ff8-2ad2-4dab-8511-d5b5b83d1e99, 14, Finished, Available, Finished)

Reading data from Lakehouse...
Saving to Bronze layer...
Loading to existing table: bronze_layer.reddit_extracted_data
Data successfully written to bronze_layer.reddit_extracted_data
Audit log entry added successfully to audit_layer.etl_logs.
Successfully completed Bronze layer processing. Data saved to bronze_layer.reddit_extracted_data
Successfully loaded 38450 records into Bronze layer
