In [1]:
import os

# Locations of the bundled Spark distribution and the Windows-only winutils
# directory, resolved relative to the current working directory.
spark_home = os.path.abspath(os.path.join("spark", "spark-3.5.5-bin-hadoop3"))
hadoop_home = os.path.abspath(os.path.join("spark", "winutils"))

if os.name == 'nt':
    # On Windows, point HADOOP_HOME at the winutils directory and expose its
    # bin/ folder on PATH.
    os.environ["HADOOP_HOME"] = hadoop_home
    hadoop_bin = os.path.join(hadoop_home, "bin")
    current_path = os.environ.get("PATH", "")
    # Guard so that re-running this cell does not keep prepending the entry.
    if hadoop_bin not in current_path.split(os.pathsep):
        os.environ["PATH"] = f"{hadoop_bin}{os.pathsep}{current_path}"

import findspark

# findspark.init() must run BEFORE pyspark is imported: it sets SPARK_HOME and
# puts the distribution's Python packages on sys.path, so the import below
# resolves against the Spark installation selected above.
findspark.init(spark_home)

import pyspark
from pyspark.streaming import StreamingContext

# getOrCreate() reuses an already-running context, so this cell can be
# re-executed in a live kernel without raising
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession.builder.getOrCreate()


In [2]:
# Load the raw JSON records found under data/raw into a DataFrame.
raw_df = spark.read.format("json").load("data/raw")


In [3]:
# Inspect the inferred schema of the raw data before parsing it further.
raw_df.printSchema()


root
 |-- value: string (nullable = true)



In [4]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

# Schema of the JSON document embedded in the `value` column.
# Every field is read as a (nullable) string.
_PAPER_FIELD_NAMES = (
    "aid",
    "title",
    "summary",
    "main_category",
    "categories",
    "published",
)
paper_schema = StructType(
    [StructField(field_name, StringType()) for field_name in _PAPER_FIELD_NAMES]
)


In [5]:
# Decode the JSON string held in `value` into a struct using `paper_schema`,
# then expand that struct so each of its fields becomes a top-level column.
parsed_paper = from_json(col("value"), paper_schema).alias("paper_data")
papers_df = raw_df.select(parsed_paper).select("paper_data.*")

# Preview the first five parsed rows without truncating long text fields.
papers_df.show(5, truncate=False)


+---------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
# Report how many rows were parsed in total.
total_rows = papers_df.count()
print(f"Total number of rows: {total_rows}")


Total number of rows: 29094


In [7]:
# Keep a single row per `aid` value. Only `aid` is compared; rows that share
# an `aid` but differ in other columns are still collapsed into one row.
# NOTE(review): rows whose `aid` is null (e.g. if JSON parsing failed) would
# also collapse into a single row — confirm whether they should be filtered.
papers_unique_df = papers_df.dropDuplicates(["aid"])
print(f"After removing duplicates based on 'aid': {papers_unique_df.count()} papers remain")


After removing duplicates based on all columns: 8902 papers remain


In [8]:
# Persist the de-duplicated papers as JSON under data/interim, replacing any
# output left there by a previous run.
papers_unique_df.write.format("json").mode("overwrite").save("data/interim")
