### Step 1 - Start Spark Session

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session if there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName("ConstructorsIngestion")
    .getOrCreate()
)

### Step 2 - Read the JSON file using the Spark DataFrame Reader

In [10]:
# Alternative (not used): the schema written as a DDL string instead of a StructType.
# constructors_schema = "constructorId INT, constructorRef STRING, name STRING, nationality STRING, url STRING"

In [11]:
# Explicit schema for constructors.json, passed to the reader so Spark does not
# have to infer column types from the data.
#
# `nationality` must be listed here: when a schema is supplied, Spark reads only
# the fields it names, so omitting the field would silently discard that
# attribute from every downstream DataFrame and from the written output.
constructors_schema = StructType(fields=[
    StructField("constructorId", IntegerType(), False),
    StructField("constructorRef", StringType(), True),
    StructField("name", StringType(), True),
    StructField("nationality", StringType(), True),
    StructField("url", StringType(), True)
])

In [12]:
# Load the constructors JSON file, applying the predefined schema rather than
# letting Spark infer one.
constructor_df = (
    spark.read
    .schema(constructors_schema)
    .json("../data/constructors.json")
)

### Step 3 - Drop unwanted columns from Dataframe

In [13]:
# Remove the `url` column, which is not carried into the processed output.
# Dropping by name (instead of via constructor_df["url"]) keeps this cell
# re-runnable: a Column lookup on the already-reassigned DataFrame raises once
# `url` is gone, whereas drop() with a name is a no-op for a missing column.
constructor_df = constructor_df.drop("url")

### Step 4 - Rename columns and add ingestion date

In [14]:
# Convert the camelCase source column names to snake_case and add an
# `ingestion_date` column populated by current_timestamp().
#
# The source names must be spelled exactly: withColumnRenamed() silently does
# nothing when the existing column is not found, so a misspelling (previously
# "contructorId") leaves the original column name in place without any error.
constructor_df = (
    constructor_df
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("constructorRef", "constructor_ref")
    .withColumn("ingestion_date", current_timestamp())
)

### Step 5 - Write output to a parquet file

In [15]:
# Persist the DataFrame as Parquet; "overwrite" mode replaces whatever already
# exists at the target path.
(
    constructor_df.write
    .mode("overwrite")
    .parquet("../processed_data/constructors")
)

                                                                                