In [0]:
%run "../includes/configuration"

In [0]:
# Load the processed races dataset from parquet.
# `processed_folder_path` is presumably defined by the configuration notebook
# pulled in via %run above — TODO confirm.
races_df = (
    spark.read
    .parquet(f"{processed_folder_path}/races")
)

In [0]:
# Two equivalent ways to express the same filter. The second assignment
# overwrites the first, so only the Column-based result is displayed.

# 1) SQL-style condition string
races_filtered_df = races_df.filter("race_year = 2019")

# 2) Column expression (races_df.race_year is interchangeable with races_df["race_year"])
races_filtered_df = races_df.filter(races_df.race_year == 2019)

display(races_filtered_df)

In [0]:
# Combine several conditions with `&`. Each comparison needs its own
# parentheses because `&` binds tighter than `==` / `<=` in Python.
races_filtered_df = races_df.filter(
    (races_df.race_year == 2019) & (races_df["round"] <= 5)
)
display(races_filtered_df)

### Join Demo

In [0]:
# Load the processed circuits dataset and rename its generic `name` column to
# `circuit_name` so it stays distinguishable once joined with races.
circuits_df = (
    spark.read
    .parquet(f"{processed_folder_path}/circuits")
    .withColumnRenamed("name", "circuit_name")
)
display(circuits_df)

In [0]:
# Keep only the 2019 season and rename `name` to `race_name`.
# NOTE: this rebinds `races_df`, so every later cell sees only 2019 races.
races_df = (
    races_df
    .filter("race_year = 2019")
    .withColumnRenamed("name", "race_name")
)
display(races_df)

In [0]:
# Inner join: keep only circuit/race pairs whose circuit_id matches on both
# sides, then project the columns of interest.
race_circuits_df = (
    circuits_df
    .join(races_df, on=circuits_df["circuit_id"] == races_df["circuit_id"], how="inner")
    .select(
        circuits_df["circuit_name"],
        circuits_df["location"],
        circuits_df["country"],
        races_df["race_name"],
        races_df["round"],
    )
)

In [0]:
# Render the current contents of race_circuits_df (the inner-join result when
# cells are run top to bottom).
display(race_circuits_df)

In [0]:
# Project a single column from the joined result and print it as a plain-text
# table with .show() instead of the rich display() rendering.
race_circuits_df.select(race_circuits_df["circuit_name"]).show()

In [0]:
# Setup for the outer-join demos: reload circuits but keep only circuit_id < 70,
# so that some 2019 races no longer have a matching circuit row.
circuits_df = (
    spark.read
    .parquet(f"{processed_folder_path}/circuits")
    .filter("circuit_id < 70")
    .withColumnRenamed("name", "circuit_name")
)
display(circuits_df)

In [0]:
# Left join: every row of circuits_df is kept (69 entries per the original notes).
# Circuits that hosted no 2019 race get null race_name / round.
# NOTE(review): the original note estimated 69 - 21 = 48 such circuits, which
# assumes all 21 races match a kept circuit — verify against the displayed output.
race_circuits_df = (
    circuits_df
    .join(races_df, on=circuits_df["circuit_id"] == races_df["circuit_id"], how="left")
    .select(
        circuits_df["circuit_id"],
        circuits_df["circuit_name"],
        circuits_df["location"],
        circuits_df["country"],
        races_df["race_name"],
        races_df["round"],
    )
)
display(race_circuits_df)

In [0]:
# Right join: every row of races_df is kept (21 entries per the original notes).
# The Austrian, Russian and Azerbaijan races show null circuit columns because
# their circuits were cut from circuits_df by the circuit_id filter.
race_circuits_df = (
    circuits_df
    .join(races_df, on=circuits_df["circuit_id"] == races_df["circuit_id"], how="right")
    .select(
        circuits_df["circuit_id"],
        circuits_df["circuit_name"],
        circuits_df["location"],
        circuits_df["country"],
        races_df["race_name"],
        races_df["round"],
    )
)
display(race_circuits_df)

In [0]:
# Semi join: returns only the columns of the left side (circuits_df), limited to
# circuits that have a matching row in races_df.
race_circuits_df = circuits_df.join(
    races_df,
    on=circuits_df["circuit_id"] == races_df["circuit_id"],
    how="semi",
)
display(race_circuits_df)

###### Anti

In [0]:
# Anti join (opposite of semi): returns only the circuits_df rows that have
# NO matching row in races_df.
race_circuits_df = circuits_df.join(
    races_df,
    on=circuits_df["circuit_id"] == races_df["circuit_id"],
    how="anti",
)
display(race_circuits_df)

In [0]:
# Anti join with the sides swapped: returns only the races_df rows that have
# NO matching row in circuits_df.
# Expected per the original notes: the Azerbaijan, Russian and Austrian races.
race_circuits_df = races_df.join(
    circuits_df,
    on=circuits_df["circuit_id"] == races_df["circuit_id"],
    how="anti",
)
display(race_circuits_df)

In [0]:
# Cross join: Cartesian product that pairs every race with every circuit.
# With 21 races and 69 circuits (per the original notes) that is 69 * 21 = 1449 rows.
race_circuits_df = races_df.crossJoin(
    circuits_df
)
display(race_circuits_df)