In [1]:
!pip install pyspark

from pyspark.sql import SparkSession
# Create a SparkSession (without a specified name)
spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True) #for simple calls and better display

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=c0aa904e3e80738adfde4281b6d8494c539914c917526d0a55b6252dc0db3e0a
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/28 12:38:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
processed_folder_path = '/kaggle/input/formula1-processed-hope'

# Only the drivers table is used in the cells below.
drivers_path = f"{processed_folder_path}/drivers"
drivers_df = spark.read.parquet(drivers_path)

# The other tables in the same folder are left disabled; un-comment a line to load it.
# constructors_df = spark.read.parquet(f"{processed_folder_path}/constructors")
# circuits_df = spark.read.parquet(f"{processed_folder_path}/circuits")
# races_df = spark.read.parquet(f"{processed_folder_path}/races")
# results_df = spark.read.parquet(f"{processed_folder_path}/results")

                                                                                

In [3]:
# Inspect the column names, data types and nullability of the drivers DataFrame.
drivers_df.schema

StructType([StructField('driver_id', IntegerType(), True), StructField('driver_ref', StringType(), True), StructField('number', IntegerType(), True), StructField('code', StringType(), True), StructField('name', StringType(), True), StructField('dob', DateType(), True), StructField('nationality', StringType(), True), StructField('ingestion_date', TimestampType(), True)])

In [4]:
# Short alias for the drivers DataFrame; the following cells refer to it as `df`.
df = drivers_df

In [5]:
from pyspark.sql.functions import count

# count(<column>) only counts rows where that column is not NULL;
# df.selectExpr("count(*)") would count every row instead.
df.select(count("number"))

                                                                                

count(number)
47


In [6]:
from pyspark.sql.functions import countDistinct

# How many different values appear in the `number` column.
df.agg(countDistinct("number"))

count(DISTINCT number)
41


In [7]:
from pyspark.sql.functions import first, last

# first/last return the values from the first and last rows as they occur in the
# DataFrame; no sorting is applied, so these are not the alphabetical extremes.
df.agg(first("name"), last("name"))

first(name),last(name)
Lewis Hamilton,Mick Schumacher


In [8]:
from pyspark.sql import functions as F

# Earliest and latest date of birth in the table.
# The aggregates are reached through the `F` alias on purpose: importing `max` and `min`
# by name would replace Python's built-in max()/min() for every later cell.
df.select(F.min("dob"), F.max("dob"))

min(dob),max(dob)
1896-12-28,2000-05-11


In [9]:
from pyspark.sql import functions as F

# Sum of the driver_id column (shown only to demonstrate the aggregate).
# `F.sum` is used instead of importing `sum` by name, which would replace Python's
# built-in sum() for every later cell.
df.select(F.sum("driver_id"))

sum(driver_id)
364276


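`collect_set` returns only the distinct values (duplicates are removed) and gives no guarantee about their order. `collect_list` keeps every value, duplicates included; its elements follow the order in which the rows are processed, which is not guaranteed to be stable once the data has been shuffled.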

In [10]:
from pyspark.sql.functions import collect_list, collect_set

# Distinct nationalities among the three selected ones, gathered into a single array.
(
    df.where("nationality IN ('Indian','Belgian','Spanish')")
    .agg(collect_set("nationality"))
    .show(truncate=False)
)

+--------------------------+
|collect_set(nationality)  |
+--------------------------+
|[Indian, Spanish, Belgian]|
+--------------------------+



In [11]:
from pyspark.sql.functions import collect_list, collect_set

# Same filter as for collect_set, but every matching row contributes one element,
# so repeated nationalities appear repeatedly in the array.
(
    df.where("nationality IN ('Indian','Belgian','Spanish')")
    .agg(collect_list("nationality"))
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|collect_list(nationality)                                                                                                                                                                                                                                                                                                                                             |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
from pyspark.sql.functions import desc, year

# Number of drivers per (nationality, birth year), largest groups first.
(
    df.groupBy("nationality", year("dob"))
    .count()
    .sort(desc("count"))
)

nationality,year(dob),count
American,1926,13
American,1927,8
American,1918,8
American,1920,8
American,1925,7
American,1928,7
British,1931,7
American,1919,7
British,1942,6
British,1929,6


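Grouping data with `agg` gives finer control over the aggregation: each aggregate can be written as a function call or a SQL expression and can be given its own alias.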

In [14]:
# `count` and `desc` are imported here as well so the cell runs on its own instead of
# depending on imports made by earlier cells.
from pyspark.sql.functions import count, desc, expr

# Drivers per nationality, most common first. The aggregate can be given either as a
# Column function or as a SQL expression string; both give the same output:
#   df.groupBy("nationality").agg(count("driver_id").alias("count_driver")).orderBy(desc("count_driver"))
(
    df.groupBy("nationality")
    .agg(expr("count(driver_id)").alias("count_driver"))
    .orderBy(desc("count_driver"))
)

nationality,count_driver
British,165
American,157
Italian,99
French,73
German,50
Brazilian,32
Argentine,24
Swiss,23
South African,23
Belgian,23


In [15]:
# Show the schema of `df` again for reference.
df.schema

StructType([StructField('driver_id', IntegerType(), True), StructField('driver_ref', StringType(), True), StructField('number', IntegerType(), True), StructField('code', StringType(), True), StructField('name', StringType(), True), StructField('dob', DateType(), True), StructField('nationality', StringType(), True), StructField('ingestion_date', TimestampType(), True)])

Window functions

In [16]:
from pyspark.sql.window import Window
# `max` is intentionally not imported: it is not used here and importing it by name
# would replace Python's built-in max().
from pyspark.sql.functions import desc,asc,col,lag,lead,months_between

# One window per nationality, rows ordered by date of birth (oldest driver first).
windowSpec = Window.partitionBy(col("nationality")).orderBy("dob")

# Date of birth of the preceding row in the same window; NULL for the first row
# of each nationality. Built once and reused for both output columns.
prev_dob = lag("dob", 1).over(windowSpec)

df.select(
    "name",
    "nationality",
    "dob",
    prev_dob.alias("prev"),
    months_between("dob", prev_dob).alias("months_btwn"),
)

name,nationality,dob,prev,months_btwn
Chet Miller,American,1902-07-19,,
Mauri Rose,American,1906-05-26,1902-07-19,46.22580645
George Connor,American,1906-08-16,1906-05-26,2.67741935
Bill Holland,American,1907-12-18,1906-08-16,16.06451613
Bill Cantrell,American,1908-01-31,1907-12-18,1.41935484
Robert O'Brien,American,1908-04-11,1908-01-31,2.35483871
Bill Schindler,American,1909-03-06,1908-04-11,10.83870968
Jimmy Jackson,American,1910-07-25,1909-03-06,16.61290323
Lee Wallard,American,1910-09-07,1910-07-25,1.41935484
Travis Webb,American,1910-10-08,1910-09-07,1.03225806


### Rollup
`Rollup` in PySpark creates subtotals and grand totals along a hierarchical grouping of columns, providing a summary at different levels of granularity. Ex: `rollup(a, b)` aggregates over the grouping sets (a, b), (a) and (), where () is the grand total.

### Cube
`Cube` in PySpark generates multi-dimensional aggregates across all possible combinations of the specified grouping columns, offering comprehensive summaries including all possible subtotals and the grand total. Ex: `cube(a, b)` aggregates over the grouping sets (a, b), (a), (b) and ().

In [25]:
# `count` and `year` are imported here as well so the cell runs on its own instead of
# depending on imports made by earlier cells.
from pyspark.sql.functions import count, desc, lit, year

# cube() aggregates over every combination of the grouping columns:
# (nationality, year), (nationality), (year) and the grand total ().
# A NULL in a grouping column marks a subtotal row for that column.
(
    df.where("nationality = 'American' and year(dob) < 1910")
    .cube("nationality", year("dob"))
    .agg(count(lit(1)).alias("cnt"))
    .orderBy("nationality", "year(dob)")
    .show(500)
)

+-----------+---------+---+
|nationality|year(dob)|cnt|
+-----------+---------+---+
|       NULL|     NULL|  7|
|       NULL|     1902|  1|
|       NULL|     1906|  2|
|       NULL|     1907|  1|
|       NULL|     1908|  2|
|       NULL|     1909|  1|
|   American|     NULL|  7|
|   American|     1902|  1|
|   American|     1906|  2|
|   American|     1907|  1|
|   American|     1908|  2|
|   American|     1909|  1|
+-----------+---------+---+



In [26]:
# `count` and `year` are imported here as well so the cell runs on its own instead of
# depending on imports made by earlier cells.
from pyspark.sql.functions import count, desc, lit, year

# rollup() aggregates along the column hierarchy only:
# (nationality, year), (nationality) and the grand total ().
# A NULL in a grouping column marks a subtotal row for that column.
(
    df.where("nationality = 'American' and year(dob) < 1910")
    .rollup("nationality", year("dob"))
    .agg(count(lit(1)).alias("cnt"))
    .orderBy("nationality", "year(dob)")
    .show(500)
)

+-----------+---------+---+
|nationality|year(dob)|cnt|
+-----------+---------+---+
|       NULL|     NULL|  7|
|   American|     NULL|  7|
|   American|     1902|  1|
|   American|     1906|  2|
|   American|     1907|  1|
|   American|     1908|  2|
|   American|     1909|  1|
+-----------+---------+---+



In [27]:
# `count` and `year` are imported here as well so the cell runs on its own instead of
# depending on imports made by earlier cells.
from pyspark.sql.functions import count, desc, grouping_id, lit, year

# grouping_id() tells the aggregation level of each row apart from genuine NULLs.
# For this rollup: 0 = grouped by nationality and year, 1 = year aggregated away
# (per-nationality subtotal), 3 = both aggregated away (grand total).
(
    df.where("nationality = 'American' and year(dob) < 1910")
    .rollup("nationality", year("dob"))
    .agg(grouping_id(), count(lit(1)).alias("cnt"))
    .orderBy("nationality", "year(dob)")
    .show(500)
)

+-----------+---------+-------------+---+
|nationality|year(dob)|grouping_id()|cnt|
+-----------+---------+-------------+---+
|       NULL|     NULL|            3|  7|
|   American|     NULL|            1|  7|
|   American|     1902|            0|  1|
|   American|     1906|            0|  2|
|   American|     1907|            0|  1|
|   American|     1908|            0|  2|
|   American|     1909|            0|  1|
+-----------+---------+-------------+---+



In [28]:
# `count` and `year` are imported here as well so the cell runs on its own instead of
# depending on imports made by earlier cells.
from pyspark.sql.functions import count, desc, grouping_id, lit, year

# grouping_id() tells the aggregation level of each row apart from genuine NULLs.
# For this cube: 0 = grouped by nationality and year, 1 = year aggregated away,
# 2 = nationality aggregated away, 3 = both aggregated away (grand total).
(
    df.where("nationality = 'American' and year(dob) < 1910")
    .cube("nationality", year("dob"))
    .agg(grouping_id(), count(lit(1)).alias("cnt"))
    .orderBy("nationality", "year(dob)")
    .show(500)
)

+-----------+---------+-------------+---+
|nationality|year(dob)|grouping_id()|cnt|
+-----------+---------+-------------+---+
|       NULL|     NULL|            3|  7|
|       NULL|     1902|            2|  1|
|       NULL|     1906|            2|  2|
|       NULL|     1907|            2|  1|
|       NULL|     1908|            2|  2|
|       NULL|     1909|            2|  1|
|   American|     NULL|            1|  7|
|   American|     1902|            0|  1|
|   American|     1906|            0|  2|
|   American|     1907|            0|  1|
|   American|     1908|            0|  2|
|   American|     1909|            0|  1|
+-----------+---------+-------------+---+

