In [1]:
!pip install pyspark

from pyspark.sql import SparkSession
# Create a SparkSession (without a specified name)
spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.repl.eagerEval.enabled', True) #for simple calls and better display

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=6785e88c31e7b21c8215fa09bb914ffc10699427c0b07652e6be6a758cbfeb46
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/21 05:29:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
processed_folder_path = '/kaggle/input/formula1-processed-hope'


drivers_df = spark.read.parquet(f"{processed_folder_path}/drivers")
constructors_df = spark.read.parquet(f"{processed_folder_path}/constructors") 
circuits_df = spark.read.parquet(f"{processed_folder_path}/circuits") 
races_df = spark.read.parquet(f"{processed_folder_path}/races") 
results_df = spark.read.parquet(f"{processed_folder_path}/results")

                                                                                

In [3]:
drivers_df.schema

StructType([StructField('driver_id', IntegerType(), True), StructField('driver_ref', StringType(), True), StructField('number', IntegerType(), True), StructField('code', StringType(), True), StructField('name', StringType(), True), StructField('dob', DateType(), True), StructField('nationality', StringType(), True), StructField('ingestion_date', TimestampType(), True)])

In [4]:
drivers_df.selectExpr("driver_id","code as `driv sd`")

                                                                                

driver_id,driv sd
1,HAM
2,HEI
3,ROS
4,ALO
5,KOV
6,NAK
7,BOU
8,RAI
9,KUB
10,GLO


In [5]:
drivers_df.columns

['driver_id',
 'driver_ref',
 'number',
 'code',
 'name',
 'dob',
 'nationality',
 'ingestion_date']

In [6]:
from pyspark.sql.functions import expr
drivers_df.select(expr("driver_id as id"))

id
1
2
3
4
5
6
7
8
9
10


In [7]:
from pyspark.sql.functions import col
drivers_df.withColumn("code_as_LONG",col("code").cast("long"))
## as we can see this cast won't work, but it is not throwing an error

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date,code_as_LONG
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,2024-06-11 01:24:...,
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,2024-06-11 01:24:...,
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,2024-06-11 01:24:...,
4,alonso,14.0,ALO,Fernando Alonso,1981-07-29,Spanish,2024-06-11 01:24:...,
5,kovalainen,,KOV,Heikki Kovalainen,1981-10-19,Finnish,2024-06-11 01:24:...,
6,nakajima,,NAK,Kazuki Nakajima,1985-01-11,Japanese,2024-06-11 01:24:...,
7,bourdais,,BOU,Sébastien Bourdais,1979-02-28,French,2024-06-11 01:24:...,
8,raikkonen,7.0,RAI,Kimi Räikkönen,1979-10-17,Finnish,2024-06-11 01:24:...,
9,kubica,88.0,KUB,Robert Kubica,1984-12-07,Polish,2024-06-11 01:24:...,
10,glock,,GLO,Timo Glock,1982-03-18,German,2024-06-11 01:24:...,


In [8]:
drivers_df.where("driver_id < 12")

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,2024-06-11 01:24:...
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,2024-06-11 01:24:...
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,2024-06-11 01:24:...
4,alonso,14.0,ALO,Fernando Alonso,1981-07-29,Spanish,2024-06-11 01:24:...
5,kovalainen,,KOV,Heikki Kovalainen,1981-10-19,Finnish,2024-06-11 01:24:...
6,nakajima,,NAK,Kazuki Nakajima,1985-01-11,Japanese,2024-06-11 01:24:...
7,bourdais,,BOU,Sébastien Bourdais,1979-02-28,French,2024-06-11 01:24:...
8,raikkonen,7.0,RAI,Kimi Räikkönen,1979-10-17,Finnish,2024-06-11 01:24:...
9,kubica,88.0,KUB,Robert Kubica,1984-12-07,Polish,2024-06-11 01:24:...
10,glock,,GLO,Timo Glock,1982-03-18,German,2024-06-11 01:24:...


In [9]:
## Both of them give the same result

#drivers_df.filter(col("driver_id")<8)
drivers_df.filter("driver_id<8")

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,2024-06-11 01:24:...
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,2024-06-11 01:24:...
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,2024-06-11 01:24:...
4,alonso,14.0,ALO,Fernando Alonso,1981-07-29,Spanish,2024-06-11 01:24:...
5,kovalainen,,KOV,Heikki Kovalainen,1981-10-19,Finnish,2024-06-11 01:24:...
6,nakajima,,NAK,Kazuki Nakajima,1985-01-11,Japanese,2024-06-11 01:24:...
7,bourdais,,BOU,Sébastien Bourdais,1979-02-28,French,2024-06-11 01:24:...


In [10]:
# you can chain multiple filter and where functions. They are combined with "AND"
drivers_df.filter("nationality = 'British'").where("driver_id < 20")

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,2024-06-11 01:24:...
14,coulthard,,COU,David Coulthard,1971-03-27,British,2024-06-11 01:24:...
18,button,22.0,BUT,Jenson Button,1980-01-19,British,2024-06-11 01:24:...
19,davidson,,DAV,Anthony Davidson,1979-04-18,British,2024-06-11 01:24:...


In [12]:
drivers_df.select("nationality").distinct().count()
#drivers_df.select("nationality").count()

41

Sampling and putting some data into two different dataframes using fraction between(0 and 1)

In [28]:
d_df1 = drivers_df.sample(0.1)
d_df2 = drivers_df.sample(0.5)

Unioning them into a single dataframe

In [30]:
d_df1.union(d_df2).count()

530

In [44]:
from pyspark.sql.functions import desc,asc
# drivers_df.orderBy(col("dob").desc())
drivers_df.orderBy(expr("dob desc"))

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date
741,etancelin,,\N,Philippe Étancelin,1896-12-28,French,2024-06-11 01:24:...
786,fagioli,,\N,Luigi Fagioli,1898-06-09,Italian,2024-06-11 01:24:...
791,biondetti,,\N,Clemente Biondetti,1898-08-18,Italian,2024-06-11 01:24:...
703,legat,,\N,Arthur Legat,1898-11-01,Belgian,2024-06-11 01:24:...
589,chiron,,\N,Louis Chiron,1899-08-03,Monegasque,2024-06-11 01:24:...
760,dusio,,\N,Piero Dusio,1899-10-13,Italian,2024-06-11 01:24:...
750,brudes,,\N,Adolf Brudes,1899-10-15,German,2024-06-11 01:24:...
749,aston,,\N,Bill Aston,1900-03-29,British,2024-06-11 01:24:...
717,hans_stuck,,\N,Hans von Stuck,1900-12-27,German,2024-06-11 01:24:...
755,schoeller,,\N,Rudolf Schoeller,1902-04-27,Swiss,2024-06-11 01:24:...


In [46]:
drivers_df.limit(5)

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,2024-06-11 01:24:...
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,2024-06-11 01:24:...
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,2024-06-11 01:24:...
4,alonso,14.0,ALO,Fernando Alonso,1981-07-29,Spanish,2024-06-11 01:24:...
5,kovalainen,,KOV,Heikki Kovalainen,1981-10-19,Finnish,2024-06-11 01:24:...
