In [1]:
!pip install pyspark

from pyspark.sql import SparkSession
# Reuse the active SparkSession if there is one, otherwise create one
# (no application name is specified on the builder).
spark = SparkSession.builder.getOrCreate()
# With eager evaluation on, a DataFrame left as the last expression of a cell
# is rendered as a table without an explicit .show() call.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True) #for simple calls and better display

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488493 sha256=9840637063a31fda3d2bbc337a10e6eea1db5d037cb4f492189b3d695cf7c063
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/23 02:36:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Root folder of the processed Formula 1 parquet datasets.
processed_folder_path = '/kaggle/input/formula1-processed-hope'


# Only the drivers dataset is loaded here.
drivers_df = spark.read.parquet(f"{processed_folder_path}/drivers")
# The sibling datasets are left disabled; uncomment a line to load one.
# constructors_df = spark.read.parquet(f"{processed_folder_path}/constructors")
# circuits_df = spark.read.parquet(f"{processed_folder_path}/circuits")
# races_df = spark.read.parquet(f"{processed_folder_path}/races")
# results_df = spark.read.parquet(f"{processed_folder_path}/results")

                                                                                

In [3]:
# Inspect the column names, data types and nullability of the drivers dataset.
drivers_df.schema

StructType([StructField('driver_id', IntegerType(), True), StructField('driver_ref', StringType(), True), StructField('number', IntegerType(), True), StructField('code', StringType(), True), StructField('name', StringType(), True), StructField('dob', DateType(), True), StructField('nationality', StringType(), True), StructField('ingestion_date', TimestampType(), True)])

In [4]:
# Short alias for the drivers DataFrame, used by the example cells that follow.
df = drivers_df

In [5]:
from pyspark.sql.functions import lit
# lit() wraps a Python value in a constant Column: every returned row carries 5.
df.select(lit(5)).limit(2)

5
5
5


In [6]:
from pyspark.sql.functions import col

# The Column-expression filter and the SQL-string filter below are equivalent.

df.where(col("driver_id")<7)
# df.where("driver_id<7")

                                                                                

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date
1,hamilton,44.0,HAM,Lewis Hamilton,1985-01-07,British,2024-06-11 01:24:...
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,2024-06-11 01:24:...
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,2024-06-11 01:24:...
4,alonso,14.0,ALO,Fernando Alonso,1981-07-29,Spanish,2024-06-11 01:24:...
5,kovalainen,,KOV,Heikki Kovalainen,1981-10-19,Finnish,2024-06-11 01:24:...
6,nakajima,,NAK,Kazuki Nakajima,1985-01-11,Japanese,2024-06-11 01:24:...


In [7]:
# where() also accepts a SQL predicate string; limit(5) caps the rows displayed.
df.where("nationality = 'German'").limit(5)

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,2024-06-11 01:24:...
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,2024-06-11 01:24:...
10,glock,,GLO,Timo Glock,1982-03-18,German,2024-06-11 01:24:...
16,sutil,99.0,SUT,Adrian Sutil,1983-01-11,German,2024-06-11 01:24:...
20,vettel,5.0,VET,Sebastian Vettel,1987-07-03,German,2024-06-11 01:24:...


In [8]:
# col() builds a Column object from a name alone, without referencing a DataFrame.
col("driver_id")

Column<'driver_id'>

In [9]:
# isin() keeps the rows whose driver_id equals any of the listed values.
df.where(df.driver_id.isin(2,3))

driver_id,driver_ref,number,code,name,dob,nationality,ingestion_date
2,heidfeld,,HEI,Nick Heidfeld,1977-05-10,German,2024-06-11 01:24:...
3,rosberg,6.0,ROS,Nico Rosberg,1985-06-27,German,2024-06-11 01:24:...


In [10]:
from pyspark.sql.functions import expr

# Add a boolean flag for British drivers, keep only British and German rows,
# then show a few identifying columns.
(
    df
    .withColumn("fromBritain", expr("nationality = 'British'"))
    .where(df.nationality.isin("British", "German"))
    .select("driver_id", "name", "nationality", "fromBritain")
    .show(3)
)

+---------+--------------+-----------+-----------+
|driver_id|          name|nationality|fromBritain|
+---------+--------------+-----------+-----------+
|        1|Lewis Hamilton|    British|       true|
|        2| Nick Heidfeld|     German|      false|
|        3|  Nico Rosberg|     German|      false|
+---------+--------------+-----------+-----------+
only showing top 3 rows



In [11]:
# Import the functions module under an alias instead of `from ... import pow`:
# the bare import would replace Python's built-in pow() for every later cell.
from pyspark.sql import functions as F

# Column expression for driver_id raised to the power of 2 (result is a double).
squaredDriverID = F.pow(F.col("driver_id"), 2)
# Equivalent projection using expr() for the first column:
# df.select(F.expr("driver_id"), squaredDriverID.alias("DriverID_Squared"))
df.select("driver_id", squaredDriverID.alias("DriverID_Squared")).show(3)

+---------+----------------+
|driver_id|DriverID_Squared|
+---------+----------------+
|        1|             1.0|
|        2|             4.0|
|        3|             9.0|
+---------+----------------+
only showing top 3 rows



In [12]:
# Use a module alias: `from pyspark.sql.functions import round` would shadow
# Python's built-in round() for the rest of the notebook.
from pyspark.sql import functions as F

# round() rounds half up (2.5 -> 3.0) while bround() rounds half to even (2.5 -> 2.0).
# The literal is passed as a number rather than the string "2.5", so the
# example no longer depends on an implicit string-to-number cast.
df.select(F.round(F.lit(2.5)), F.bround(F.lit(2.5))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [13]:
from pyspark.sql.functions import current_date, current_timestamp

# Ten-row frame carrying the current date and the current timestamp.
dateDF = (
    spark.range(10)
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)
dateDF.show(3)

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2024-07-23|2024-07-23 02:37:...|
|  1|2024-07-23|2024-07-23 02:37:...|
|  2|2024-07-23|2024-07-23 02:37:...|
+---+----------+--------------------+
only showing top 3 rows



In [14]:
from pyspark.sql.functions import date_add, date_sub
# Shift `today` five days back and five days forward.
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2024-07-18|        2024-07-28|
+------------------+------------------+
only showing top 1 row



In [15]:
from pyspark.sql.functions import datediff, months_between, to_date

# Day difference between a date one week back and today.
(
    dateDF
    .withColumn("week_ago", date_sub(col("today"), 7))
    .select(datediff("week_ago", "today"))
    .show(1)
)
# datediff(col("week_ago"), col("today")) works just as well as passing column names.

# Month difference between two literal dates converted with to_date.
(
    dateDF
    .select(
        to_date(lit("2016-01-01")).alias("start"),
        to_date(lit("2017-01-01")).alias("end"),
    )
    .select(months_between(col("start"), col("end")))
    .show(1)
)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                           -12.0|
+--------------------------------+
only showing top 1 row



In [16]:
# No format is given, so the strings are read as year-month-day. "2016-20-12"
# would need a 20th month, and the result for it is NULL rather than an error.
dateDF.select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show(1)

+-------------------+-------------------+
|to_date(2016-20-12)|to_date(2017-12-11)|
+-------------------+-------------------+
|               NULL|         2017-12-11|
+-------------------+-------------------+
only showing top 1 row



In [17]:
from pyspark.sql.functions import to_date

# Explicit pattern: year-day-month (note the day comes before the month).
dateFormat = "yyyy-dd-MM"

cleanDateDF = spark.range(1).select(
# "2016-12-11" is read as day 12, month 11 -> 2016-11-12.
to_date(lit("2016-12-11"),dateFormat).alias("date"),
to_date(lit("2016-10-21"),dateFormat).alias("date2"), # NULL: read as day 10, month 21, and there is no 21st month; no error is thrown
)

cleanDateDF.show(1)

+----------+-----+
|      date|date2|
+----------+-----+
|2016-11-12| NULL|
+----------+-----+



In [18]:
from pyspark.sql.functions import to_timestamp

# The format argument of to_timestamp is optional in the PySpark API.
# NOTE(review): `date` was already produced by to_date, and the result keeps
# 2016-11-12, so the pattern does not appear to re-parse the value here — confirm.
cleanDateDF.select(to_timestamp(col("date"),dateFormat)).show(1) # dateFormat is passed explicitly here

+------------------------------+
|to_timestamp(date, yyyy-dd-MM)|
+------------------------------+
|           2016-11-12 00:00:00|
+------------------------------+



In [19]:
# Compare the date column against a date-formatted string literal.
cleanDateDF.filter(col("date")<lit("2020-01-01")).show(1)

+----------+-----+
|      date|date2|
+----------+-----+
|2016-11-12| NULL|
+----------+-----+



In [20]:
from pyspark.sql.functions import coalesce

# coalesce() yields the first non-null argument per row: date2 is NULL in this
# frame, so the value of date is returned.
cleanDateDF.select(coalesce("date2","date"))

"coalesce(date2, date)"
2016-11-12


In [21]:
from pyspark.sql.functions import ifnull
# lit(None) produces a NULL literal, so ifnull falls back to its second argument.
df.select(ifnull(lit(None),lit("happy"))).show(1)   # use Python's None to create a NULL manually

+-------------------+
|ifnull(NULL, happy)|
+-------------------+
|              happy|
+-------------------+
only showing top 1 row



In [22]:
# "all" drops a row only when every column is NULL; this row has a non-null
# `date`, so it is kept.
cleanDateDF.na.drop("all")

date,date2
2016-11-12,


In [23]:
from pyspark.sql.functions import struct
# Pack two columns into a single struct column.
complexDF = df.select(struct("number","code").alias("complexColumn"))
# A single field can be picked with dot notation:
# complexDF.select("complexColumn.code")
# "complexColumn.*" expands the struct's fields back into top-level columns.
complexDF.select("complexColumn.*").show(3)

+------+----+
|number|code|
+------+----+
|    44| HAM|
|  NULL| HEI|
|     6| ROS|
+------+----+
only showing top 3 rows



In [24]:
from pyspark.sql.functions import split
# Split the name on spaces into an array column and keep its first element.
df.select(split("name"," ").alias("split_col")).selectExpr("split_col[0]").show(3)

+------------+
|split_col[0]|
+------------+
|       Lewis|
|        Nick|
|        Nico|
+------------+
only showing top 3 rows



In [25]:
from pyspark.sql.functions import split,explode
# df.select(split("name"," ").alias("split_col")).selectExpr("split_col[0]")
# Keep driver_id next to the array of name parts...
split_df = df.select("driver_id",split("name"," ").alias("split_col"))
# ...then explode() emits one row per array element, repeating driver_id.
split_df.select("driver_id",explode("split_col")).show(3)

+---------+--------+
|driver_id|     col|
+---------+--------+
|        1|   Lewis|
|        1|Hamilton|
|        2|    Nick|
+---------+--------+
only showing top 3 rows



In [26]:
from pyspark.sql.functions import create_map

# Maps are created with create_map from key and value columns.
# A value can then be selected by its key, much like indexing into an array.

# Each row gets a one-entry map of driver_id -> name; only three rows are kept.
map_df = df.select("driver_id","code",create_map(col("driver_id"),col("name")).alias("map_col")).limit(3)
map_df.show()
# Confirm that the result is still a DataFrame.
type(map_df)

+---------+----+--------------------+
|driver_id|code|             map_col|
+---------+----+--------------------+
|        1| HAM|{1 -> Lewis Hamil...|
|        2| HEI|{2 -> Nick Heidfeld}|
|        3| ROS| {3 -> Nico Rosberg}|
+---------+----+--------------------+



pyspark.sql.dataframe.DataFrame

In [27]:
# Look up key 1 in each row's map; only the first row's map holds that key.
map_df.select(map_df.map_col[1])  #displays for all the 3 rows

map_col[1]
Lewis Hamilton
""
""


In [28]:
# Filter before projecting: the original filtered on `map_col` after it had
# been projected away, which only works because Spark resolves the missing
# column from the parent plan. Filtering while `map_col` is still a column
# keeps the result the same without relying on that.
map_df.where(map_df.map_col[1].isNotNull()).select(map_df.map_col[1])  # only rows whose map contains key 1

map_col[1]
Lewis Hamilton


In [29]:
# Exploding a map column yields one row per entry.
map_df.select("code",explode("map_col"))  # the output columns are automatically named "key" and "value"

code,key,value
HAM,1,Lewis Hamilton
HEI,2,Nick Heidfeld
ROS,3,Nico Rosberg


In [30]:
# selectExpr takes SQL expression strings; here the constant 1 on a one-row frame.
spark.range(1).selectExpr("1")

1
1


In [31]:
# One-row frame with a plain text column and a column holding a JSON document
# as a string (both built from SQL string literals).
jsonDF = spark.range(1).selectExpr("'col_text' as col_name","""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")
jsonDF.show(truncate=False) #to show the complete column without truncation

+--------+-------------------------------------------+
|col_name|jsonString                                 |
+--------+-------------------------------------------+
|col_text|{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}|
+--------+-------------------------------------------+



In [32]:
from pyspark.sql.functions import json_tuple, get_json_object

# get_json_object queries a JSON string inline with a path expression, whether
# the target is an object or an array element. json_tuple is enough when the
# value sits at a single level of nesting.

# jsonDF.select(get_json_object("jsonString","$.myJSONKey.myJSONValue")).show(truncate=False)
(
    jsonDF
    .select(
        get_json_object("jsonString", "$.myJSONKey.myJSONValue[1]").alias("col1"),
        json_tuple("jsonString", "myJSONKey").alias("col2"),
    )
    .show(truncate=False)
)

+----+-----------------------+
|col1|col2                   |
+----+-----------------------+
|2   |{"myJSONValue":[1,2,3]}|
+----+-----------------------+



#### User-Defined Functions

In [33]:
# Five-row frame from spark.range(5), with its single column named "num".
udfDF = spark.range(5).toDF("num")

def power3(value):
    """Return ``value`` raised to the third power."""
    cubed = value ** 3
    return cubed

# Quick check of the plain Python function on a single value.
power3(2)

8

In [34]:
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

# udf() uses a string return type unless one is given, which would turn the
# numeric result into a string column. Declare LongType to match the integer
# that power3 returns for the integer values in `num`.
power3udf = udf(power3, LongType())
udfDF.select(power3udf("num"))

                                                                                

power3(num)
0
1
8
27
64


In [35]:
# Registering the function under a name makes it callable from SQL expression
# strings, which was not possible with the udf() wrapper alone.

from pyspark.sql.types import LongType, StringType  # StringType kept in case later cells use it
# Declare LongType to match the integer that power3 returns; registering it as
# StringType made the cubed value come back as a string column.
spark.udf.register("power3udfSQL", power3, LongType())
udfDF.selectExpr("power3udfSQL(num)")

                                                                                

power3udfSQL(num)
0
1
8
27
64
