# 04 More transformations

- Create a DataFrame from a Python list
- Add a computed column to the DataFrame
- Remove a column from the DataFrame
- Save the DataFrame as a table
- Run an SQL query against the table

In [1]:
# 1. Create a Spark session

from pyspark.sql import SparkSession

# getOrCreate() hands back the session that is already active in this kernel,
# or starts a new one with default settings if there is none.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Last expression of the cell: displays the Spark version in use.
spark.version

'4.0.1'

In [2]:
# 2. Create a DataFrame from a Python list and a schema

from datetime import date

# DDL-style schema string: column names and types, listed in tuple order.
students_schema = 'id long, name string, enrolment_date date, gpa double'

# One tuple per student; field positions line up with the schema above.
students_data = [
    (1, "Nikos Zikos", date(2025, 9, 1), 7.7),
    (2, "Maria Pappa", date(2025, 9, 2), 6.7),
    (3, "Petros Kokkinos", date(2025, 9, 1), 4.6),
]

students_df = spark.createDataFrame(data=students_data, schema=students_schema)
students_df.show()

+---+---------------+--------------+---+
| id|           name|enrolment_date|gpa|
+---+---------------+--------------+---+
|  1|    Nikos Zikos|    2025-09-01|7.7|
|  2|    Maria Pappa|    2025-09-02|6.7|
|  3|Petros Kokkinos|    2025-09-01|4.6|
+---+---------------+--------------+---+



In [5]:
# 3. Add a new column named "status" that will assume value "PASS" when gpa >= 5 and value "FAIL" otherwise

from pyspark.sql.functions import expr

# SQL CASE expression evaluated per row; anything that does not satisfy
# gpa >= 5 (including a null gpa) falls through to the ELSE branch.
status_rule = expr("case when gpa >= 5 then 'PASS' else 'FAIL' end")

# withColumn returns a new DataFrame; students_df itself is left unchanged.
students_df2 = students_df.withColumn("status", status_rule)

students_df2.printSchema()
students_df2.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- enrolment_date: date (nullable = true)
 |-- gpa: double (nullable = true)
 |-- status: string (nullable = false)

+---+---------------+--------------+---+------+
| id|           name|enrolment_date|gpa|status|
+---+---------------+--------------+---+------+
|  1|    Nikos Zikos|    2025-09-01|7.7|  PASS|
|  2|    Maria Pappa|    2025-09-02|6.7|  PASS|
|  3|Petros Kokkinos|    2025-09-01|4.6|  FAIL|
+---+---------------+--------------+---+------+



In [None]:
# 4. Drop column gpa

# drop() returns a new DataFrame without the named column;
# students_df2 still carries gpa.
column_to_remove = "gpa"
students_df3 = students_df2.drop(column_to_remove)

students_df3.printSchema()
students_df3.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- enrolment_date: date (nullable = true)
 |-- status: string (nullable = false)

+---+---------------+--------------+------+
| id|           name|enrolment_date|status|
+---+---------------+--------------+------+
|  1|    Nikos Zikos|    2025-09-01|  PASS|
|  2|    Maria Pappa|    2025-09-02|  PASS|
|  3|Petros Kokkinos|    2025-09-01|  FAIL|
+---+---------------+--------------+------+



In [7]:
# 5. Write dataframe as table "students_orc" using the ORC format

# mode="overwrite" replaces an existing table of the same name, which keeps
# this cell safe to re-run.
students_df3.write.saveAsTable("students_orc", format="orc", mode="overwrite")

In [None]:
# 6. Issue an SQL command that returns the names of the students that have status equal to PASS

# Without an ORDER BY, Spark gives no guarantee about the order in which rows
# are returned, so the displayed output could change between runs. Sorting by
# name makes the result reproducible.
passed_names_query = """
    select name
    from students_orc
    where status = 'PASS'
    order by name
"""

spark.sql(passed_names_query).show()


+-----------+
|       name|
+-----------+
|Maria Pappa|
|Nikos Zikos|
+-----------+



In [10]:
# 7. Ditto with the DataFrame API (use an LLM!)

from pyspark.sql.functions import col

# Same query as a method chain: read the saved table, keep the PASS rows,
# project the name column. orderBy("name") pins the row order, which Spark
# does not otherwise guarantee, so the displayed output is reproducible.
(
    spark.table("students_orc")
         .filter(col("status") == "PASS")
         .select("name")
         .orderBy("name")
         .show()
)

+-----------+
|       name|
+-----------+
|Maria Pappa|
|Nikos Zikos|
+-----------+

