In [8]:
# Check env vars: list every environment entry (NAME=value) that contains
# "SPARK" or "PYTHON" anywhere in the line
!env | grep -e "SPARK" -e "PYTHON"

PYSPARK_DRIVER_PYTHON=/Users/c11309a/.local/share/rtx/installs/python/3.10/bin/python
PYSPARK_PYTHON=/Users/c11309a/.local/share/rtx/installs/python/3.10/bin/python
PYTHONPATH=/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/pyspark.zip:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/*.zip:
SPARK_HOME=/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3
PYTHONUNBUFFERED=1
PYTHONIOENCODING=utf-8
PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING=1


In [9]:
# Start (or reuse) the SparkSession used throughout this notebook
from pyspark.sql import SparkSession

# Local master with 4 threads; dynamic allocation and adaptive query
# execution are both switched off.
sessionBuilder = SparkSession.builder.appName("learn_dataframes").master("local[4]")
sessionBuilder = sessionBuilder.config("spark.dynamicAllocation.enabled", "false")
sessionBuilder = sessionBuilder.config("spark.sql.adaptive.enabled", "false")
spark = sessionBuilder.getOrCreate()

# Keep a handle on the underlying SparkContext (needed for sc.parallelize)
sc = spark.sparkContext

# Last expression of the cell: render the session as the cell output
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/12/27 11:04:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
# Add scrollbars to data for display: stop <pre> output from wrapping so that
# wide tables keep their row layout.
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated and emits a warning.
from IPython.display import display, HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

  from IPython.core.display import display, HTML


In [11]:
# Sample employee records; each row is [id, name, salary]
data = [
    [1, "John", 10000], [2, "Jane", 15000], [3, "Joe", 5000],
    [4, "Mary", 20000], [5, "Mike", 25000],
]

# Hand the local list to the SparkContext to obtain an RDD of employees
employeesRdd = sc.parallelize(data)

In [14]:
# Turn the RDD into a DataFrame with named columns, then print its rows
columnNames = ["id", "name", "salary"]
employeesDf = employeesRdd.toDF(columnNames)
employeesDf.show()

+---+----+------+
| id|name|salary|
+---+----+------+
|  1|John| 10000|
|  2|Jane| 15000|
|  3| Joe|  5000|
|  4|Mary| 20000|
|  5|Mike| 25000|
+---+----+------+



In [15]:
# Print the DataFrame schema as a tree: column name, type and nullability
employeesDf.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



In [17]:
# Create a DataFrame from the local collection directly (no intermediate RDD).
# The result is bound to `employeesDf`, not `employeesRdd`: it is a DataFrame,
# and reusing the RDD's name would overwrite the RDD built by sc.parallelize,
# so re-running the `employeesRdd.toDF(...)` cell would no longer see an RDD.
employeesDf = spark.createDataFrame(data, ["id", "name", "salary"])

employeesDf.show()

+---+----+------+
| id|name|salary|
+---+----+------+
|  1|John| 10000|
|  2|Jane| 15000|
|  3| Joe|  5000|
|  4|Mary| 20000|
|  5|Mike| 25000|
+---+----+------+



In [18]:
# Load a DataFrame from a CSV file: the first line supplies the column names
# and the column types are inferred from the data
csvPath = "data/employees.csv"
employeesDf = spark.read.csv(
    csvPath,
    header=True,
    inferSchema=True,
)
employeesDf.show()

+---+-----+------+
| Id| Name|Salary|
+---+-----+------+
|  1| John| 10000|
|  2|Smith| 20000|
|  3| Mark| 30000|
|  4|David| 40000|
|  5| Paul| 50000|
+---+-----+------+



In [19]:
# Check the schema of the DataFrame (column names, types, nullability)
employeesDf.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [20]:
# Load a tab-separated file with the CSV reader by overriding the separator;
# header row and schema inference are enabled as for the comma-separated file
tsvPath = "data/employees.tsv"
employeesDf = spark.read.csv(
    tsvPath,
    sep="\t",
    header=True,
    inferSchema=True,
)
employeesDf.show()

+---+-----+------+
| Id| Name|Salary|
+---+-----+------+
|  1| John| 10000|
|  2|Smith| 20000|
|  3| Mark| 30000|
|  4|David| 40000|
|  5| Paul| 50000|
+---+-----+------+



In [21]:
# Supply an explicit schema instead of letting Spark infer one
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Three nullable columns; salary is declared as a double
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("salary", DoubleType(), True)
)

# header=True: the first line of the file is a header row, not data
employeesDf = spark.read.csv("data/employees.csv", schema=schema, header=True)
employeesDf.show()

+---+-----+-------+
| id| name| salary|
+---+-----+-------+
|  1| John|10000.0|
|  2|Smith|20000.0|
|  3| Mark|30000.0|
|  4|David|40000.0|
|  5| Paul|50000.0|
+---+-----+-------+



In [24]:
# Load a JSON file whose records span several lines (multiLine mode)
routesPath = "data/taxi_routes.json"
routesDf = spark.read.json(routesPath, multiLine=True)
routesDf.show()

+---------+----------+--------------------+------+--------+--------------------+
| distance|  duration|           end_point|  fare|route_id|         start_point|
+---------+----------+--------------------+------+--------+--------------------+
|5.6 miles|20 minutes|{40.768437, -73.9...|$15.00|       1|{40.712776, -74.0...|
|0.7 miles| 5 minutes|{40.748817, -73.9...| $4.00|       2|{40.758896, -73.9...|
|0.0 miles| 0 minutes|{40.748817, -73.9...| $2.50|       3|{40.748817, -73.9...|
|0.0 miles| 0 minutes|{40.748817, -73.9...| $2.50|       4|{40.748817, -73.9...|
|0.0 miles| 0 minutes|{40.748817, -73.9...| $2.50|       5|{40.748817, -73.9...|
|0.0 miles| 0 minutes|{40.748817, -73.9...| $2.50|       6|{40.748817, -73.9...|
|0.0 miles| 0 minutes|{40.748817, -73.9...| $2.50|       7|{40.748817, -73.9...|
+---------+----------+--------------------+------+--------+--------------------+



In [25]:
# Print the schema of the routes DataFrame, including nested struct fields
routesDf.printSchema()

root
 |-- distance: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- end_point: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- fare: string (nullable = true)
 |-- route_id: string (nullable = true)
 |-- start_point: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)



In [26]:
# Define our own schema for the JSON file and read the file using it
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# start_point and end_point have the same shape (a nullable latitude/longitude
# pair of doubles), so the struct is defined once and reused for both fields.
pointType = StructType([
    StructField("latitude", DoubleType(), True),
    StructField("longitude", DoubleType(), True)
])

# Field order here determines the column order of the resulting DataFrame
schema = StructType([
    StructField("route_id", StringType(), True),
    StructField("start_point", pointType, True),
    StructField("end_point", pointType, True),
    StructField("distance", StringType(), True),
    StructField("duration", StringType(), True),
    StructField("fare", StringType(), True)
])

taxiRoutesDf = spark.read.json("data/taxi_routes.json", multiLine=True, schema=schema)

taxiRoutesDf.show()

+--------+--------------------+--------------------+---------+----------+------+
|route_id|         start_point|           end_point| distance|  duration|  fare|
+--------+--------------------+--------------------+---------+----------+------+
|       1|{40.712776, -74.0...|{40.768437, -73.9...|5.6 miles|20 minutes|$15.00|
|       2|{40.758896, -73.9...|{40.748817, -73.9...|0.7 miles| 5 minutes| $4.00|
|       3|{40.748817, -73.9...|{40.748817, -73.9...|0.0 miles| 0 minutes| $2.50|
|       4|{40.748817, -73.9...|{40.748817, -73.9...|0.0 miles| 0 minutes| $2.50|
|       5|{40.748817, -73.9...|{40.748817, -73.9...|0.0 miles| 0 minutes| $2.50|
|       6|{40.748817, -73.9...|{40.748817, -73.9...|0.0 miles| 0 minutes| $2.50|
|       7|{40.748817, -73.9...|{40.748817, -73.9...|0.0 miles| 0 minutes| $2.50|
+--------+--------------------+--------------------+---------+----------+------+



In [27]:
# Print the schema of the DataFrame that was read with the hand-written schema
taxiRoutesDf.printSchema()

root
 |-- route_id: string (nullable = true)
 |-- start_point: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- end_point: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- distance: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- fare: string (nullable = true)

