In [1]:
# Check env vars: print every environment entry whose line contains "SPARK" or "PYTHON".
# grep filters the full NAME=value lines emitted by `env` and the match is case-sensitive.
!env | grep -e "SPARK" -e "PYTHON"

PYSPARK_DRIVER_PYTHON=/Users/c11309a/.local/share/rtx/installs/python/3.10/bin/python
PYSPARK_PYTHON=/Users/c11309a/.local/share/rtx/installs/python/3.10/bin/python
PYTHONPATH=/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/pyspark.zip:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/*.zip:
SPARK_HOME=/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3
PYTHONUNBUFFERED=1
PYTHONIOENCODING=utf-8
PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING=1


In [2]:
# Add scrollbars to data for display: stop <pre> output from wrapping so wide
# tables stay on one line per row and the output area scrolls horizontally.
# Both names come from the public IPython.display module rather than the
# internal IPython.core.display path.
from IPython.display import HTML, display

# Inject a <style> element into the notebook page; !important lets this rule
# win over an existing `white-space` rule for <pre>.
display(HTML("<style>pre { white-space: pre !important; }</style>"))

In [3]:
# Build (or reuse) the Spark session used by the rest of the notebook
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("learn_dataframes")
    .getOrCreate()
)

# Keep a handle on the session's underlying SparkContext
sc = spark.sparkContext

# Last expression of the cell, so the notebook renders the session object
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/12/27 15:21:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Load the headerless cities CSV into a DataFrame, typing the columns with a
# DDL-style schema string instead of letting Spark infer them
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

schema = "City STRING, State STRING, Country STRING, ZipCode LONG, Population INT"

# The file has no header row, so column names come from the schema above
citiesDF = spark.read.csv("data/cities.csv", schema=schema, header=False)

citiesDF.show()

+-------------+-----+-------+-------+----------+
|         City|State|Country|ZipCode|Population|
+-------------+-----+-------+-------+----------+
|      Seattle|   WA|    USA|  98101|    652405|
|   Bellingham|   WA|    USA|  98225|     82235|
|     Portland|   OR|    USA|  97201|    609456|
|       Eugene|   OR|    USA|  97401|    221452|
|San Francisco|   CA|    USA|  94101|    837442|
|  Los Angeles|   CA|    USA|  90001|   3884307|
+-------------+-----+-------+-------+----------+



In [5]:
# Register the DataFrame as the temp view "cities" so it can be queried with
# SQL, then keep only the rows whose State is 'WA'
citiesDF.createOrReplaceTempView("cities")

wa_query = "SELECT * FROM cities WHERE State = 'WA'"
citiesWA = spark.sql(wa_query)

citiesWA.show()

+----------+-----+-------+-------+----------+
|      City|State|Country|ZipCode|Population|
+----------+-----+-------+-------+----------+
|   Seattle|   WA|    USA|  98101|    652405|
|Bellingham|   WA|    USA|  98225|     82235|
+----------+-----+-------+-------+----------+



In [6]:
# Save this data to a managed table (no path is given, so the storage
# location is left to Spark).
# mode("overwrite") replaces an existing `citieswa` table; with the default
# save mode a second run of this cell fails because the table already exists.
citiesWA.write.mode("overwrite").saveAsTable("citieswa")

# Select from the managed table to confirm the rows were written
spark.sql("SELECT * FROM citieswa").show()

                                                                                

+----------+-----+-------+-------+----------+
|      City|State|Country|ZipCode|Population|
+----------+-----+-------+-------+----------+
|   Seattle|   WA|    USA|  98101|    652405|
|Bellingham|   WA|    USA|  98225|     82235|
+----------+-----+-------+-------+----------+



In [7]:
# Show the managed table's columns and types plus the extra rows that
# DESCRIBE ... EXTENDED adds; truncate=False keeps long values readable
(
    spark.sql("DESCRIBE TABLE EXTENDED citieswa")
    .show(truncate=False)
)

+----------------------------+-------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                            |comment|
+----------------------------+-------------------------------------------------------------------------------------+-------+
|City                        |string                                                                               |null   |
|State                       |string                                                                               |null   |
|Country                     |string                                                                               |null   |
|ZipCode                     |bigint                                                                               |null   |
|Population                  |int                                                                                  |null   |


In [8]:
# Save this data to an unmanaged table by specifying the data location
import os

(
    citiesWA.write
        # The output format is selected with .format(); an option named
        # "format" is only passed through as a data source option and does not choose it.
        .format("parquet")
        # An explicit path is what makes the table unmanaged: the files live
        # under ./output/citieswa relative to the current working directory.
        .option("path", f"{os.getcwd()}/output/citieswa")
        .mode("overwrite")
        .saveAsTable("citieswaunmanaged")
)

# Select from the unmanaged table
spark.sql("SELECT * FROM citieswaunmanaged").show()

+----------+-----+-------+-------+----------+
|      City|State|Country|ZipCode|Population|
+----------+-----+-------+-------+----------+
|   Seattle|   WA|    USA|  98101|    652405|
|Bellingham|   WA|    USA|  98225|     82235|
+----------+-----+-------+-------+----------+



In [9]:
# Show the unmanaged table's columns and types plus the extra rows that
# DESCRIBE ... EXTENDED adds; truncate=False keeps long values readable
(
    spark.sql("DESCRIBE TABLE EXTENDED citieswaunmanaged")
    .show(truncate=False)
)

+----------------------------+------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                     |comment|
+----------------------------+------------------------------------------------------------------------------+-------+
|City                        |string                                                                        |null   |
|State                       |string                                                                        |null   |
|Country                     |string                                                                        |null   |
|ZipCode                     |bigint                                                                        |null   |
|Population                  |int                                                                           |null   |
|                            |                          

In [10]:
# Drop both tables, note that the underlying data for the managed table is also
# deleted, but not for the unmanaged table (its files stay under output/citieswa).
# IF EXISTS keeps this cell re-runnable: dropping a table that is already gone
# is a no-op instead of an error.
spark.sql("DROP TABLE IF EXISTS citieswa")
spark.sql("DROP TABLE IF EXISTS citieswaunmanaged")

DataFrame[]

In [11]:
# Register `citieswa` with plain SQL, pointing it at the parquet files under
# ./output/citieswa (relies on `os` having been imported by an earlier cell)
table_location = f"{os.getcwd()}/output/citieswa"

spark.sql(f"""
CREATE TABLE citieswa
USING parquet
LOCATION "{table_location}"
""")

DataFrame[]

In [12]:
# Show the SQL-created table's columns and types plus the extra rows that
# DESCRIBE ... EXTENDED adds; truncate=False keeps long values readable
(
    spark.sql("DESCRIBE TABLE EXTENDED citieswa")
    .show(truncate=False)
)

+----------------------------+------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                     |comment|
+----------------------------+------------------------------------------------------------------------------+-------+
|City                        |string                                                                        |null   |
|State                       |string                                                                        |null   |
|Country                     |string                                                                        |null   |
|ZipCode                     |bigint                                                                        |null   |
|Population                  |int                                                                           |null   |
|                            |                          

In [13]:
# Read every row back from `citieswa`, printing values without truncation
(
    spark.sql("SELECT * FROM citieswa")
    .show(truncate=False)
)

+----------+-----+-------+-------+----------+
|City      |State|Country|ZipCode|Population|
+----------+-----+-------+-------+----------+
|Seattle   |WA   |USA    |98101  |652405    |
|Bellingham|WA   |USA    |98225  |82235     |
+----------+-----+-------+-------+----------+



In [14]:
# Append the rows of the citiesWA DataFrame to the `citieswa` table. The
# DataFrame is exposed to SQL through a temp view so INSERT ... SELECT can read it.
citiesWA.createOrReplaceTempView("citieswa_view")

append_sql = """
INSERT INTO table citieswa SELECT * FROM citieswa_view
"""
spark.sql(append_sql)

DataFrame[]

In [15]:
# Query `citieswa` again so the rows added by the INSERT show up in the output
(
    spark.sql("SELECT * FROM citieswa")
    .show(truncate=False)
)

+----------+-----+-------+-------+----------+
|City      |State|Country|ZipCode|Population|
+----------+-----+-------+-------+----------+
|Seattle   |WA   |USA    |98101  |652405    |
|Bellingham|WA   |USA    |98225  |82235     |
|Seattle   |WA   |USA    |98101  |652405    |
|Bellingham|WA   |USA    |98225  |82235     |
+----------+-----+-------+-------+----------+

