In [1]:
# Check env vars: list every environment entry that mentions SPARK or PYTHON
# (grep matches against the whole NAME=value line, so values count as well as names)
!env | grep -e "SPARK" -e "PYTHON"

PYSPARK_DRIVER_PYTHON=/Users/c11309a/.local/share/rtx/installs/python/3.10/bin/python
PYSPARK_PYTHON=/Users/c11309a/.local/share/rtx/installs/python/3.10/bin/python
PYTHONPATH=/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/pyspark.zip:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip:/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3/python/lib/*.zip:
SPARK_HOME=/Users/c11309a/Tools/spark-3.3.4-bin-hadoop3
PYTHONUNBUFFERED=1
PYTHONIOENCODING=utf-8
PYDEVD_IPYTHON_COMPATIBLE_DEBUGGING=1


In [2]:
# Create a spark session
from pyspark.sql import SparkSession

# Build a session named "learn_dataframes" on the local[4] master,
# or reuse the active one if it already exists.
spark = (
    SparkSession.builder
    .appName("learn_dataframes")
    .master("local[4]")
    .getOrCreate()
)

# Keep a handle on the session's SparkContext.
sc = spark.sparkContext

# Last expression of the cell: display the session.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/12/27 17:37:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/27 17:37:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Create a udf
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, IntegerType

def get_first_letter(word):
    """Return the first character of ``word``.

    Args:
        word: the string to inspect; may be ``None`` or empty.

    Returns:
        The first character, or ``None`` when ``word`` is ``None`` or empty.
        Returning ``None`` instead of raising lets a UDF built on this
        function emit a null for such rows rather than failing the job.
    """
    # Covers both None (not subscriptable) and "" (no index 0)
    if not word:
        return None
    return word[0]

# Wrap the plain Python function as a UDF whose declared return type is StringType
get_first_letter_udf = udf(get_first_letter, StringType())

In [4]:
# Create a dataframe
from pyspark.sql.functions import col

# Sample words (duplicates are intentional); each becomes a one-column row
words = ["cat", "elephant", "rat", "rat", "cat"]

df = spark.createDataFrame([(word,) for word in words], ["word"])

df.show()

[Stage 0:>                                                          (0 + 1) / 1]

+--------+
|    word|
+--------+
|     cat|
|elephant|
|     rat|
|     rat|
|     cat|
+--------+



                                                                                

In [5]:
# Derive a "first_letter" column by applying the UDF to "word", then display the result
first_letter_col = get_first_letter_udf(col("word"))
df.withColumn("first_letter", first_letter_col).show()

                                                                                

+--------+------------+
|    word|first_letter|
+--------+------------+
|     cat|           c|
|elephant|           e|
|     rat|           r|
|     rat|           r|
|     cat|           c|
+--------+------------+



In [7]:
# Create a dataframe whose single full_address column holds street, city, state and zip in one string
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# One unparsed address per row, in the form "street, city state zip"
full_addresses = [
    "123 Main St, Buffalo NY 14201",
    "456 Pine St, Bellingham WA 98226",
    "789 Maple St, Sacramento CA 94203",
]

df = spark.createDataFrame(
    [(full_address,) for full_address in full_addresses],
    ["full_address"],
)

df.show(truncate=False)

+---------------------------------+
|full_address                     |
+---------------------------------+
|123 Main St, Buffalo NY 14201    |
|456 Pine St, Bellingham WA 98226 |
|789 Maple St, Sacramento CA 94203|
+---------------------------------+



In [8]:
# Create a udf to parse the full_address column into a struct with street, city, state, zip
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def parse_address(full_address):
    """Split a "street, city state zip" string into its four components.

    Args:
        full_address: address text such as "123 Main St, Buffalo NY 14201",
            or ``None``.

    Returns:
        A ``(street, city, state, zip)`` tuple of strings, or ``None`` when
        ``full_address`` is ``None``.

    Raises:
        ValueError: if the text has no comma, or fewer than three
            whitespace-separated tokens after the last comma.
    """
    if full_address is None:
        return None

    # Split on the LAST comma so a street that itself contains commas
    # ("5 Oak Ave, Apt 2") stays in one piece.
    street, city_state = full_address.rsplit(",", 1)

    # Peel state and zip off the right-hand side so multi-word cities
    # ("New York", "Salt Lake City") stay in one piece.
    city, state, zip_code = city_state.strip().rsplit(None, 2)

    return (street.strip(), city, state, zip_code)

# Struct returned by the UDF: four nullable string fields
address_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("street", "city", "state", "zip")
])

parse_address_udf = udf(parse_address, address_schema)

# Parse every address into a struct column, then expand that struct so the
# resulting dataframe has the four parts as separate columns
df = df.withColumn("address", parse_address_udf(col("full_address")))
df = df.select("address.*")

df.show(truncate=False)

+------------+----------+-----+-----+
|street      |city      |state|zip  |
+------------+----------+-----+-----+
|123 Main St |Buffalo   |NY   |14201|
|456 Pine St |Bellingham|WA   |98226|
|789 Maple St|Sacramento|CA   |94203|
+------------+----------+-----+-----+



In [19]:
# Read the YAML file with the fixed-width field definitions
import yaml
# The field layout is stored under the top-level "fields" key of the YAML file.
# The encoding is given explicitly so the read does not depend on the platform
# locale, and there is no pre-assigned empty list: if the read fails, the name
# stays undefined instead of silently holding an empty layout.
with open("data/fwf_fields.yaml", "r", encoding="utf-8") as f:
    fwf_fields = yaml.safe_load(f)["fields"]

fwf_fields

[{'name': 'id', 'length': 5, 'type': 'integer'},
 {'name': 'name', 'length': 10, 'type': 'string'},
 {'name': 'age', 'length': 3, 'type': 'integer'}]

In [53]:
# Build schema and list of field lengths
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DataType

# Empty schema; one StructField per field definition is added by the loop below
schema = StructType()
# Collects (StructField, length) pairs in the order the fields are defined
fields = []

def get_type(type: str) -> DataType:
    """Translate a field-definition type name into a Spark data type.

    "integer" yields IntegerType; "string" and every other name yield
    StringType.
    """
    return IntegerType() if type == "integer" else StringType()

# Register every field definition in the schema and record it,
# paired with its width, in the fields list
for fdef in fwf_fields:
    field = StructField(fdef["name"], get_type(fdef["type"]), True)
    schema.add(field)
    fields.append((field, fdef["length"]))

print(schema)
fields

StructType([StructField('id', IntegerType(), True), StructField('name', StringType(), True), StructField('age', IntegerType(), True)])


[(StructField('id', IntegerType(), True), 5),
 (StructField('name', StringType(), True), 10),
 (StructField('age', IntegerType(), True), 3)]

In [54]:
# Create udf from lengths list that parses a line into a list of values
from typing import List, Tuple

def parse_line_factory(fields: "List[Tuple[StructField, int]]"):
    """Build a parser for one line of a fixed-width file.

    Args:
        fields: ordered ``(StructField, width)`` pairs describing the
            columns from left to right. Only ``StructField.dataType`` and
            the width are used.

    Returns:
        A function ``parse_line(line)`` that returns one value per field:
        integral fields (byte/short/integer/long) are converted with
        ``int``, all others are returned as stripped strings. A blank or
        missing integral column yields ``None`` instead of raising, and a
        ``None`` line yields ``None``.

    Raises:
        ValueError: (from the returned function) when a non-blank integral
            column does not contain a valid integer.
    """
    # Type names whose column text is converted with int()
    integral_type_names = ("byte", "short", "integer", "long")

    # Work out each column's slice bounds and conversion once, up front,
    # rather than again for every parsed line.
    layout = []
    offset = 0
    for field, width in fields:
        is_integral = field.dataType.typeName() in integral_type_names
        layout.append((offset, offset + width, is_integral))
        offset += width

    def parse_line(line: str) -> list:
        # A null input row produces a null result instead of a crash
        if line is None:
            return None

        values = []
        for start, end, is_integral in layout:
            # Slicing past the end of a short line simply yields ""
            text = line[start:end].strip()
            if is_integral:
                # int("") would raise; treat an empty column as missing
                values.append(int(text) if text else None)
            else:
                values.append(text)
        return values

    return parse_line

# Wrap the generated parser as a UDF whose declared return type is `schema`
parse_line_udf = udf(parse_line_factory(fields), schema)

In [55]:
# Test parse_line function: call the plain Python parser directly on one sample line
# (this bypasses the UDF wrapper, so it checks only the slicing and conversion logic)
parse_line_factory(fields)("1    John Doe  025")

[1, 'John Doe', 25]

In [61]:
# Create a dataframe from the fixed width file data/people.txt
# Each line of the file arrives unparsed in the single "value" column shown below.
df = spark.read.text("data/people.txt")
df.show(truncate=False)

+------------------+
|value             |
+------------------+
|1    John Doe  025|
|2    Jane Doe  030|
|3    Jim Smith 035|
|4    Ann Brown 040|
|5    Tom Davis 045|
+------------------+



In [62]:
# Use the udf to parse the lines into a dataframe
# The struct returned by the UDF lands in a new "parsed" column next to the raw "value" column.
# NOTE(review): `df` is rebound here; once a later cell has dropped "value",
# re-running this cell fails — consider distinct names per stage.
df = df.withColumn("parsed", parse_line_udf(col("value")))
df.show(truncate=False)

+------------------+------------------+
|value             |parsed            |
+------------------+------------------+
|1    John Doe  025|{1, John Doe, 25} |
|2    Jane Doe  030|{2, Jane Doe, 30} |
|3    Jim Smith 035|{3, Jim Smith, 35}|
|4    Ann Brown 040|{4, Ann Brown, 40}|
|5    Tom Davis 045|{5, Tom Davis, 45}|
+------------------+------------------+



In [63]:
# Flatten the "parsed" struct so each of its fields becomes a top-level column
df = df.select("parsed.*")
df.show(truncate=False)

+---+---------+---+
|id |name     |age|
+---+---------+---+
|1  |John Doe |25 |
|2  |Jane Doe |30 |
|3  |Jim Smith|35 |
|4  |Ann Brown|40 |
|5  |Tom Davis|45 |
+---+---------+---+

