## Struct Vs Map Vs Array

In [11]:
from pyspark.sql import SparkSession
# Import only the functions this notebook uses. A wildcard import from
# pyspark.sql.functions silently shadows Python builtins such as sum, min,
# max and round with their Spark column counterparts.
from pyspark.sql.functions import col, explode

# Initialize Spark Session (getOrCreate reuses an already-running session if one exists)
spark = SparkSession.builder.appName("datatype").getOrCreate()
sc = spark.sparkContext

### StructType
- Like a mini table row nested inside a single column
- Requires a fixed schema: every row has the same set of named, typed fields

In [29]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Inner struct: every Person value carries the same two named, typed fields.
person_type = StructType([
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
])

# Outer row layout: the nested Person struct next to a plain string column.
schema = StructType([
    StructField("Person", person_type, True),
    StructField("City", StringType(), True),
])

# Each input row is (person_dict, city).
data = [
    ({"Name": "Alice", "Age": 30}, "New York"),
    ({"Name": "Bob", "Age": 25}, "Los Angeles"),
]

df = spark.createDataFrame(data, schema)

In [16]:
# Print all rows without truncating cell contents, then the nested schema tree.
df.show(truncate=False)
df.printSchema()

+-----------+-----------+
|Person     |City       |
+-----------+-----------+
|{Alice, 30}|New York   |
|{Bob, 25}  |Los Angeles|
+-----------+-----------+

root
 |-- Person: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Age: integer (nullable = true)
 |-- City: string (nullable = true)



In [32]:
# Accessing the nested fields: a dotted "Struct.field" path selects one field
# of the struct, and the resulting column is named after that field.
df.select("Person.Name", "Person.Age", "City").show()

+-----+---+-----------+
| Name|Age|       City|
+-----+---+-----------+
|Alice| 30|   New York|
|  Bob| 25|Los Angeles|
+-----+---+-----------+



In [33]:
# Same projection as the dotted-string select, but built from Column objects
# so each output column is given an explicit name.
df.select(
    col("Person.Name").alias("Name"),
    col("Person.Age").alias("Age"),
    "City",
).show()

+-----+---+-----------+
| Name|Age|       City|
+-----+---+-----------+
|Alice| 30|   New York|
|  Bob| 25|Los Angeles|
+-----+---+-----------+



In [31]:
# Column-API access to struct fields. getField is the accessor documented for
# struct columns; getItem is documented for array positions and map keys.
# The aliases keep the output column names identical to the dotted-path form.
df.select(
    col("Person").getField("Name").alias("Name"),
    col("Person").getField("Age").alias("Age"),
    "City",
).show()

+-----+---+-----------+
| Name|Age|       City|
+-----+---+-----------+
|Alice| 30|   New York|
|  Bob| 25|Los Angeles|
+-----+---+-----------+



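Checking whether struct fields can vary across rows. They cannot: the schema is fixed, so a field missing from a row's input (Bob has no `Birthdate`) is filled with `NULL` rather than dropped.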

In [13]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
import datetime

# Person now declares a third field, Birthdate, which only some input rows supply.
person_type = (
    StructType()
    .add("Name", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Birthdate", DateType(), True)
)
schema = (
    StructType()
    .add("Person", person_type, True)
    .add("City", StringType(), True)
)

alice = {"Name": "Alice", "Age": 30, "Birthdate": datetime.date(1994, 5, 10)}
bob = {"Name": "Bob", "Age": 25}  # no Birthdate key in this row's input
data = [(alice, "New York"), (bob, "Los Angeles")]

df = spark.createDataFrame(data, schema)

In [14]:
# Default show() shortens long cell values (note the "..." in the first row);
# the row whose input had no Birthdate key displays NULL for that field.
df.show()

+--------------------+-----------+
|              Person|       City|
+--------------------+-----------+
|{Alice, 30, 1994-...|   New York|
|     {Bob, 25, NULL}|Los Angeles|
+--------------------+-----------+



### MapType

In [35]:
from pyspark.sql.types import MapType

# Attributes is a string -> string map column.
attributes_type = MapType(StringType(), StringType())
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Attributes", attributes_type, True),
])

# Bob carries an extra "hobby" key that Alice lacks; both rows fit the same schema.
data = [
    ("Alice", {"height": "5.5", "weight": "60"}),
    ("Bob", {"height": "6.0", "weight": "70", "hobby": "cycling"}),
]
df = spark.createDataFrame(data, schema)

# Untruncated rows first, then the schema tree showing the map's key/value types.
df.show(truncate=False)
df.printSchema()

+-----+-----------------------------------------------+
|Name |Attributes                                     |
+-----+-----------------------------------------------+
|Alice|{weight -> 60, height -> 5.5}                  |
|Bob  |{weight -> 70, hobby -> cycling, height -> 6.0}|
+-----+-----------------------------------------------+

root
 |-- Name: string (nullable = true)
 |-- Attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [36]:
# Accessing the map values: the dotted path "Attributes.<key>" looks up that
# key in the map column.
df.select(
    "Name",
    col("Attributes.height").alias("Height"),
    col("Attributes.weight").alias("Weight"),
).show()

+-----+------+------+
| Name|Height|Weight|
+-----+------+------+
|Alice|   5.5|    60|
|  Bob|   6.0|    70|
+-----+------+------+



In [37]:

# Same lookup through the Column API: getItem(key) fetches the value stored
# under that key in the map.
attributes = col("Attributes")
df.select(
    "Name",
    attributes.getItem("height").alias("Height"),
    attributes.getItem("weight").alias("Weight"),
).show()

+-----+------+------+
| Name|Height|Weight|
+-----+------+------+
|Alice|   5.5|    60|
|  Bob|   6.0|    70|
+-----+------+------+



### ArrayType

In [25]:
from pyspark.sql.types import ArrayType

# Scores holds a list of integers per row.
scores_type = ArrayType(IntegerType())
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Scores", scores_type, True),
])

data = [("Alice", [85, 90, 88]), ("Bob", [78, 82, 84])]
df = spark.createDataFrame(data, schema)

# Untruncated rows first, then the schema tree showing the array's element type.
df.show(truncate=False)
df.printSchema()

+-----+------------+
|Name |Scores      |
+-----+------------+
|Alice|[85, 90, 88]|
|Bob  |[78, 82, 84]|
+-----+------------+

root
 |-- Name: string (nullable = true)
 |-- Scores: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [26]:
# Accessing array elements by zero-based position
first_score = col("Scores")[0].alias("FirstScore")
second_score = col("Scores")[1].alias("SecondScore")
df.select("Name", first_score, second_score).show()

# Explode the array: one output row per element, with Name repeated on each
exploded_score = explode("Scores").alias("Score")
df.select("Name", exploded_score).show()

+-----+----------+-----------+
| Name|FirstScore|SecondScore|
+-----+----------+-----------+
|Alice|        85|         90|
|  Bob|        78|         82|
+-----+----------+-----------+

+-----+-----+
| Name|Score|
+-----+-----+
|Alice|   85|
|Alice|   90|
|Alice|   88|
|  Bob|   78|
|  Bob|   82|
|  Bob|   84|
+-----+-----+



In [27]:
from pyspark.sql.types import ArrayType

schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Scores", ArrayType(IntegerType()), True)
)

# The two rows hold arrays of different lengths (3 elements vs 2).
alice_scores = [85, 90, 88]
bob_scores = [78, 82]
data = [("Alice", alice_scores), ("Bob", bob_scores)]
df = spark.createDataFrame(data, schema)

# Untruncated rows first, then the schema tree.
df.show(truncate=False)
df.printSchema()

+-----+------------+
|Name |Scores      |
+-----+------------+
|Alice|[85, 90, 88]|
|Bob  |[78, 82]    |
+-----+------------+

root
 |-- Name: string (nullable = true)
 |-- Scores: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [38]:
# Stop the Spark session now that all examples have run.
spark.stop()