#### **Struct**

- `struct()` is used to create a **new column** of type **StructType** by **combining multiple columns into a single struct column**.

- Use it when you are given two (or more) columns and want to change them from a **flat structure** to a **nested column structure**.

- **Combining Columns:**
  - The struct function can **combine multiple columns** into a **single struct column**.

- **Nested Structures:**
  - It allows for the creation of **nested structures**, which can be useful for **organizing related data**.
  - Use it when you want to **group** related columns together into a **single column with a nested structure**.

**Syntax:**

     struct()
     struct(*columns) --> columns (list, set, str or column)

     from pyspark.sql.functions import struct, col
 
     # Method 1:
     df = df.select(struct("f_name", "l_name").alias("name"))
 
     # Method 2:
     df = df.select(struct(["f_name", "l_name"]).alias("name"))
 
     # Method 3:
     df = df.select(struct([col("f_name"), col("l_name")]).alias("name"))
 
     # Method 4:
     columns = ("f_name", "l_name")
     df = df.select(struct(*columns).alias("name"))

     # Method 5 (withColumn: adds the struct as a new column and keeps the existing ones):
     df = df.withColumn("name", struct("f_name", "l_name"))

In [0]:
# Build a two-row demo DataFrame from in-memory tuples, then show its rows
# and the schema Spark derived for the two columns.
df = spark.createDataFrame(
    data=[("Alice", 2), ("Bob", 5)],
    schema=("name", "age"),
)
display(df)
df.printSchema()

name,age
Alice,2
Bob,5


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [0]:
# `struct` is imported in this cell because it runs before the notebook's main
# import cell; without it a fresh-kernel "Run All" stops here with a NameError.
from pyspark.sql.functions import struct

# Pack `age` and `name` into one struct column (aliased "struct") and return
# the rows as a list; each Row then holds a nested Row with both fields.
df.select(struct('age', 'name').alias("struct")).collect()

Out[17]: [Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))]

#### **How to convert flat structure to a nested column structure**

In [0]:
from pyspark.sql.functions import struct, col, from_json, to_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as f

In [0]:
# Sample rows: (serial number, first name, last name).
data = [
    (1, "Marry", "Terissa"),
    (2, "Kapil", "Sharma"),
    (3, "Niraj", "Gupta"),
    (4, "Amit", "Jain"),
]

# Explicit schema for the rows above; every field is declared non-nullable.
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("S.No", IntegerType()),
        ("First_Name", StringType()),
        ("Last_Name", StringType()),
    )
])

dff = spark.createDataFrame(data, schema=schema)
display(dff)

S.No,First_Name,Last_Name
1,Marry,Terissa
2,Kapil,Sharma
3,Niraj,Gupta
4,Amit,Jain


In [0]:
# withColumn variant: append a "Name" struct built from the two name columns
# while keeping every existing column of `dff`.
df1 = dff.withColumn(
    colName="Name",
    col=struct("First_Name", "Last_Name"),
)
display(df1)

S.No,First_Name,Last_Name,Name
1,Marry,Terissa,"List(Marry, Terissa)"
2,Kapil,Sharma,"List(Kapil, Sharma)"
3,Niraj,Gupta,"List(Niraj, Gupta)"
4,Amit,Jain,"List(Amit, Jain)"


In [0]:
# Select variant 01: column names are passed to struct() as separate string
# arguments; "*" keeps all original columns next to the new "Name" struct.
df2 = dff.select(
    "*",
    struct("First_Name", "Last_Name").alias("Name"),
)
display(df2)

S.No,First_Name,Last_Name,Name
1,Marry,Terissa,"List(Marry, Terissa)"
2,Kapil,Sharma,"List(Kapil, Sharma)"
3,Niraj,Gupta,"List(Niraj, Gupta)"
4,Amit,Jain,"List(Amit, Jain)"


In [0]:
# Select variant 02: column names are passed to struct() as a single list
# of strings; "*" keeps all original columns next to the new "Name" struct.
df3 = dff.select(
    "*",
    struct(["First_Name", "Last_Name"]).alias("Name"),
)
display(df3)

S.No,First_Name,Last_Name,Name
1,Marry,Terissa,"List(Marry, Terissa)"
2,Kapil,Sharma,"List(Kapil, Sharma)"
3,Niraj,Gupta,"List(Niraj, Gupta)"
4,Amit,Jain,"List(Amit, Jain)"


In [0]:
# Select variant 03: struct() receives a single list of Column objects
# (built with col()) instead of plain column-name strings.
df4 = dff.select(
    "*",
    struct([col("First_Name"), col("Last_Name")]).alias("Name"),
)
display(df4)

S.No,First_Name,Last_Name,Name
1,Marry,Terissa,"List(Marry, Terissa)"
2,Kapil,Sharma,"List(Kapil, Sharma)"
3,Niraj,Gupta,"List(Niraj, Gupta)"
4,Amit,Jain,"List(Amit, Jain)"


In [0]:
# Select variant 04: the column names live in a tuple that is unpacked
# into struct() as individual positional arguments.
cols = ("First_Name", "Last_Name")
df5 = dff.select(
    "*",
    struct(*cols).alias("Name"),
)
display(df5)

S.No,First_Name,Last_Name,Name
1,Marry,Terissa,"List(Marry, Terissa)"
2,Kapil,Sharma,"List(Kapil, Sharma)"
3,Niraj,Gupta,"List(Niraj, Gupta)"
4,Amit,Jain,"List(Amit, Jain)"


#### **How to convert StructType column into StringType using to_json?**

In [0]:
# Load the sample CSV with the header and inferSchema reader options enabled.
# NOTE: this rebinds `df`, replacing the small demo DataFrame created earlier.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/StructType-5.csv")
)
display(df)

Id,Nick_Name,First_Name,Last_Name,Type,Age
1,admin,John,Victor,Grade1,30
2,everest,Paul,Irish,Grade2,35
3,moon,Erram,Rammohan,Enginner1,29
4,service,Stalin,Rajesh,Minister,40
5,Builder,Golla,Rajasekar,Builder,43
6,Drinker,Karjala,Hari,Army,33
7,Army,Koyi,Damodar,Bettalian,37
8,Marketing,Vemparla,Harish,Manager,55
9,Politician,Devineni,Umesh,Senior,58
10,Minister,Ponguru,Narayana,Education,56


In [0]:
# Print the column names and types of the DataFrame loaded from the CSV file.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Nick_Name: string (nullable = true)
 |-- First_Name: string (nullable = true)
 |-- Last_Name: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Age: integer (nullable = true)



     # Select all columns and structure them into a single column named 'sales_msg'
     df_stru = df.select(f.struct('*').alias('sales_msg')).distinct()
     display(df_stru.limit(10))
                                (or)
     # Select all columns and structure them into a single column named 'Sales'
     sales_df_final = df.select(f.struct(
                                f.col('Id'),
                                f.col('Nick_Name'),
                                f.col('First_Name'),
                                f.col('Last_Name'),
                                f.col('Type'),
                                f.col('Age')
                                ).alias('Sales')
                              )

     display(sales_df_final)

In [0]:
# Wrap every column of `df` into one struct column named 'Sales'.
# The field order inside the struct follows the order of the names listed here.
sales_df_final = df.select(
    f.struct(
        *[
            f.col(field_name)
            for field_name in ('Id', 'Nick_Name', 'First_Name', 'Last_Name', 'Type', 'Age')
        ]
    ).alias('Sales')
)

display(sales_df_final)

Sales
"List(1, admin, John, Victor, Grade1, 30)"
"List(2, everest, Paul, Irish, Grade2, 35)"
"List(3, moon, Erram, Rammohan, Enginner1, 29)"
"List(4, service, Stalin, Rajesh, Minister, 40)"
"List(5, Builder, Golla, Rajasekar, Builder, 43)"
"List(6, Drinker, Karjala, Hari, Army, 33)"
"List(7, Army, Koyi, Damodar, Bettalian, 37)"
"List(8, Marketing, Vemparla, Harish, Manager, 55)"
"List(9, Politician, Devineni, Umesh, Senior, 58)"
"List(10, Minister, Ponguru, Narayana, Education, 56)"


In [0]:
# Serialise each 'Sales' struct to a JSON string and store it beside the
# struct in a new 'message' column.
df_final = sales_df_final.withColumn(
    colName='message',
    col=f.to_json(f.col('Sales')),
)
df_final.display()

Sales,message
"List(1, admin, John, Victor, Grade1, 30)","{""Id"":1,""Nick_Name"":""admin"",""First_Name"":""John"",""Last_Name"":""Victor"",""Type"":""Grade1"",""Age"":30}"
"List(2, everest, Paul, Irish, Grade2, 35)","{""Id"":2,""Nick_Name"":""everest"",""First_Name"":""Paul"",""Last_Name"":""Irish"",""Type"":""Grade2"",""Age"":35}"
"List(3, moon, Erram, Rammohan, Enginner1, 29)","{""Id"":3,""Nick_Name"":""moon"",""First_Name"":""Erram"",""Last_Name"":""Rammohan"",""Type"":""Enginner1"",""Age"":29}"
"List(4, service, Stalin, Rajesh, Minister, 40)","{""Id"":4,""Nick_Name"":""service"",""First_Name"":""Stalin"",""Last_Name"":""Rajesh"",""Type"":""Minister"",""Age"":40}"
"List(5, Builder, Golla, Rajasekar, Builder, 43)","{""Id"":5,""Nick_Name"":""Builder"",""First_Name"":""Golla"",""Last_Name"":""Rajasekar"",""Type"":""Builder"",""Age"":43}"
"List(6, Drinker, Karjala, Hari, Army, 33)","{""Id"":6,""Nick_Name"":""Drinker"",""First_Name"":""Karjala"",""Last_Name"":""Hari"",""Type"":""Army"",""Age"":33}"
"List(7, Army, Koyi, Damodar, Bettalian, 37)","{""Id"":7,""Nick_Name"":""Army"",""First_Name"":""Koyi"",""Last_Name"":""Damodar"",""Type"":""Bettalian"",""Age"":37}"
"List(8, Marketing, Vemparla, Harish, Manager, 55)","{""Id"":8,""Nick_Name"":""Marketing"",""First_Name"":""Vemparla"",""Last_Name"":""Harish"",""Type"":""Manager"",""Age"":55}"
"List(9, Politician, Devineni, Umesh, Senior, 58)","{""Id"":9,""Nick_Name"":""Politician"",""First_Name"":""Devineni"",""Last_Name"":""Umesh"",""Type"":""Senior"",""Age"":58}"
"List(10, Minister, Ponguru, Narayana, Education, 56)","{""Id"":10,""Nick_Name"":""Minister"",""First_Name"":""Ponguru"",""Last_Name"":""Narayana"",""Type"":""Education"",""Age"":56}"


In [0]:
# Schema used to parse the JSON strings back into a struct: the same six
# fields, in the same order, each declared non-nullable.
df_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ('Id', IntegerType()),
        ('Nick_Name', StringType()),
        ('First_Name', StringType()),
        ('Last_Name', StringType()),
        ('Type', StringType()),
        ('Age', IntegerType()),
    )
])

In [0]:
# Apply the from_json function on the JSON string column.
#
# The JSON string is rebuilt from `sales_df_final` instead of being read from
# the previous value of `df_final`. The original `df_final = df_final.select(...)`
# dropped the 'message' column it reads from, so running the cell a second time
# failed. Deriving from `sales_df_final` gives the same result on the first run
# and keeps the cell re-runnable.
df_final = (
    sales_df_final
    .withColumn('message', f.to_json('Sales'))
    .select(f.from_json('message', df_schema).alias('kafka_message'))
)
display(df_final)

kafka_message
"List(1, admin, John, Victor, Grade1, 30)"
"List(2, everest, Paul, Irish, Grade2, 35)"
"List(3, moon, Erram, Rammohan, Enginner1, 29)"
"List(4, service, Stalin, Rajesh, Minister, 40)"
"List(5, Builder, Golla, Rajasekar, Builder, 43)"
"List(6, Drinker, Karjala, Hari, Army, 33)"
"List(7, Army, Koyi, Damodar, Bettalian, 37)"
"List(8, Marketing, Vemparla, Harish, Manager, 55)"
"List(9, Politician, Devineni, Umesh, Senior, 58)"
"List(10, Minister, Ponguru, Narayana, Education, 56)"
