#### **to_avro**

- to **encode** a column as **binary in Avro format**.
- **to_avro** is used to **convert** a Spark **DataFrame column** to **Avro binary format**.
- can be used to turn **structs into Avro records**. This method is particularly useful when you would like to **re-encode multiple columns into a single** one when writing data out to Kafka.

     # Encode the column `name` in Avro format.
     output = df.select(to_avro("user.name").alias("value"))

In [0]:
from pyspark.sql.avro.functions import from_avro, to_avro
from pyspark.sql.functions import col, struct

In [0]:
# Column names, listed in the same order as the fields of each tuple in `data`.
columns = ["id", "name", "salary", "city"]

# Ten sample employee records: (id, name, salary, city).
data = [
    (1, "Arjun", 50000, "Nasik"),
    (2, "Bibin", 60000, "San Francisco"),
    (3, "Charu", 70000, "Los Angeles"),
    (4, "Anand", 55500, "Chennai"),
    (5, "Bhaskar", 65550, "Pondichery"),
    (6, "Chandan", 85500, "Bangalore"),
    (7, "Sai", 25000, "Hyderabad"),
    (8, "Rakesh", 35000, "Delhi"),
    (9, "Kiran", 95000, "Amaravati"),
    (10, "Rajini", 45000, "Dubai"),
]

# Only column names are supplied, so the column types come from the Python values.
df = spark.createDataFrame(data=data, schema=columns)
display(df)

id,name,salary,city
1,Arjun,50000,Nasik
2,Bibin,60000,San Francisco
3,Charu,70000,Los Angeles
4,Anand,55500,Chennai
5,Bhaskar,65550,Pondichery
6,Chandan,85500,Bangalore
7,Sai,25000,Hyderabad
8,Rakesh,35000,Delhi
9,Kiran,95000,Amaravati
10,Rajini,45000,Dubai


**Convert a Column to Avro Format**
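- The Avro-encoded **binary data** is shown by `display` as a **Base64-encoded string** (a printable representation of the raw bytes, e.g. `AApBcmp1bg==`), not as the original readable value.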

In [0]:
# Encode only the `name` column as Avro binary. The encoded bytes are added as
# a new `encode_col` column next to the original columns.
df_avro_col = df.withColumn("encode_col", to_avro(col("name")))
display(df_avro_col)

id,name,salary,city,encode_col
1,Arjun,50000,Nasik,AApBcmp1bg==
2,Bibin,60000,San Francisco,AApCaWJpbg==
3,Charu,70000,Los Angeles,AApDaGFydQ==
4,Anand,55500,Chennai,AApBbmFuZA==
5,Bhaskar,65550,Pondichery,AA5CaGFza2Fy
6,Chandan,85500,Bangalore,AA5DaGFuZGFu
7,Sai,25000,Hyderabad,AAZTYWk=
8,Rakesh,35000,Delhi,AAxSYWtlc2g=
9,Kiran,95000,Amaravati,AApLaXJhbg==
10,Rajini,45000,Dubai,AAxSYWppbmk=


**Convert Entire Row into Avro Format**
- Instead of just one column, you can convert the **entire row into Avro**.
- **struct("*")** => Packs **all columns** into a **single struct column**; **to_avro** then encodes that struct as a **single Avro-encoded structure** (one Avro record per row).

In [0]:
# Pack every column of the row into one struct column, then Avro-encode that
# struct so each row yields a single binary record in `encode_row`.
# `struct` is imported in the notebook's import cell at the top, so this cell
# no longer carries its own mid-notebook import.
df_avro_row = df.withColumn("encode_row", to_avro(struct("*")))
display(df_avro_row)

id,name,salary,city,encode_row
1,Arjun,50000,Nasik,AAIACkFyanVuAKCNBgAKTmFzaWs=
2,Bibin,60000,San Francisco,AAQACkJpYmluAMCpBwAaU2FuIEZyYW5jaXNjbw==
3,Charu,70000,Los Angeles,AAYACkNoYXJ1AODFCAAWTG9zIEFuZ2VsZXM=
4,Anand,55500,Chennai,AAgACkFuYW5kAJjjBgAOQ2hlbm5haQ==
5,Bhaskar,65550,Pondichery,AAoADkJoYXNrYXIAnIAIABRQb25kaWNoZXJ5
6,Chandan,85500,Bangalore,AAwADkNoYW5kYW4A+LcKABJCYW5nYWxvcmU=
7,Sai,25000,Hyderabad,AA4ABlNhaQDQhgMAEkh5ZGVyYWJhZA==
8,Rakesh,35000,Delhi,ABAADFJha2VzaADwogQACkRlbGhp
9,Kiran,95000,Amaravati,ABIACktpcmFuALDMCwASQW1hcmF2YXRp
10,Rajini,45000,Dubai,ABQADFJhamluaQCQvwUACkR1YmFp


In [0]:
# Persist the DataFrame (original columns plus `encode_row`) as Avro files.
# "overwrite" replaces whatever is already stored at the target directory.
(
    df_avro_row.write
    .format("avro")
    .mode("overwrite")
    .save("/FileStore/tables/avro/employee_avro_data")
)

In [0]:
%fs ls /FileStore/tables/avro/employee_avro_data

path,name,size,modificationTime
dbfs:/FileStore/tables/avro/employee_avro_data/_committed_4540589104069149071,_committed_4540589104069149071,1567,1740155537000
dbfs:/FileStore/tables/avro/employee_avro_data/_committed_5384811143179424437,_committed_5384811143179424437,1559,1740153605000
dbfs:/FileStore/tables/avro/employee_avro_data/_committed_8514726851004921813,_committed_8514726851004921813,1570,1740149810000
dbfs:/FileStore/tables/avro/employee_avro_data/_started_4540589104069149071,_started_4540589104069149071,0,1740155536000
dbfs:/FileStore/tables/avro/employee_avro_data/part-00000-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-154-1-c000.snappy.avro,part-00000-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-154-1-c000.snappy.avro,382,1740155537000
dbfs:/FileStore/tables/avro/employee_avro_data/part-00001-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-155-1-c000.snappy.avro,part-00001-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-155-1-c000.snappy.avro,390,1740155536000
dbfs:/FileStore/tables/avro/employee_avro_data/part-00002-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-156-1-c000.snappy.avro,part-00002-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-156-1-c000.snappy.avro,388,1740155537000
dbfs:/FileStore/tables/avro/employee_avro_data/part-00003-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-157-1-c000.snappy.avro,part-00003-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-157-1-c000.snappy.avro,417,1740155536000
dbfs:/FileStore/tables/avro/employee_avro_data/part-00004-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-158-1-c000.snappy.avro,part-00004-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-158-1-c000.snappy.avro,388,1740155536000
dbfs:/FileStore/tables/avro/employee_avro_data/part-00005-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-159-1-c000.snappy.avro,part-00005-tid-4540589104069149071-0300496f-0f56-4989-b549-d83183941916-159-1-c000.snappy.avro,385,1740155536000


In [0]:
# Load the whole Avro directory back into a single DataFrame.
# The rows are displayed in a different order than they were written.
df_read = (
    spark.read
    .format("avro")
    .load("/FileStore/tables/avro/employee_avro_data")
)
display(df_read)

id,name,salary,city,encode_row
4,Anand,55500,Chennai,AAgACkFuYW5kAJjjBgAOQ2hlbm5haQ==
5,Bhaskar,65550,Pondichery,AAoADkJoYXNrYXIAnIAIABRQb25kaWNoZXJ5
9,Kiran,95000,Amaravati,ABIACktpcmFuALDMCwASQW1hcmF2YXRp
10,Rajini,45000,Dubai,ABQADFJhamluaQCQvwUACkR1YmFp
2,Bibin,60000,San Francisco,AAQACkJpYmluAMCpBwAaU2FuIEZyYW5jaXNjbw==
3,Charu,70000,Los Angeles,AAYACkNoYXJ1AODFCAAWTG9zIEFuZ2VsZXM=
6,Chandan,85500,Bangalore,AAwADkNoYW5kYW4A+LcKABJCYW5nYWxvcmU=
7,Sai,25000,Hyderabad,AA4ABlNhaQDQhgMAEkh5ZGVyYWJhZA==
8,Rakesh,35000,Delhi,ABAADFJha2VzaADwogQACkRlbGhp
1,Arjun,50000,Nasik,AAIACkFyanVuAKCNBgAKTmFzaWs=


In [0]:
# Read each Avro part file on its own to show how the rows were split across
# the files produced by the write above.
#
# The part-file names embed a transaction id that changes every time the
# directory is overwritten, so they are discovered at run time instead of being
# hard-coded; hard-coded names stop existing after the next write.
#
# `dbutils` is the Databricks utilities object provided by the notebook runtime.
avro_dir = "dbfs:/FileStore/tables/avro/employee_avro_data"

# Keep only the data files (part-*.avro); skip the _started_/_committed_ markers.
# Sorting by path visits the files in part-number order (part-00000, part-00001, ...).
part_paths = sorted(
    file_info.path
    for file_info in dbutils.fs.ls(avro_dir)
    if file_info.name.startswith("part-") and file_info.name.endswith(".avro")
)

# One DataFrame per part file; part_dfs[0] plays the role of the former
# df_read1, part_dfs[1] of df_read2, and so on.
part_dfs = [spark.read.format("avro").load(path) for path in part_paths]

for part_df in part_dfs:
    display(part_df)

id,name,salary,city,encode_row
1,Arjun,50000,Nasik,AAIACkFyanVuAKCNBgAKTmFzaWs=


id,name,salary,city,encode_row
2,Bibin,60000,San Francisco,AAQACkJpYmluAMCpBwAaU2FuIEZyYW5jaXNjbw==


id,name,salary,city,encode_row
3,Charu,70000,Los Angeles,AAYACkNoYXJ1AODFCAAWTG9zIEFuZ2VsZXM=


id,name,salary,city,encode_row
4,Anand,55500,Chennai,AAgACkFuYW5kAJjjBgAOQ2hlbm5haQ==
5,Bhaskar,65550,Pondichery,AAoADkJoYXNrYXIAnIAIABRQb25kaWNoZXJ5


id,name,salary,city,encode_row
6,Chandan,85500,Bangalore,AAwADkNoYW5kYW4A+LcKABJCYW5nYWxvcmU=


id,name,salary,city,encode_row
7,Sai,25000,Hyderabad,AA4ABlNhaQDQhgMAEkh5ZGVyYWJhZA==


id,name,salary,city,encode_row
8,Rakesh,35000,Delhi,ABAADFJha2VzaADwogQACkRlbGhp


id,name,salary,city,encode_row
9,Kiran,95000,Amaravati,ABIACktpcmFuALDMCwASQW1hcmF2YXRp
10,Rajini,45000,Dubai,ABQADFJhamluaQCQvwUACkR1YmFp
