In [1]:
import avro.schema

In [2]:
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

# Parse the Avro schema definition. The `with` block closes the .avsc file
# handle as soon as its text has been read instead of leaking it.
with open('user.avsc', "r") as schema_file:
    schema = avro.schema.Parse(schema_file.read())

print(schema)

# Write two records to users.avro. The first record omits "favorite_color".
# The surrounding `with` guarantees the file is closed even if an append
# raises; writer.close() is still called explicitly so buffered data is
# flushed before the file handle goes away.
with open("users.avro", "wb") as out_file:
    writer = DataFileWriter(out_file, DatumWriter(), schema)
    writer.append({"name": "Alyssa", "favorite_number": 256})
    writer.append({"name": "Ben", "favorite_number": 7, "favorite_color": "red"})
    writer.close()

# Read the records back and print each one; the file is closed on any exit
# path from the block.
with open("users.avro", "rb") as in_file:
    reader = DataFileReader(in_file, DatumReader())
    for user in reader:
        print(user)
    reader.close()

{"type": "record", "name": "User", "namespace": "example.avro", "fields": [{"type": "string", "name": "name"}, {"type": ["int", "null"], "name": "favorite_number"}, {"type": ["string", "null"], "name": "favorite_color"}]}
{'name': 'Alyssa', 'favorite_number': 256, 'favorite_color': None}
{'name': 'Ben', 'favorite_number': 7, 'favorite_color': 'red'}


In [3]:
# Create (or reuse) the Spark session used by the rest of the notebook
import os

from pyspark.sql import SparkSession

# Submit arguments request the spark-avro package; this is set before the
# session is created below.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'

spark = (
    SparkSession.builder
    .appName("Avro")
    .getOrCreate()
)

In [4]:
# Load users.avro through the "avro" data source and preview the rows
df = (
    spark.read
    .format("avro")
    .load("users.avro")
)
df.show()

+------+---------------+--------------+
|  name|favorite_number|favorite_color|
+------+---------------+--------------+
|Alyssa|            256|          null|
|   Ben|              7|           red|
+------+---------------+--------------+



In [5]:
# Project two columns and save them in Avro format, overwriting any
# existing output at the target path.
(
    df.select("name", "favorite_color")
    .write
    .mode("overwrite")
    .format("avro")
    .save("namesAndFavColors.avro")
)

In [6]:
# Read the two-column Avro output back in and display it.
# Note: this rebinds `df`.
df = (
    spark.read
    .format("avro")
    .load("namesAndFavColors.avro")
)
df.show()

+------+--------------+
|  name|favorite_color|
+------+--------------+
|Alyssa|          null|
|   Ben|           red|
+------+--------------+



In [8]:
# Sample rows in the order (year, month, title, rating).
data = [
    (2012, 8, "Batman", 9.8),
    (2012, 8, "Hero", 8.7),
    (2012, 7, "Robot", 5.5),
    (2011, 7, "Git", 2.0),
]

df = spark.createDataFrame(data, ["year", "month", "title", "rating"])
df.show()

# Parse the rating schema; the `with` block closes the .avsc file handle
# instead of leaking it.
# NOTE(review): `schema` is rebound here but is not passed to the Spark
# writer below -- confirm whether it was meant to be supplied to the write.
with open('rating.avsc', "r") as schema_file:
    schema = avro.schema.Parse(schema_file.read())

# Save as Avro, partitioned by year and month, replacing any previous output.
df.write.mode("overwrite").partitionBy("year", "month").format("avro").save("rating")


+----+-----+------+------+
|year|month| title|rating|
+----+-----+------+------+
|2012|    8|Batman|   9.8|
|2012|    8|  Hero|   8.7|
|2012|    7| Robot|   5.5|
|2011|    7|   Git|   2.0|
+----+-----+------+------+



In [9]:
# Print the column names, types and nullability of the DataFrame currently
# bound to `df`.
df.printSchema()

root
 |-- year: long (nullable = true)
 |-- month: long (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: double (nullable = true)

