# Schema Evolution

In [1]:
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable

# Build the Spark configuration for a Delta Lake session that stores data in
# an S3-compatible object store (via S3A) and uses a remote Hive metastore.
#
# The S3A credentials are read from the environment instead of being written
# into the notebook, so that sharing or committing the notebook does not leak
# them. Export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting the
# kernel; S3A_ENDPOINT is optional and falls back to the original endpoint.
_missing_env = [
    env_name
    for env_name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not os.environ.get(env_name)
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s) for S3A access: "
        + ", ".join(_missing_env)
    )

conf = SparkConf()

conf.setAppName("Sample Schema Evolution")
conf.set("spark.hadoop.fs.s3a.endpoint", os.environ.get("S3A_ENDPOINT", "http://172.21.121.140:9000"))
conf.set("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
conf.set("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
# Configuration values are strings; pass "true" explicitly rather than relying
# on the Python bool being stringified as "True".
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
# Enable the Delta Lake SQL extension and catalog implementation.
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
conf.set("hive.metastore.uris", "thrift://metastore:9083")

# getOrCreate() reuses an already running session if one exists.
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

In [2]:
# Seed rows, one tuple per person, in the column order declared by `schema`.
data2 = [
    ("James", "Smith", "M", 3000),
    ("Michael", "Rose", "M", 6000),
    ("Robert", "Willians", "M", 5500),
    ("Maria", "Anne", "F", 7000),
]

# All four columns are nullable strings. Note that `salary` is declared as
# StringType even though the rows above carry integer values.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("firsname", "lastname", "gender", "salary")
    ]
)

df = spark.createDataFrame(data=data2, schema=schema)

df.show()

+--------+--------+------+------+
|firsname|lastname|gender|salary|
+--------+--------+------+------+
|   James|   Smith|     M|  3000|
| Michael|    Rose|     M|  6000|
|  Robert|Willians|     M|  5500|
|   Maria|    Anne|     F|  7000|
+--------+--------+------+------+



In [4]:
# Persist the initial four-column frame to the bronze Delta table.
# The write uses append mode, so running this cell again adds the rows again.
(
    df.write
    .format("delta")
    .mode("append")
    .save('s3a://bronze/tb_schema_evolution')
)

In [6]:
# Read the Delta table back and display it.
# show() prints the rows and returns None, so it must not be part of the
# assignment; otherwise `df` would end up bound to None instead of a DataFrame.
df = spark.read.format("delta").load('s3a://bronze/tb_schema_evolution')
df.show()

+--------+--------+------+------+
|firsname|lastname|gender|salary|
+--------+--------+------+------+
|  Robert|Willians|     M|  5500|
| Michael|    Rose|     M|  6000|
|   James|   Smith|     M|  3000|
|   Maria|    Anne|     F|  7000|
+--------+--------+------+------+



## Add new data with new schema

In [3]:
# One additional row that carries a fifth value for the new `age` column.
new_data = [("Wallace", "Camargo", "M", 3000, 35)]

# Same four nullable string columns as the first schema, followed by a
# nullable integer `age` column.
new_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("firsname", "lastname", "gender", "salary")
    ]
    + [StructField("age", IntegerType(), True)]
)

df_new = spark.createDataFrame(data=new_data, schema=new_schema)

df_new.show()

+--------+--------+------+------+---+
|firsname|lastname|gender|salary|age|
+--------+--------+------+------+---+
| Wallace| Camargo|     M|  3000| 35|
+--------+--------+------+------+---+



## Write data with schema evolution 

In [8]:
# Append the five-column frame to the same Delta table. The mergeSchema
# option is set so that the extra `age` column is added to the table schema.
(
    df_new.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save('s3a://bronze/tb_schema_evolution')
)

In [9]:
# Read the table again after the schema-evolving append and display it.
# show() returns None, so the DataFrame is assigned first and displayed in a
# separate statement to keep `df` usable afterwards.
df = spark.read.format("delta").load('s3a://bronze/tb_schema_evolution')
df.show()

+--------+--------+------+------+----+
|firsname|lastname|gender|salary| age|
+--------+--------+------+------+----+
| Wallace| Camargo|     M|  3000|  35|
|  Robert|Willians|     M|  5500|null|
| Michael|    Rose|     M|  6000|null|
|   James|   Smith|     M|  3000|null|
|   Maria|    Anne|     F|  7000|null|
+--------+--------+------+------+----+

