The aim of this notebook is to document the semantics of schema evolution for parquet files using both Spark and parquet2hive + Presto. Note that all parquet2hive does is read the schema *from the most recently created file*, so in some cases its behavior could be changed, without changing Presto's underlying facilities, by reading the schema from *all* files instead.

In this notebook I am running Spark locally, and using a remote Presto cluster. To connect to this cluster I'm using parquet2hive_server [0], which is just a simple API for parquet2hive on the remote cluster. To run this notebook successfully, you'll need the server running on the cluster and the correct hostname set in parquet2hive_server/settings.py.

[0] http://www.github.com/fbertsch/parquet2hive_server

In [90]:
from parquet2hive_server.client import Parquet2HiveClient
from pyhive import presto
from pprint import pprint

# Client for parquet2hive_server; used below via client.load(...).
client = Parquet2HiveClient()

In [91]:
# Base S3 location; each experiment appends its version suffix (v1, v2, ...).
dataset = "s3://telemetry-test-bucket/schema_evolution/"
# Path fragment for the `type` partition directory; the partition value is
# appended to it when writing.
partition = '/type='

In [92]:
# Open a pyhive connection to the remote Presto host and create the cursor
# that is passed to get_schema() and execute() in the cells below.
conn = presto.connect(host='ec2-54-214-108-235.us-west-2.compute.amazonaws.com', port='8889')
cursor = conn.cursor()

def get_schema(_cursor, _v):
    """Return the Presto table schema formatted like Spark's ``printSchema()``.

    Runs ``describe schema_evolution_<_v>`` on ``_cursor`` and renders one
    `` |-- <column>: <type>`` line per returned row beneath a ``root`` header.

    Args:
        _cursor: DB-API style cursor exposing ``execute`` and ``fetchall``.
        _v: Version suffix appended to the ``schema_evolution_`` table name.

    Returns:
        The formatted lines joined by a backslash followed by a newline.
        Nothing is printed here; callers print the returned string.
    """
    _cursor.execute('describe schema_evolution_{}'.format(_v))
    lines = ['root']
    for row in _cursor.fetchall():
        # Only the first two fields (name, type) are rendered. Indexing them,
        # rather than unpacking exactly three values, keeps this working when
        # DESCRIBE returns a different number of trailing columns.
        lines.append(' |-- {}: {}'.format(row[0], row[1]))
    return '\\\n'.join(lines)

def execute(_cursor, _query):
    """Run ``_query`` and return its result rows as ``name=value`` text.

    Column names are taken from the first element of each entry in
    ``_cursor.description``. Every row becomes a comma-separated list of
    ``name=value`` pairs, and rows are joined by a backslash followed by a
    newline.

    Args:
        _cursor: DB-API style cursor exposing ``execute``, ``fetchall`` and
            ``description``.
        _query: SQL text handed to ``_cursor.execute`` unchanged.

    Returns:
        The formatted result set as a single string.
    """
    _cursor.execute(_query)
    rows = _cursor.fetchall()
    names = [column[0] for column in _cursor.description]

    rendered = []
    for row in rows:
        pairs = ['{}={}'.format(name, value) for name, value in zip(names, row)]
        rendered.append(', '.join(pairs))
    return '\\\n'.join(rendered)

## Adding a Column

In [93]:
# Version suffix: selects the S3 sub-directory (dataset + v) and the
# schema_evolution_<v> Presto table used in this section.
v = 'v1'

In [94]:
# Partition type=1: two rows with a single `id` column.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '1')

In [95]:
# Partition type=2: the same ids, plus a new `score` column.
rdd = sc.parallelize([(0,0),(1,1)], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'score'])
df.write.parquet(dataset + v + partition  + '2')

### Spark

In [96]:
# Schema Spark reports when the mergeSchema option is not set.
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- type: string (nullable = true)



In [97]:
# Schema Spark reports with the mergeSchema option set to "true".
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [98]:
# With mergeSchema on, select `score` from the type=1 partition, which was
# written without that column.
df = sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet')
df.filter(df['type'] == '1').select('score').collect()

[Row(score=None), Row(score=None)]

### Presto

In [99]:
# Load this dataset version through parquet2hive_server. Presumably this
# (re)creates the schema_evolution_<v> table queried below; confirm against
# the server implementation.
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [100]:
# Show the Presto-side schema. print(...) with a single argument behaves the
# same on Python 2 and 3 and matches the other print(...) cells.
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- score: bigint\
 |-- type: varchar


In [101]:
# Dump every row Presto returns for this version. print(...) with a single
# argument behaves the same on Python 2 and 3.
print(execute(cursor, "SELECT * FROM schema_evolution_{}".format(v)))

id=0, score=None, type=1\
id=1, score=None, type=1\
id=0, score=0, type=2\
id=1, score=1, type=2


## Removing a Column

In [102]:
# Version suffix for the "Removing a Column" experiment.
v = 'v2'

# Partition type=1: two rows with `id` and `score` columns.
rdd = sc.parallelize([(0,0),(1,1)], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'score'])
df.write.parquet(dataset + v + partition  + '1')

In [103]:
# Partition type=2: the same ids, written without the `score` column.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [104]:
# Schema Spark reports when the mergeSchema option is not set.
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [105]:
# Schema Spark reports with the mergeSchema option set to "true".
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [106]:
# Without mergeSchema, select `score` from the type=2 partition, which was
# written without that column.
df = sqlContext.read.load(dataset + v, 'parquet')
df.filter(df['type'] == '2').select('score').collect()

[Row(score=None), Row(score=None)]

In [107]:
# Same selection as the previous cell, this time with mergeSchema on.
df = sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet')
df.filter(df['type'] == '2').select('score').collect()

[Row(score=None), Row(score=None)]

### Presto

In [108]:
# Load this dataset version through parquet2hive_server. Presumably this
# (re)creates the schema_evolution_<v> table queried below; confirm against
# the server implementation.
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [109]:
# Show the Presto-side schema. print(...) with a single argument behaves the
# same on Python 2 and 3 and matches the other print(...) cells.
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- type: varchar


In [110]:
# Dump every row Presto returns for this version. print(...) with a single
# argument behaves the same on Python 2 and 3.
print(execute(cursor, "SELECT * FROM schema_evolution_{}".format(v)))

id=0, type=2\
id=1, type=2\
id=0, type=1\
id=1, type=1


## Renaming a Column

In [111]:
# Version suffix for the "Renaming a Column" experiment.
v = 'v3'

In [112]:
# Partition type=1: two rows with a single `id` column.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '1')

In [113]:
# Partition type=2: the same values, but the column is named `score`
# instead of `id`.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['score'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [114]:
# Schema Spark reports when the mergeSchema option is not set.
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- type: string (nullable = true)



In [115]:
# Schema Spark reports with the mergeSchema option set to "true".
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [116]:
# Without mergeSchema, select `id` from the type=2 partition, which was
# written with only a `score` column.
df = sqlContext.read.load(dataset + v, 'parquet')
df.filter(df['type'] == '2').select('id').collect()

[Row(id=None), Row(id=None)]

In [117]:
# With mergeSchema on, collect every row from both partitions.
df = sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet')
df.collect()

[Row(id=0, score=None, type=u'1'),
 Row(id=1, score=None, type=u'1'),
 Row(id=None, score=0, type=u'2'),
 Row(id=None, score=1, type=u'2')]

### Presto

In [118]:
# Load this dataset version through parquet2hive_server. Presumably this
# (re)creates the schema_evolution_<v> table queried below; confirm against
# the server implementation.
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [119]:
# Show the Presto-side schema. print(...) with a single argument behaves the
# same on Python 2 and 3 and matches the other print(...) cells.
print(get_schema(cursor, v))

root\
 |-- score: bigint\
 |-- type: varchar


In [120]:
# Dump every row Presto returns for this version. print(...) with a single
# argument behaves the same on Python 2 and 3.
print(execute(cursor, "SELECT * FROM schema_evolution_{}".format(v)))

score=None, type=1\
score=None, type=1\
score=0, type=2\
score=1, type=2


## Replace Column

Note that this is similar to "rename column", but the new data has a different type.

In [121]:
# Version suffix for the "Replace Column" experiment.
v = 'v4'

# Partition type=1: two rows with a single integer `id` column.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '1')

In [122]:
# Partition type=2: a `score` column holding strings, replacing the integer
# `id` column.
rdd = sc.parallelize([('a',),('b',)], 1)
df = sqlContext.createDataFrame(rdd, ['score'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [123]:
# Schema Spark reports when the mergeSchema option is not set.
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- type: string (nullable = true)



In [124]:
# Schema Spark reports with the mergeSchema option set to "true".
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: string (nullable = true)
 |-- type: string (nullable = true)



In [125]:
# Without mergeSchema, select `id` from the type=2 partition, which was
# written with only a `score` column.
df = sqlContext.read.load(dataset + v, 'parquet')
df.filter(df['type'] == '2').select('id').collect()

[Row(id=None), Row(id=None)]

In [126]:
# With mergeSchema on, collect every row from both partitions.
df = sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet')
df.collect()

[Row(id=0, score=None, type=u'1'),
 Row(id=1, score=None, type=u'1'),
 Row(id=None, score=u'a', type=u'2'),
 Row(id=None, score=u'b', type=u'2')]

### Presto

In [127]:
# Load this dataset version through parquet2hive_server. Presumably this
# (re)creates the schema_evolution_<v> table queried below; confirm against
# the server implementation.
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [128]:
# Show the Presto-side schema. print(...) with a single argument behaves the
# same on Python 2 and 3 and matches the other print(...) cells.
print(get_schema(cursor, v))

root\
 |-- score: varchar\
 |-- type: varchar


In [129]:
# Dump every row Presto returns for this version. print(...) with a single
# argument behaves the same on Python 2 and 3.
print(execute(cursor, "SELECT * FROM schema_evolution_{}".format(v)))

score=a, type=2\
score=b, type=2\
score=None, type=1\
score=None, type=1


## Transpose Columns

In [130]:
# Version suffix for the column-transposition experiment.
v = 'v5'

# Partition type=1: columns ordered id, transpose_a, transpose_b.
rdd = sc.parallelize([(0,'a','b')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'transpose_a', 'transpose_b'])
df.write.parquet(dataset + v + partition + '1')

In [131]:
# Partition type=2: transpose_b now precedes transpose_a. Each value still
# matches the suffix of its column name ('b' in transpose_b, 'a' in
# transpose_a).
rdd = sc.parallelize([(1,'b','a')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'transpose_b', 'transpose_a'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [132]:
# Schema Spark reports when the mergeSchema option is not set.
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- transpose_a: string (nullable = true)
 |-- transpose_b: string (nullable = true)
 |-- type: string (nullable = true)



In [133]:
# Without mergeSchema, read both transposed columns by name from all rows.
sqlContext.read.load(dataset + v, 'parquet').select('transpose_a','transpose_b').collect()

[Row(transpose_a=u'a', transpose_b=u'b'),
 Row(transpose_a=u'a', transpose_b=u'b')]

In [134]:
# Schema Spark reports with the mergeSchema option set to "true".
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- transpose_a: string (nullable = true)
 |-- transpose_b: string (nullable = true)
 |-- type: string (nullable = true)



In [135]:
# With mergeSchema on, read both transposed columns by name from all rows.
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').select('transpose_a','transpose_b').collect()

[Row(transpose_a=u'a', transpose_b=u'b'),
 Row(transpose_a=u'a', transpose_b=u'b')]

### Presto

In [136]:
# Load this dataset version through parquet2hive_server. Presumably this
# (re)creates the schema_evolution_<v> table queried below; confirm against
# the server implementation.
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [137]:
# Show the Presto-side schema for this version.
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- transpose_b: varchar\
 |-- transpose_a: varchar\
 |-- type: varchar


In [138]:
# Dump every row Presto returns for this version. print(...) with a single
# argument behaves the same on Python 2 and 3.
print(execute(cursor, "SELECT * FROM schema_evolution_{}".format(v)))

id=1, transpose_b=b, transpose_a=a, type=2\
id=0, transpose_b=b, transpose_a=a, type=1


## Transpose, Delete and Add Columns

In [139]:
# Version suffix for the combined transpose / delete / add experiment.
v = 'v6'

# Partition type=1: columns id, removed, transposed.
rdd = sc.parallelize([(0,'r','t')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'removed', 'transposed'])
df.write.parquet(dataset + v + partition + '1')

In [140]:
# Partition type=2: `removed` is dropped, `added` is new, and `transposed`
# moves from third to second position.
rdd = sc.parallelize([(1,'t','a')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'transposed', 'added'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [141]:
# Schema Spark reports when the mergeSchema option is not set.
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- removed: string (nullable = true)
 |-- transposed: string (nullable = true)
 |-- type: string (nullable = true)



In [142]:
# Without mergeSchema, collect every row from both partitions.
sqlContext.read.load(dataset + v, 'parquet').collect()

[Row(id=0, removed=u'r', transposed=u't', type=u'1'),
 Row(id=1, removed=None, transposed=u't', type=u'2')]

In [143]:
# Schema Spark reports with the mergeSchema option set to "true".
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- removed: string (nullable = true)
 |-- transposed: string (nullable = true)
 |-- added: string (nullable = true)
 |-- type: string (nullable = true)



In [144]:
# With mergeSchema on, collect every row from both partitions.
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').collect()

[Row(id=0, removed=u'r', transposed=u't', added=None, type=u'1'),
 Row(id=1, removed=None, transposed=u't', added=u'a', type=u'2')]

### Presto

In [145]:
# Load this dataset version through parquet2hive_server. Presumably this
# (re)creates the schema_evolution_<v> table queried below; confirm against
# the server implementation.
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [146]:
# Show the Presto-side schema for this version.
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- transposed: varchar\
 |-- added: varchar\
 |-- type: varchar


In [147]:
# Dump every row Presto returns for this version. print(...) with a single
# argument behaves the same on Python 2 and 3.
print(execute(cursor, "SELECT * FROM schema_evolution_{}".format(v)))

id=0, transposed=t, added=None, type=1\
id=1, transposed=t, added=a, type=2
