The aim of this notebook is to document the semantics of schema evolution of parquet files using both Spark and parquet2hive + Presto. Note that all parquet2hive is doing is reading the schema *from the most recently created file*, so in some cases the behavior shown here could be changed, without changing Presto's underlying facilities, by having it read *all* files.

In this notebook I am running Spark locally, and using a remote Presto cluster. To connect to this cluster I'm using parquet2hive_server [0], which is just a simple API for parquet2hive on the remote cluster. To run this notebook successfully, you'll need the server running on the cluster, and have the correct hostname in parquet2hive_server/settings.py.

[0] http://www.github.com/fbertsch/parquet2hive_server

In [1]:
from parquet2hive_server.client import Parquet2HiveClient
from pyhive import presto
from pprint import pprint

# Client for the parquet2hive_server API described in the introduction; per
# that note, the target hostname comes from parquet2hive_server/settings.py.
client = Parquet2HiveClient()

In [2]:
# Root S3 prefix under which every experiment below writes its parquet files.
dataset = "s3://telemetry-test-bucket/schema_evolution/"
# Path fragment completed with '1' or '2' by each write, giving directories
# such as <dataset><version>/type=1; the reads below surface it as a `type` column.
partition = '/type='

In [3]:
# Connection to the remote Presto cluster. The hostname is hard-coded to the
# cluster this notebook was originally run against.
conn = presto.connect(host='ec2-54-218-111-221.us-west-2.compute.amazonaws.com', port='8889')
# Single cursor shared by every Presto query in this notebook.
cursor = conn.cursor()

def get_schema(_cursor, _v):
    """Return a table's schema formatted like Spark's ``DataFrame.printSchema()``.

    Runs ``describe schema_evolution_<_v>`` on ``_cursor`` and renders one
    `` |-- <column>: <type>`` line per returned row beneath a ``root`` header.

    Args:
        _cursor: DB-API style cursor exposing ``execute`` and ``fetchall``.
        _v: Version suffix of the table to describe, e.g. ``'v1'``.

    Returns:
        The schema as a single newline-separated string (no trailing newline).
    """
    # The table name is interpolated directly; _v is expected to be one of the
    # trusted version labels defined in this notebook.
    _cursor.execute('describe schema_evolution_{}'.format(_v))
    # Only the first two fields of each row (name, type) are used. Indexing
    # instead of unpacking keeps this working regardless of how many extra
    # fields each DESCRIBE row carries.
    lines = ['root']
    lines.extend(' |-- {}: {}'.format(row[0], row[1]) for row in _cursor.fetchall())
    # Plain newline separator: the earlier backslash-plus-newline separator
    # left a literal backslash at the end of every line, which printSchema()
    # output does not have.
    return '\n'.join(lines)

## Adding a Column

In [4]:
# Version label: selects both the S3 sub-path and the Presto table suffix.
v = 'v1'

# First partition (type=1): two rows with only an `id` column.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '1')

In [8]:
# Second partition (type=2): the same `id` column plus the newly added `score`.
rdd = sc.parallelize([(0,0),(1,1)], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'score'])
df.write.parquet(dataset + v + partition  + '2')

### Spark

In [9]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- type: string (nullable = true)



In [10]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [11]:
df = sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet')
df.filter(df['type'] == '1').select('score').collect()

[Row(score=None), Row(score=None)]

### Presto

In [12]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [13]:
# Schema Presto reports for this version's table. Uses the function-call form
# of print, which runs on both Python 2 and Python 3.
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- score: bigint\
 |-- type: varchar


In [14]:
# Rows from the older partition, written before `score` existed, come back as NULL.
cursor.execute("SELECT score FROM schema_evolution_{} WHERE type = '1'".format(v))
cursor.fetchall()

[(None,), (None,)]

## Removing a Column

In [15]:
# Version label for the "removing a column" experiment.
v = 'v2'

# First partition (type=1): two rows with both `id` and `score`.
rdd = sc.parallelize([(0,0),(1,1)], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'score'])
df.write.parquet(dataset + v + partition  + '1')

In [16]:
# Second partition (type=2): `score` has been dropped, only `id` remains.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [17]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [18]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [19]:
df = sqlContext.read.load(dataset + v, 'parquet')
df.filter(df['type'] == '2').select('score').collect()

[Row(score=None), Row(score=None)]

In [20]:
df = sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet')
df.filter(df['type'] == '2').select('score').collect()

[Row(score=None), Row(score=None)]

### Presto

In [21]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [22]:
# Schema Presto reports for this version's table. Uses the function-call form
# of print, which runs on both Python 2 and Python 3.
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- type: varchar


In [23]:
# The table schema has no `score` column, so the `score` values written to the
# older type=1 partition are not part of the result.
cursor.execute("SELECT * FROM schema_evolution_{} WHERE type = '1'".format(v))
cursor.fetchall()

[(0, u'1'), (1, u'1')]

## Renaming a Column

In [24]:
# Version label for the "renaming a column" experiment.
v = 'v3'

# First partition (type=1): the single data column is named `id`.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '1')

In [25]:
# Second partition (type=2): same values, but the column is now named `score`.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['score'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [26]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- type: string (nullable = true)



In [27]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [28]:
df = sqlContext.read.load(dataset + v, 'parquet')
df.filter(df['type'] == '2').select('id').collect()

[Row(id=None), Row(id=None)]

In [29]:
df = sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet')
df.collect()

[Row(id=0, score=None, type=u'1'),
 Row(id=1, score=None, type=u'1'),
 Row(id=None, score=0, type=u'2'),
 Row(id=None, score=1, type=u'2')]

### Presto

In [30]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [31]:
# Schema Presto reports for this version's table. Uses the function-call form
# of print, which runs on both Python 2 and Python 3.
print(get_schema(cursor, v))

root\
 |-- score: bigint\
 |-- type: varchar


In [32]:
# Both partitions come back under the single `score` column, including the
# type=1 rows that were written as `id`.
# NOTE(review): this suggests columns are matched by position rather than by
# name here -- confirm against the Presto configuration in use.
cursor.execute("SELECT * FROM schema_evolution_{}".format(v))
cursor.fetchall()

[(0, u'2'), (1, u'2'), (0, u'1'), (1, u'1')]

## Replace Column

Note that this is similar to "rename column", but the new data has a different type.

In [33]:
# Version label for the "replace column" experiment.
v = 'v4'

# First partition (type=1): an integer-valued column named `id`.
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '1')

In [34]:
# Second partition (type=2): a string-valued column named `score` takes the
# place of `id`, so both the name and the type differ from the first partition.
rdd = sc.parallelize([('a',),('b',)], 1)
df = sqlContext.createDataFrame(rdd, ['score'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [35]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- type: string (nullable = true)



In [36]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: string (nullable = true)
 |-- type: string (nullable = true)



In [37]:
df = sqlContext.read.load(dataset + v, 'parquet')
df.filter(df['type'] == '2').select('id').collect()

[Row(id=None), Row(id=None)]

In [38]:
df = sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet')
df.collect()

[Row(id=0, score=None, type=u'1'),
 Row(id=1, score=None, type=u'1'),
 Row(id=None, score=u'a', type=u'2'),
 Row(id=None, score=u'b', type=u'2')]

### Presto

In [39]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [40]:
# Schema Presto reports for this version's table. Uses the function-call form
# of print, which runs on both Python 2 and Python 3.
print(get_schema(cursor, v))

root\
 |-- score: varchar\
 |-- type: varchar


In [41]:
# Reading the type=1 partition (written with an integer `id`) through the
# varchar `score` column fails: Presto raises an internal
# NullPointerException, shown in the recorded output of this cell.
cursor.execute("SELECT score FROM schema_evolution_{} WHERE type = '1'".format(v))
cursor.fetchall()

DatabaseError: {u'errorCode': 65536, u'errorType': u'INTERNAL_ERROR', u'failureInfo': {u'suppressed': [], u'type': u'java.lang.NullPointerException', u'stack': [u'com.facebook.presto.spi.RecordPageSource.getNextPage(RecordPageSource.java:124)', u'com.facebook.presto.operator.TableScanOperator.getOutput(TableScanOperator.java:246)', u'com.facebook.presto.operator.Driver.processInternal(Driver.java:378)', u'com.facebook.presto.operator.Driver.processFor(Driver.java:301)', u'com.facebook.presto.execution.SqlTaskExecution$DriverSplitRunner.processFor(SqlTaskExecution.java:618)', u'com.facebook.presto.execution.TaskExecutor$PrioritizedSplitRunner.process(TaskExecutor.java:529)', u'com.facebook.presto.execution.TaskExecutor$Runner.run(TaskExecutor.java:665)', u'java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)', u'java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)', u'java.lang.Thread.run(Thread.java:745)']}, u'errorName': u'GENERIC_INTERNAL_ERROR'}

## Transpose Columns

In [42]:
# Version label for the "transpose columns" experiment.
v = 'v5'

# First partition (type=1): column order is id, transpose_a, transpose_b.
rdd = sc.parallelize([(0,'a','b')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'transpose_a', 'transpose_b'])
df.write.parquet(dataset + v + partition + '1')

In [43]:
# Second partition (type=2): the two string columns swap position. Values are
# chosen so that, by name, transpose_a is still 'a' and transpose_b is still 'b'.
rdd = sc.parallelize([(1,'b','a')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'transpose_b', 'transpose_a'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [44]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- transpose_a: string (nullable = true)
 |-- transpose_b: string (nullable = true)
 |-- type: string (nullable = true)



In [45]:
sqlContext.read.load(dataset + v, 'parquet').select('transpose_a','transpose_b').collect()

[Row(transpose_a=u'a', transpose_b=u'b'),
 Row(transpose_a=u'a', transpose_b=u'b')]

In [46]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- transpose_a: string (nullable = true)
 |-- transpose_b: string (nullable = true)
 |-- type: string (nullable = true)



In [47]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').select('transpose_a','transpose_b').collect()

[Row(transpose_a=u'a', transpose_b=u'b'),
 Row(transpose_a=u'a', transpose_b=u'b')]

### Presto

In [48]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [49]:
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- transpose_b: varchar\
 |-- transpose_a: varchar\
 |-- type: varchar


In [50]:
# One of the two rows comes back with its values swapped, whereas Spark
# returned ('a', 'b') for both rows.
# NOTE(review): this suggests Presto maps columns by position, not by name,
# in this setup -- confirm.
cursor.execute("SELECT transpose_a, transpose_b FROM schema_evolution_{}".format(v))
cursor.fetchall()

[(u'a', u'b'), (u'b', u'a')]

## Transpose, Delete and Add Columns

In [51]:
# Version label for the combined transpose / delete / add experiment.
v = 'v6'

# First partition (type=1): columns are id, removed, transposed.
rdd = sc.parallelize([(0,'r','t')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'removed', 'transposed'])
df.write.parquet(dataset + v + partition + '1')

In [52]:
# Second partition (type=2): `removed` is gone, `transposed` moves up one
# position, and a new `added` column is appended.
rdd = sc.parallelize([(1,'t','a')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'transposed', 'added'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [53]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- removed: string (nullable = true)
 |-- transposed: string (nullable = true)
 |-- type: string (nullable = true)



In [54]:
sqlContext.read.load(dataset + v, 'parquet').collect()

[Row(id=0, removed=u'r', transposed=u't', type=u'1'),
 Row(id=1, removed=None, transposed=u't', type=u'2')]

In [55]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- removed: string (nullable = true)
 |-- transposed: string (nullable = true)
 |-- added: string (nullable = true)
 |-- type: string (nullable = true)



In [56]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').collect()

[Row(id=0, removed=u'r', transposed=u't', added=None, type=u'1'),
 Row(id=1, removed=None, transposed=u't', added=u'a', type=u'2')]

### Presto

In [57]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [58]:
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- transposed: varchar\
 |-- added: varchar\
 |-- type: varchar


In [59]:
# For the type=1 row, `transposed` holds the value that was written as
# `removed` ('r') and `added` holds the value written as `transposed` ('t').
# The type=2 row comes back as written.
cursor.execute("SELECT id, transposed, added, type FROM schema_evolution_{}".format(v))
cursor.fetchall()

[(0, u'r', u't', u'1'), (1, u't', u'a', u'2')]