The aim of this notebook is to demonstrate the semantics of schema evolution for parquet files using both Spark and parquet2hive + Presto. Note that all parquet2hive does is read the schema *from the most recently created file*, so in some cases its behavior could be changed, without changing Presto's underlying facilities, by reading *all* files instead.

In this notebook I am running Spark locally, and using a remote Presto cluster. To connect to this cluster I'm using parquet2hive_server [0], which is just a simple API for parquet2hive on the remote cluster. To run this notebook successfully, you'll need to run the following on the Presto cluster:

```
sudo pip install parquet2hive_server
start_parquet2hive_server
```

[0] http://www.github.com/fbertsch/parquet2hive_server

In [61]:
from parquet2hive_server.client import Parquet2HiveClient
from pyhive import presto
from pprint import pprint

presto_dns = 'ec2-54-149-100-125.us-west-2.compute.amazonaws.com'

client = Parquet2HiveClient(presto_dns + ':5129')

In [62]:
import boto3

bucket, prefix = "telemetry-test-bucket", "schema_evolution"
dataset = "s3://{}/{}/".format(bucket, prefix)
partition = '/type='

s3 = boto3.resource('s3')

# Start from a clean slate: remove everything previously written under the
# test prefix so the parquet writes below do not collide with old data.
# A single list_objects call returns at most 1000 keys, so page through the
# listing; each page is also small enough for one delete_objects request.
paginator = s3.meta.client.get_paginator('list_objects')
for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
    delete_keys = {'Objects': [{'Key': obj['Key']} for obj in page.get('Contents', [])]}

    # delete_objects rejects a request with no keys, so skip empty pages
    # (e.g. the very first run, when nothing exists under the prefix yet).
    if not delete_keys['Objects']:
        continue

    try:
        response = s3.meta.client.delete_objects(Bucket=bucket, Delete=delete_keys)
    except Exception as e:
        # Cleanup stays best-effort, but a failure is reported instead of
        # being silently swallowed.
        print('Could not delete objects under {}: {}'.format(dataset, e))
    else:
        # Per-key failures are reported in the response, not raised.
        for error in response.get('Errors', []):
            print('Failed to delete {}: {}'.format(error.get('Key'), error.get('Message')))

In [63]:
# Connect to the remote Presto cluster; `cursor` is the module-level cursor
# that later cells pass to get_schema() and execute().
conn = presto.connect(host=presto_dns, port='8889')
cursor = conn.cursor()

def get_schema(_cursor, _v):
    """Return the schema of table ``schema_evolution_<_v>`` as text.

    The layout is similar to Spark's ``dataframe.printSchema()``: a ``root``
    line followed by one `` |-- <column>: <type>`` line per column. Lines are
    joined with a backslash followed by a newline. The string is returned,
    not printed; callers print it themselves.

    Only the first two fields of each ``describe`` row (column name and
    type) are read, so rows carrying a different number of trailing
    metadata fields do not break the helper.
    """
    _cursor.execute('describe schema_evolution_{}'.format(_v))
    lines = ['root']
    for row in _cursor.fetchall():
        # row[0] is the column name, row[1] its type; the rest is ignored.
        lines.append(' |-- {}: {}'.format(row[0], row[1]))
    return '\\\n'.join(lines)

def execute(_cursor, _query):
    """Run ``_query`` and return all result rows rendered as text.

    Each row becomes a comma-separated list of ``column=value`` pairs, with
    column names taken from the cursor's ``description``. Rows are joined
    with a backslash followed by a newline.
    """
    _cursor.execute(_query)
    rows = _cursor.fetchall()
    names = [column[0] for column in _cursor.description]

    rendered = []
    for row in rows:
        pairs = ['{}={}'.format(name, value) for name, value in zip(names, row)]
        rendered.append(', '.join(pairs))
    return '\\\n'.join(rendered)

## Adding a Column

In [64]:
v = 'v1'

In [65]:
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '1')

In [66]:
rdd = sc.parallelize([(0,0),(1,1)], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'score'])
df.write.parquet(dataset + v + partition  + '2')

### Spark

In [67]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- type: string (nullable = true)



In [68]:
sqlContext.read.load(dataset + v, 'parquet').collect()

[Row(id=0, type=u'2'),
 Row(id=1, type=u'2'),
 Row(id=0, type=u'1'),
 Row(id=1, type=u'1')]

In [69]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [70]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').collect()

[Row(id=0, score=0, type=u'2'),
 Row(id=1, score=1, type=u'2'),
 Row(id=0, score=None, type=u'1'),
 Row(id=1, score=None, type=u'1')]

### Presto

In [71]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [72]:
print get_schema(cursor, v)

root\
 |-- id: bigint\
 |-- score: bigint\
 |-- type: varchar


In [73]:
print execute(cursor, "SELECT * FROM schema_evolution_{}".format(v))

id=0, score=None, type=1\
id=1, score=None, type=1\
id=0, score=0, type=2\
id=1, score=1, type=2


## Removing a Column

In [74]:
v = 'v2'

rdd = sc.parallelize([(0,0),(1,1)], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'score'])
df.write.parquet(dataset + v + partition  + '1')

In [75]:
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [76]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [77]:
sqlContext.read.load(dataset + v, 'parquet').collect()

[Row(id=0, score=0, type=u'1'),
 Row(id=1, score=1, type=u'1'),
 Row(id=0, score=None, type=u'2'),
 Row(id=1, score=None, type=u'2')]

In [78]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [79]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').collect()

[Row(id=0, score=0, type=u'1'),
 Row(id=1, score=1, type=u'1'),
 Row(id=0, score=None, type=u'2'),
 Row(id=1, score=None, type=u'2')]

### Presto

In [80]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [81]:
print get_schema(cursor, v)

root\
 |-- id: bigint\
 |-- type: varchar


In [82]:
print execute(cursor, "SELECT * FROM schema_evolution_{}".format(v))

id=0, type=1\
id=1, type=1\
id=0, type=2\
id=1, type=2


## Renaming a Column

In [83]:
v = 'v3'

In [84]:
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '1')

In [85]:
rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['score'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [86]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- type: string (nullable = true)



In [87]:
sqlContext.read.load(dataset + v, 'parquet').collect()

[Row(id=None, type=u'2'),
 Row(id=None, type=u'2'),
 Row(id=0, type=u'1'),
 Row(id=1, type=u'1')]

In [88]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: long (nullable = true)
 |-- type: string (nullable = true)



In [89]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').collect()

[Row(id=None, score=0, type=u'2'),
 Row(id=None, score=1, type=u'2'),
 Row(id=0, score=None, type=u'1'),
 Row(id=1, score=None, type=u'1')]

### Presto

In [90]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [91]:
print get_schema(cursor, v)

root\
 |-- score: bigint\
 |-- type: varchar


In [92]:
print execute(cursor, "SELECT * FROM schema_evolution_{}".format(v))

score=0, type=2\
score=1, type=2\
score=None, type=1\
score=None, type=1


## Replace Column

Note that this is similar to "rename column", but the new data has a different type.

In [93]:
v = 'v4'

rdd = sc.parallelize([(0,),(1,)], 1)
df = sqlContext.createDataFrame(rdd, ['id'])
df.write.parquet(dataset + v + partition + '1')

In [94]:
rdd = sc.parallelize([('a',),('b',)], 1)
df = sqlContext.createDataFrame(rdd, ['score'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [95]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- type: string (nullable = true)



In [96]:
# NOTE: the collected rows are assigned to `df` (rebinding it to the list
# returned by collect()), so this cell displays no output, unlike the
# equivalent non-merged collect() cells in the other sections.
df = sqlContext.read.load(dataset + v, 'parquet').collect()

In [97]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- score: string (nullable = true)
 |-- type: string (nullable = true)



In [98]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').collect()

[Row(id=0, score=None, type=u'1'),
 Row(id=1, score=None, type=u'1'),
 Row(id=None, score=u'a', type=u'2'),
 Row(id=None, score=u'b', type=u'2')]

### Presto

In [99]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [100]:
print get_schema(cursor, v)

root\
 |-- score: varchar\
 |-- type: varchar


In [101]:
print execute(cursor, "SELECT * FROM schema_evolution_{}".format(v))

score=None, type=1\
score=None, type=1\
score=a, type=2\
score=b, type=2


## Transpose Columns

In [102]:
v = 'v5'

rdd = sc.parallelize([(0,'a','b')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'transpose_a', 'transpose_b'])
df.write.parquet(dataset + v + partition + '1')

In [103]:
rdd = sc.parallelize([(1,'b','a')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'transpose_b', 'transpose_a'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [104]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- transpose_a: string (nullable = true)
 |-- transpose_b: string (nullable = true)
 |-- type: string (nullable = true)



In [105]:
sqlContext.read.load(dataset + v, 'parquet').select('transpose_a','transpose_b').collect()

[Row(transpose_a=u'a', transpose_b=u'b'),
 Row(transpose_a=u'a', transpose_b=u'b')]

In [106]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- transpose_a: string (nullable = true)
 |-- transpose_b: string (nullable = true)
 |-- type: string (nullable = true)



In [107]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').select('transpose_a','transpose_b').collect()

[Row(transpose_a=u'a', transpose_b=u'b'),
 Row(transpose_a=u'a', transpose_b=u'b')]

### Presto

In [108]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [109]:
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- transpose_b: varchar\
 |-- transpose_a: varchar\
 |-- type: varchar


In [110]:
print execute(cursor, "SELECT * FROM schema_evolution_{}".format(v))

id=1, transpose_b=b, transpose_a=a, type=2\
id=0, transpose_b=b, transpose_a=a, type=1


## Transpose, Delete and Add Columns

In [111]:
v = 'v6'

rdd = sc.parallelize([(0,'r','t')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'removed', 'transposed'])
df.write.parquet(dataset + v + partition + '1')

In [112]:
rdd = sc.parallelize([(1,'t','a')], 1)
df = sqlContext.createDataFrame(rdd, ['id', 'transposed', 'added'])
df.write.parquet(dataset + v + partition + '2')

### Spark

In [113]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- removed: string (nullable = true)
 |-- transposed: string (nullable = true)
 |-- type: string (nullable = true)



In [114]:
sqlContext.read.load(dataset + v, 'parquet').collect()

[Row(id=0, removed=u'r', transposed=u't', type=u'1'),
 Row(id=1, removed=None, transposed=u't', type=u'2')]

In [115]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- id: long (nullable = true)
 |-- removed: string (nullable = true)
 |-- transposed: string (nullable = true)
 |-- added: string (nullable = true)
 |-- type: string (nullable = true)



In [116]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').collect()

[Row(id=0, removed=u'r', transposed=u't', added=None, type=u'1'),
 Row(id=1, removed=None, transposed=u't', added=u'a', type=u'2')]

### Presto

In [117]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [118]:
print(get_schema(cursor, v))

root\
 |-- id: bigint\
 |-- transposed: varchar\
 |-- added: varchar\
 |-- type: varchar


In [119]:
print execute(cursor, "SELECT * FROM schema_evolution_{}".format(v))

id=1, transposed=t, added=a, type=2\
id=0, transposed=t, added=None, type=1


# Nested Row Type - Adding a Subcolumn

In [120]:
from pyspark.sql.types import *

v = 'v7'

# One row with a single top-level column "nested": the outer list holds the
# rows, the middle list the row's columns, and the inner list the struct's
# field values. Here the struct has two fields (id, exists).
df = sqlContext.createDataFrame([[[1, 'e']]], StructType([
    StructField("nested", 
                StructType([
                        StructField("id", LongType()), 
                        StructField("exists", StringType())
                    ])
               )
]))

# Written first, under the type=1 partition directory of dataset version v7.
df.write.parquet(dataset + v + partition + '1')

In [121]:
# Same single "nested" struct column, but with a third sub-field "added"
# appended after id and exists.
df = sqlContext.createDataFrame([[[1, 'e', 'a']]], StructType([
    StructField("nested", 
                StructType([
                        StructField("id", LongType()), 
                        StructField("exists", StringType()),
                        StructField("added", StringType())
                    ])
               )
]))
# Written second, under the type=2 partition directory.
df.write.parquet(dataset + v + partition + '2')

## Spark

In [122]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- nested: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- exists: string (nullable = true)
 |-- type: string (nullable = true)



In [123]:
sqlContext.read.load(dataset + v, 'parquet').collect()

[Row(nested=Row(id=1, exists=u'e'), type=u'2'),
 Row(nested=Row(id=1, exists=u'e'), type=u'1')]

In [124]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- nested: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- exists: string (nullable = true)
 |    |-- added: string (nullable = true)
 |-- type: string (nullable = true)



In [125]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').collect()

[Row(nested=Row(id=1, exists=u'e', added=u'a'), type=u'2'),
 Row(nested=Row(id=1, exists=u'e', added=None), type=u'1')]

## Presto

In [126]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [127]:
print(get_schema(cursor, v))

root\
 |-- nested: row(id bigint, exists varchar, added varchar)\
 |-- type: varchar


In [128]:
print execute(cursor, "SELECT * FROM schema_evolution_{}".format(v))

DatabaseError: {u'errorCode': 16777219, u'message': u'Error opening Hive split s3://telemetry-test-bucket/schema_evolution/v7/type=1/part-r-00000-d4702419-81ea-43cf-b8e0-6b89bac2df96.snappy.parquet (offset=0, length=419): Schema mismatch, metastore schema for row column nested has 3 fields but parquet schema has 2 fields', u'errorType': u'EXTERNAL', u'failureInfo': {u'suppressed': [], u'cause': {u'suppressed': [], u'message': u'Schema mismatch, metastore schema for row column nested has 3 fields but parquet schema has 2 fields', u'type': u'java.lang.IllegalArgumentException', u'stack': [u'com.google.common.base.Preconditions.checkArgument(Preconditions.java:145)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor$ParquetStructConverter.<init>(ParquetHiveRecordCursor.java:743)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.createGroupConverter(ParquetHiveRecordCursor.java:718)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.access$300(ParquetHiveRecordCursor.java:99)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor$PrestoReadSupport.<init>(ParquetHiveRecordCursor.java:434)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.createParquetRecordReader(ParquetHiveRecordCursor.java:336)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.<init>(ParquetHiveRecordCursor.java:156)', u'com.facebook.presto.hive.parquet.ParquetRecordCursorProvider.createRecordCursor(ParquetRecordCursorProvider.java:92)', u'com.facebook.presto.hive.HivePageSourceProvider.createHivePageSource(HivePageSourceProvider.java:158)', u'com.facebook.presto.hive.HivePageSourceProvider.createPageSource(HivePageSourceProvider.java:88)', u'com.facebook.presto.spi.connector.classloader.ClassLoaderSafeConnectorPageSourceProvider.createPageSource(ClassLoaderSafeConnectorPageSourceProvider.java:44)', u'com.facebook.presto.split.PageSourceManager.createPageSource(PageSourceManager.java:56)', 
u'com.facebook.presto.operator.TableScanOperator.getOutput(TableScanOperator.java:253)', u'com.facebook.presto.operator.Driver.processInternal(Driver.java:378)', u'com.facebook.presto.operator.Driver.processFor(Driver.java:301)', u'com.facebook.presto.execution.SqlTaskExecution$DriverSplitRunner.processFor(SqlTaskExecution.java:622)', u'com.facebook.presto.execution.TaskExecutor$PrioritizedSplitRunner.process(TaskExecutor.java:555)', u'com.facebook.presto.execution.TaskExecutor$Runner.run(TaskExecutor.java:691)', u'java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)', u'java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)', u'java.lang.Thread.run(Thread.java:745)']}, u'message': u'Error opening Hive split s3://telemetry-test-bucket/schema_evolution/v7/type=1/part-r-00000-d4702419-81ea-43cf-b8e0-6b89bac2df96.snappy.parquet (offset=0, length=419): Schema mismatch, metastore schema for row column nested has 3 fields but parquet schema has 2 fields', u'type': u'com.facebook.presto.spi.PrestoException', u'stack': [u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.createParquetRecordReader(ParquetHiveRecordCursor.java:382)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.<init>(ParquetHiveRecordCursor.java:156)', u'com.facebook.presto.hive.parquet.ParquetRecordCursorProvider.createRecordCursor(ParquetRecordCursorProvider.java:92)', u'com.facebook.presto.hive.HivePageSourceProvider.createHivePageSource(HivePageSourceProvider.java:158)', u'com.facebook.presto.hive.HivePageSourceProvider.createPageSource(HivePageSourceProvider.java:88)', u'com.facebook.presto.spi.connector.classloader.ClassLoaderSafeConnectorPageSourceProvider.createPageSource(ClassLoaderSafeConnectorPageSourceProvider.java:44)', u'com.facebook.presto.split.PageSourceManager.createPageSource(PageSourceManager.java:56)', u'com.facebook.presto.operator.TableScanOperator.getOutput(TableScanOperator.java:253)', 
u'com.facebook.presto.operator.Driver.processInternal(Driver.java:378)', u'com.facebook.presto.operator.Driver.processFor(Driver.java:301)', u'com.facebook.presto.execution.SqlTaskExecution$DriverSplitRunner.processFor(SqlTaskExecution.java:622)', u'com.facebook.presto.execution.TaskExecutor$PrioritizedSplitRunner.process(TaskExecutor.java:555)', u'com.facebook.presto.execution.TaskExecutor$Runner.run(TaskExecutor.java:691)', u'java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)', u'java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)', u'java.lang.Thread.run(Thread.java:745)']}, u'errorName': u'HIVE_CANNOT_OPEN_SPLIT'}

# Nested Row Type - Removing a Subcolumn

In [129]:
v = 'v8'

# One row with a single "nested" struct column holding three sub-fields
# (id, exists, removed).
df = sqlContext.createDataFrame([[[1, 'e', 'r']]], StructType([
    StructField("nested", 
                StructType([
                        StructField("id", LongType()), 
                        StructField("exists", StringType()),
                        StructField("removed", StringType())
                    ])
               )
]))

# Written first, under the type=1 partition directory of dataset version v8.
df.write.parquet(dataset + v + partition + '1')

In [130]:
# Same "nested" struct column, now without the "removed" sub-field: only
# id and exists remain.
df = sqlContext.createDataFrame([[[1, 'e']]], StructType([
    StructField("nested", 
                StructType([
                        StructField("id", LongType()), 
                        StructField("exists", StringType())
                    ])
               )
]))

# Written second, under the type=2 partition directory.
df.write.parquet(dataset + v + partition + '2')

## Spark

In [131]:
sqlContext.read.load(dataset + v, 'parquet').printSchema()

root
 |-- nested: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- exists: string (nullable = true)
 |    |-- removed: string (nullable = true)
 |-- type: string (nullable = true)



In [132]:
sqlContext.read.load(dataset + v, 'parquet').collect()

[Row(nested=Row(id=1, exists=u'e', removed=u'r'), type=u'1'),
 Row(nested=Row(id=1, exists=u'e', removed=None), type=u'2')]

In [133]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').printSchema()

root
 |-- nested: struct (nullable = true)
 |    |-- id: long (nullable = true)
 |    |-- exists: string (nullable = true)
 |    |-- removed: string (nullable = true)
 |-- type: string (nullable = true)



In [134]:
sqlContext.read.option("mergeSchema", "true").load(dataset + v, 'parquet').collect()

[Row(nested=Row(id=1, exists=u'e', removed=u'r'), type=u'1'),
 Row(nested=Row(id=1, exists=u'e', removed=None), type=u'2')]

## Presto

In [135]:
client.load(dataset=dataset, dv=v)

(200, u'{"Result": [null, null]}\n')

In [136]:
print(get_schema(cursor, v))

root\
 |-- nested: row(id bigint, exists varchar)\
 |-- type: varchar


In [137]:
print execute(cursor, "SELECT * FROM schema_evolution_{}".format(v))

DatabaseError: {u'errorCode': 16777219, u'message': u'Error opening Hive split s3://telemetry-test-bucket/schema_evolution/v8/type=1/part-r-00024-7eaefa5c-abc7-419a-94a5-081e2c608256.snappy.parquet (offset=0, length=500): Schema mismatch, metastore schema for row column nested has 2 fields but parquet schema has 3 fields', u'errorType': u'EXTERNAL', u'failureInfo': {u'suppressed': [], u'cause': {u'suppressed': [], u'message': u'Schema mismatch, metastore schema for row column nested has 2 fields but parquet schema has 3 fields', u'type': u'java.lang.IllegalArgumentException', u'stack': [u'com.google.common.base.Preconditions.checkArgument(Preconditions.java:145)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor$ParquetStructConverter.<init>(ParquetHiveRecordCursor.java:743)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.createGroupConverter(ParquetHiveRecordCursor.java:718)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.access$300(ParquetHiveRecordCursor.java:99)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor$PrestoReadSupport.<init>(ParquetHiveRecordCursor.java:434)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.createParquetRecordReader(ParquetHiveRecordCursor.java:336)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.<init>(ParquetHiveRecordCursor.java:156)', u'com.facebook.presto.hive.parquet.ParquetRecordCursorProvider.createRecordCursor(ParquetRecordCursorProvider.java:92)', u'com.facebook.presto.hive.HivePageSourceProvider.createHivePageSource(HivePageSourceProvider.java:158)', u'com.facebook.presto.hive.HivePageSourceProvider.createPageSource(HivePageSourceProvider.java:88)', u'com.facebook.presto.spi.connector.classloader.ClassLoaderSafeConnectorPageSourceProvider.createPageSource(ClassLoaderSafeConnectorPageSourceProvider.java:44)', u'com.facebook.presto.split.PageSourceManager.createPageSource(PageSourceManager.java:56)', 
u'com.facebook.presto.operator.TableScanOperator.getOutput(TableScanOperator.java:253)', u'com.facebook.presto.operator.Driver.processInternal(Driver.java:378)', u'com.facebook.presto.operator.Driver.processFor(Driver.java:301)', u'com.facebook.presto.execution.SqlTaskExecution$DriverSplitRunner.processFor(SqlTaskExecution.java:622)', u'com.facebook.presto.execution.TaskExecutor$PrioritizedSplitRunner.process(TaskExecutor.java:555)', u'com.facebook.presto.execution.TaskExecutor$Runner.run(TaskExecutor.java:691)', u'java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)', u'java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)', u'java.lang.Thread.run(Thread.java:745)']}, u'message': u'Error opening Hive split s3://telemetry-test-bucket/schema_evolution/v8/type=1/part-r-00024-7eaefa5c-abc7-419a-94a5-081e2c608256.snappy.parquet (offset=0, length=500): Schema mismatch, metastore schema for row column nested has 2 fields but parquet schema has 3 fields', u'type': u'com.facebook.presto.spi.PrestoException', u'stack': [u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.createParquetRecordReader(ParquetHiveRecordCursor.java:382)', u'com.facebook.presto.hive.parquet.ParquetHiveRecordCursor.<init>(ParquetHiveRecordCursor.java:156)', u'com.facebook.presto.hive.parquet.ParquetRecordCursorProvider.createRecordCursor(ParquetRecordCursorProvider.java:92)', u'com.facebook.presto.hive.HivePageSourceProvider.createHivePageSource(HivePageSourceProvider.java:158)', u'com.facebook.presto.hive.HivePageSourceProvider.createPageSource(HivePageSourceProvider.java:88)', u'com.facebook.presto.spi.connector.classloader.ClassLoaderSafeConnectorPageSourceProvider.createPageSource(ClassLoaderSafeConnectorPageSourceProvider.java:44)', u'com.facebook.presto.split.PageSourceManager.createPageSource(PageSourceManager.java:56)', u'com.facebook.presto.operator.TableScanOperator.getOutput(TableScanOperator.java:253)', 
u'com.facebook.presto.operator.Driver.processInternal(Driver.java:378)', u'com.facebook.presto.operator.Driver.processFor(Driver.java:301)', u'com.facebook.presto.execution.SqlTaskExecution$DriverSplitRunner.processFor(SqlTaskExecution.java:622)', u'com.facebook.presto.execution.TaskExecutor$PrioritizedSplitRunner.process(TaskExecutor.java:555)', u'com.facebook.presto.execution.TaskExecutor$Runner.run(TaskExecutor.java:691)', u'java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)', u'java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)', u'java.lang.Thread.run(Thread.java:745)']}, u'errorName': u'HIVE_CANNOT_OPEN_SPLIT'}