
# Brazilian Airline Historical Series Analysis

#### Frederico Horst

### Data Sources:
- Historical air fares by origin, destination and airline: available at [ANAC website](https://sistemas.anac.gov.br/sas/downloads/view/frmDownload.aspx)
- Inflation data, using IPCA index: available at [IBGE website](https://www.ibge.gov.br/estatisticas/economicas/precos-e-custos/9256-indice-nacional-de-precos-ao-consumidor-amplo.html?=&t=series-historicas)
- More information on air fares on [ANAC website](https://www.anac.gov.br/assuntos/dados-e-estatisticas/mercado-do-transporte-aereo)

### Goals:
- Build a database for historical deflated prices.
- Calculate the confidence interval for the average price range by route, considering a 95% confidence.
- Confidence intervals will be calculated by route, not considering airline differences. We want to take a closer look to the consumer point of view.


In [1]:
# external libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import ArrayType, IntegerType

# internal lib:
import files_processor
import basic_statistics

# spark configs:
spark = SparkSession.builder \
        .master("local") \
        .appName("anac-prices") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

sqlContext = SQLContext(spark)

In [2]:
# importing and cleaning files
anac_table = files_processor.files_cleaning(
    path_source='csv_files_from_anac',
    inflation_file='ipca_historico.csv')

anac_table.registerTempTable('anac_table')
anac_table.show()

###########################################################################
BEGGINING DATA CLEANING PROCESS
importing IPCA file
Inflation series imported successfully
#########################
beginning data cleaning
THIS IS THE END
###########################################################################
+----+-----+----------+-------+------+-----------+---------+------+-----+------------------+
|year|month|year_month|company|origin|destination|    route|tariff|seats|   deflated_tariff|
+----+-----+----------+-------+------+-----------+---------+------+-----+------------------+
|2015|   11|    201511|    AZU|  SBPA|       SBSL|SBPA>SBSL| 985.9|    5|22.152816007370042|
|2015|   11|    201511|    AZU|  SBGR|       SBCA|SBGR>SBCA| 305.9|    1| 6.873462234156097|
|2015|   11|    201511|    AZU|  SBPJ|       SBPA|SBPJ>SBPA|187.39|    2| 4.210585446415531|
|2015|   11|    201511|    AZU|  SBCT|       SBSP|SBCT>SBSP|100.01|   13| 2.247188486557539|
|2015|   11|    201511|    AZU|  SBPA|  

#### Top 20 routes for all time:

In [3]:
top20_query = """
    SELECT
        route,
        origin,
        destination,
        SUM(seats) AS sold_seats

    FROM anac_table

    GROUP BY route, origin, destination ORDER BY sold_seats DESC

    LIMIT 20
    """

top20_routes = sqlContext.sql(top20_query)
top20_routes.show()


+---------+------+-----------+----------+
|    route|origin|destination|sold_seats|
+---------+------+-----------+----------+
|SBRJ>SBSP|  SBRJ|       SBSP|  16819889|
|SBSP>SBRJ|  SBSP|       SBRJ|  16713324|
|SBGR>SBSV|  SBGR|       SBSV|   6228993|
|SBSV>SBGR|  SBSV|       SBGR|   6227379|
|SBSP>SBBR|  SBSP|       SBBR|   6221774|
|SBBR>SBSP|  SBBR|       SBSP|   6164102|
|SBSP>SBPA|  SBSP|       SBPA|   5273860|
|SBPA>SBSP|  SBPA|       SBSP|   5200499|
|SBSP>SBCT|  SBSP|       SBCT|   4935636|
|SBCT>SBSP|  SBCT|       SBSP|   4880941|
|SBGR>SBRF|  SBGR|       SBRF|   4702913|
|SBRF>SBGR|  SBRF|       SBGR|   4607603|
|SBPA>SBGR|  SBPA|       SBGR|   4596957|
|SBGR>SBPA|  SBGR|       SBPA|   4555837|
|SBSV>SBGL|  SBSV|       SBGL|   4419938|
|SBGL>SBSV|  SBGL|       SBSV|   4342069|
|SBRJ>SBBR|  SBRJ|       SBBR|   3763260|
|SBBR>SBRJ|  SBBR|       SBRJ|   3746032|
|SBGL>SBBR|  SBGL|       SBBR|   3713156|
|SBBR>SBGL|  SBBR|       SBGL|   3624796|
+---------+------+-----------+----

In [4]:
# Frequency in this dataset is determined by seats column. So we will replicate the lines according to the 
# number of sold seats, so we can better calculate the confidence interval.
# https://stackoverflow.com/questions/50624745/pyspark-how-to-duplicate-a-row-n-time-in-dataframe

# anac_table.registerTempTable('anac_table')

# use udf function to transform the n value to n times
n_to_array = udf(lambda n : [n] * n, ArrayType(IntegerType()))
anac_table_exploded = anac_table.withColumn('seats', n_to_array(anac_table.seats))

# now use explode 
anac_table_exploded = anac_table_exploded.withColumn('seats', explode(anac_table_exploded.seats))

anac_table_exploded = anac_table_exploded.select('year_month', 'year', 'month', 'company', 'origin', 'destination', 'route', 'tariff', 'deflated_tariff')


In [5]:
anac_table_exploded.show()

+----------+----+-----+-------+------+-----------+---------+------+------------------+
|year_month|year|month|company|origin|destination|    route|tariff|   deflated_tariff|
+----------+----+-----+-------+------+-----------+---------+------+------------------+
|    201511|2015|   11|    AZU|  SBPA|       SBSL|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|2015|   11|    AZU|  SBPA|       SBSL|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|2015|   11|    AZU|  SBPA|       SBSL|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|2015|   11|    AZU|  SBPA|       SBSL|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|2015|   11|    AZU|  SBPA|       SBSL|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|2015|   11|    AZU|  SBGR|       SBCA|SBGR>SBCA| 305.9| 6.873462234156097|
|    201511|2015|   11|    AZU|  SBPJ|       SBPA|SBPJ>SBPA|187.39| 4.210585446415531|
|    201511|2015|   11|    AZU|  SBPJ|       SBPA|SBPJ>SBPA|187.39| 4.210585446415531|
|    201511|2015|   11|    AZU|  SBCT|     

### Calculating the Confidence Interval 

In [6]:
anac_sts_top20 = anac_table_exploded.select('year_month', 'route', 'tariff', 'deflated_tariff') \
                        .filter(anac_table_exploded.route.isin(top20_routes.route)) \
                        # .toPandas()
anac_sts_top20.show()

+----------+---------+------+------------------+
|year_month|    route|tariff|   deflated_tariff|
+----------+---------+------+------------------+
|    201511|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|SBPA>SBSL| 985.9|22.152816007370042|
|    201511|SBGR>SBCA| 305.9| 6.873462234156097|
|    201511|SBPJ>SBPA|187.39| 4.210585446415531|
|    201511|SBPJ>SBPA|187.39| 4.210585446415531|
|    201511|SBCT>SBSP|100.01| 2.247188486557539|
|    201511|SBCT>SBSP|100.01| 2.247188486557539|
|    201511|SBCT>SBSP|100.01| 2.247188486557539|
|    201511|SBCT>SBSP|100.01| 2.247188486557539|
|    201511|SBCT>SBSP|100.01| 2.247188486557539|
|    201511|SBCT>SBSP|100.01| 2.247188486557539|
|    201511|SBCT>SBSP|100.01| 2.247188486557539|
|    201511|SBCT>SBSP|100.01| 2.247188486557539|
|    201511|SBCT>SBSP|100.01| 2.247188486557539|
|    201511|SBCT>SBS

In [7]:
year_months = anac_table_exploded.select('year_month').distinct()
year_months.show()

+----------+
|year_month|
+----------+
|    200206|
|    201809|
|    200508|
|    201001|
|    201210|
|    200801|
|    200606|
|    201503|
|    201607|
|    200902|
|    200203|
|    201903|
|    201208|
|    201702|
|    200202|
|    201204|
|    200511|
|    200802|
|    201510|
|    201909|
+----------+
only showing top 20 rows



In [11]:


# criando listas de dados para colocar no dataframe
tariff_mean_percentile_025 = []
tariff_mean_percentile_500 = []
tariff_mean_percentile_975 = []
bs_size = 10

# calculando o intervalo de confiança da média da tarifa: ######
print('calculando o intervalo de confiança da média da tarifa')
for i in year_months:
    # df = anac_sts_top20.select('deflated_tariff').filter(anac_sts_top20.year_month.isin(i))
    deflated_tariff = anac_sts_top20.select('deflated_tariff').filter(anac_sts_top20.year_month.isin(i))
    print('filtrou')
    deflated_tariff = np.array(deflated_tariff.collect())
    print(deflated_tariff)
    # deflated_tariff = np.array(df.select('deflated_tariff').collect())
    print('coletou no np')
    ci_mean_tariff = basic_statistics.draw_bs_reps(deflated_tariff, np.mean, size = bs_size)
    print('calculated confidence intervals for ' + str(i))
    tariff_mean_percentile_025.append(np.percentile(ci_mean_tariff, 2.5))
    tariff_mean_percentile_975.append(np.percentile(ci_mean_tariff, 97.5))
    tariff_mean_percentile_500.append(np.percentile(ci_mean_tariff, 50.0)) # median
    df = None

anac_sts1.tariff_mean_percentile_025 = tariff_mean_percentile_025
anac_sts1.tariff_mean_percentile_975 = tariff_mean_percentile_975
anac_sts1.tariff_mean_percentile_500 = tariff_mean_percentile_500

calculando o intervalo de confiança da média da tarifa
filtrou


Py4JJavaError: An error occurred while calling o411.collectToPython.
: org.apache.spark.SparkException: Could not execute broadcast in 300 secs. You can increase the timeout for broadcasts via spark.sql.broadcastTimeout or disable broadcast join by setting spark.sql.autoBroadcastJoinThreshold to -1
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:205)
	at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:515)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeBroadcast$1(SparkPlan.scala:193)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:189)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:203)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareRelation(BroadcastHashJoinExec.scala:217)
	at org.apache.spark.sql.execution.joins.HashJoin.codegenOuter(HashJoin.scala:497)
	at org.apache.spark.sql.execution.joins.HashJoin.codegenOuter$(HashJoin.scala:496)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenOuter(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.joins.HashJoin.doConsume(HashJoin.scala:352)
	at org.apache.spark.sql.execution.joins.HashJoin.doConsume$(HashJoin.scala:349)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:194)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:149)
	at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:87)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:194)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:149)
	at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:87)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:194)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:149)
	at org.apache.spark.sql.execution.FilterExec.consume(basicPhysicalOperators.scala:113)
	at org.apache.spark.sql.execution.FilterExec.doConsume(basicPhysicalOperators.scala:238)
	at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:194)
	at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:149)
	at org.apache.spark.sql.execution.InputAdapter.consume(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce(WholeStageCodegenExec.scala:483)
	at org.apache.spark.sql.execution.InputRDDCodegen.doProduce$(WholeStageCodegenExec.scala:456)
	at org.apache.spark.sql.execution.InputAdapter.doProduce(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.InputAdapter.produce(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.FilterExec.doProduce(basicPhysicalOperators.scala:153)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.FilterExec.produce(basicPhysicalOperators.scala:113)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:54)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:54)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.joins.HashJoin.doProduce(HashJoin.scala:346)
	at org.apache.spark.sql.execution.joins.HashJoin.doProduce$(HashJoin.scala:345)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:40)
	at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:54)
	at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:95)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:90)
	at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:41)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:655)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:718)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.python.EvalPythonExec.doExecute(EvalPythonExec.scala:88)
	at org.apache.spark.sql.execution.python.EvalPythonExec.doExecute$(EvalPythonExec.scala:87)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.doExecute(BatchEvalPythonExec.scala:34)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:50)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.GenerateExec.doExecute(GenerateExec.scala:80)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:525)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:453)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:452)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:496)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:50)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:746)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:321)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:387)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:3519)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3516)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.util.concurrent.TimeoutException
	at java.util.concurrent.FutureTask.get(FutureTask.java:205)
	at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:194)
	... 146 more


In [None]:
# https://seaborn.pydata.org/examples/jitter_stripplot.html