In [1]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the session through the builder so this cell is safe to re-run:
# constructing pyspark.SparkContext() directly raises when a context is
# already active in the kernel, whereas getOrCreate() reuses the existing one.
spark = SparkSession.builder.getOrCreate()
# Keep `sc` available for any code that expects the underlying SparkContext.
sc = spark.sparkContext

In [22]:
from pyspark.sql.types import (
    LongType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for the flight records, assembled one column at a time.
# Each add() call takes (name, dataType, nullable).
# NOTE(review): "count" is declared non-nullable; file-based readers may relax
# this to nullable — confirm with df.printSchema() if it matters.
csv_schema = (
    StructType()
    .add("DEST_COUNTRY_NAME", StringType(), True)
    .add("ORIGIN_COUNTRY_NAME", StringType(), True)
    .add("count", LongType(), False)
)

# spark.read is a property that hands back a new DataFrameReader on each access;
# the reader is configured step by step and load() produces the DataFrame.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')  # first line of each file carries the column names
    .schema(csv_schema)        # apply the explicit csv_schema to the columns
    # Presumably a directory holding several CSV files (TODO confirm); a
    # single-file alternative is 'spark_training_baseline/data/flights.csv'.
    .load('spark_training_baseline/data/flights_multiple')
)

# Display the first rows of the loaded data.
df.show()

# Report how many partitions back the DataFrame after loading.
print(df.rdd.getNumPartitions())

+--------------------+--------------------+-----+
|   DEST_COUNTRY_NAME| ORIGIN_COUNTRY_NAME|count|
+--------------------+--------------------+-----+
|       United States|            Ethiopia|   12|
|       United States|              Panama|  465|
|       United States|               Aruba|  342|
|       United States|            Thailand|    4|
|       United States|Turks and Caicos ...|  236|
|             Croatia|       United States|    2|
|       United States|            Pakistan|   12|
|              Cyprus|       United States|    1|
|       United States|            Honduras|  407|
|                Fiji|       United States|   24|
|               Qatar|       United States|  108|
|Saint Kitts and N...|       United States|  139|
|              Kuwait|       United States|   32|
|              Taiwan|       United States|  266|
|               Haiti|       United States|  226|
|              Canada|       United States| 8399|
|Federated States ...|       United States|   69|


In [5]:
# Shell escape: print the first ten lines of the raw JSON file.
!head -10 spark_training_baseline/data/flights.json

{"ORIGIN_COUNTRY_NAME":"Romania","DEST_COUNTRY_NAME":"United States","count":15}
{"ORIGIN_COUNTRY_NAME":"Croatia","DEST_COUNTRY_NAME":"United States","count":1}
{"ORIGIN_COUNTRY_NAME":"Ireland","DEST_COUNTRY_NAME":"United States","count":344}
{"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Egypt","count":15}
{"ORIGIN_COUNTRY_NAME":"India","DEST_COUNTRY_NAME":"United States","count":62}
{"ORIGIN_COUNTRY_NAME":"Singapore","DEST_COUNTRY_NAME":"United States","count":1}
{"ORIGIN_COUNTRY_NAME":"Grenada","DEST_COUNTRY_NAME":"United States","count":62}
{"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Costa Rica","count":588}
{"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Senegal","count":40}
{"ORIGIN_COUNTRY_NAME":"United States","DEST_COUNTRY_NAME":"Moldova","count":1}


In [13]:
# Load the gzip-compressed flights JSON, reusing csv_schema for its columns.
# NOTE(review): this is an absolute, user-specific path while the other cells
# use relative paths — confirm and consider making it relative/configurable.
# NOTE(review): the 'compression' option is presumably redundant on read (the
# codec is normally inferred from the .gz suffix) — confirm before removing.
df_json = (
    spark.read
    .format('json')
    .option('compression', 'gzip')
    .schema(csv_schema)
    .load('/home/andras/git/spark/data/flights.json.gz')
)
df_json.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [16]:
# Register the CSV-backed DataFrame with Spark SQL under the name 'flights'.
df.createOrReplaceTempView('flights')

# Sum the flight counts per origin country and sort the totals, largest first.
outbound_sql = """
SELECT ORIGIN_COUNTRY_NAME, sum(count) as total_outbound
FROM flights
GROUP BY ORIGIN_COUNTRY_NAME
ORDER BY total_outbound DESC;
"""
result_df = spark.sql(outbound_sql)
result_df.show()

+-------------------+--------------+
|ORIGIN_COUNTRY_NAME|total_outbound|
+-------------------+--------------+
|      United States|        411966|
|             Canada|          8483|
|             Mexico|          7187|
|     United Kingdom|          1970|
|              Japan|          1496|
| Dominican Republic|          1420|
|            Germany|          1336|
|        The Bahamas|           986|
|             France|           952|
|              China|           920|
|           Colombia|           867|
|        South Korea|           827|
|            Jamaica|           712|
|        Netherlands|           660|
|             Brazil|           619|
|         Costa Rica|           608|
|        El Salvador|           508|
|               Cuba|           478|
|             Panama|           465|
|              Spain|           442|
+-------------------+--------------+
only showing top 20 rows



In [17]:
# Persist the aggregated totals as ';'-separated CSV with a header row,
# replacing anything already present at the target directory.
(
    result_df.write
    .format('csv')
    .option('header', 'true')
    .option('sep', ';')
    .mode('overwrite')
    .save('data/flgiths_stat')
)

In [23]:
# Shell escape: list the files the CSV writer produced in data/flgiths_stat.
!ls data/flgiths_stat

_SUCCESS
part-00000-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00001-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00002-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00003-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00004-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00005-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00006-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00007-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00008-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00009-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00010-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00011-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00012-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00013-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00014-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00015-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part-00016-181315c0-ec4a-429e-b0c7-d86af97ed692-c000.csv
part

In [24]:
# Read the ';'-separated CSV output directory back into a DataFrame.
# No schema is supplied here, only the header and separator options.
df2 = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('sep', ';')
    .load('data/flgiths_stat')
)

In [25]:
# Number of partitions behind df2 after reading the CSV directory back.
df2.rdd.getNumPartitions()

6

In [30]:
# Redistribute df2 into three partitions, then confirm the new partition count.
df3 = df2.repartition(3)
df3.rdd.getNumPartitions()

3

In [33]:
# Collapse df3 into a single partition so the writer emits one part file.
df4 = df3.repartition(1)

# Write it as ';'-separated CSV with a header, replacing any previous output.
(
    df4.write
    .format('csv')
    .option('header', 'true')
    .option('sep', ';')
    .mode('overwrite')
    .save('data/flgiths_stat2')
)

In [36]:
# Shell escape: list the contents of the data/flgiths_stat2 output directory.
!ls data/flgiths_stat2

_SUCCESS  part-00000-28abe62b-d7c2-4c4d-8a1d-ffa44f89428b-c000.csv


In [35]:
# Same single-partition write as the repartition(1) variant, but using
# coalesce(1), which merges existing partitions without a full shuffle.
df4 = df3.coalesce(1)

# Write it as ';'-separated CSV with a header, replacing any previous output.
(
    df4.write
    .format('csv')
    .option('header', 'true')
    .option('sep', ';')
    .mode('overwrite')
    .save('data/flgiths_stat2')
)

In [38]:
# Write the JSON-sourced DataFrame as ';'-separated CSV, partitioned on disk by
# ORIGIN_COUNTRY_NAME. mode('overwrite') replaces whatever the earlier df4
# writes left in 'data/flgiths_stat2'.
(
    df_json.write
    .format('csv')
    .option('header', 'true')
    .option('sep', ';')
    .mode('overwrite')
    .partitionBy('ORIGIN_COUNTRY_NAME')
    .save('data/flgiths_stat2')
)

In [39]:
# Shell escape: print the kernel's current working directory.
!pwd


/home/andras/ipython_spark


In [40]:
# Save df4 in Parquet format, replacing any existing output at the target path.
(
    df4.write
    .format('parquet')
    .mode('overwrite')
    .save('data/flgiths_pq')
)

In [41]:
# Shell escape: list the files the Parquet writer produced in data/flgiths_pq.
!ls data/flgiths_pq

_SUCCESS  part-00000-2514fffd-f089-44bd-8be2-fd1bca653e06-c000.snappy.parquet


In [42]:
# Load the Parquet output back into a DataFrame.
df_from_pq = (
    spark.read
    .format('parquet')
    .load('data/flgiths_pq')
)

In [45]:
# Project only the ORIGIN_COUNTRY_NAME column from the Parquet-backed DataFrame
# and display the first rows.
df_from_pq.select('ORIGIN_COUNTRY_NAME').show()

+--------------------+
| ORIGIN_COUNTRY_NAME|
+--------------------+
|               Egypt|
|Bonaire, Sint Eus...|
| Trinidad and Tobago|
|           Gibraltar|
|             Croatia|
|             Bolivia|
|    French Polynesia|
|  Dominican Republic|
|             Namibia|
|             Liberia|
|            Pakistan|
|             Ukraine|
|        Cook Islands|
|           Greenland|
|      United Kingdom|
|         South Korea|
|              Guyana|
|             Austria|
|         Netherlands|
|             Romania|
+--------------------+
only showing top 20 rows



In [44]:
# Shell escape: print the kernel's current working directory.
!pwd

/home/andras/ipython_spark
