In [2]:
import pyspark
from pyspark.sql import SparkSession

# Reuse the active session if one already exists so this cell can be re-run
# safely: constructing pyspark.SparkContext() a second time in the same kernel
# fails because only one SparkContext may be active at a time.
spark = SparkSession.builder.getOrCreate()
# Low-level entry point for the RDD API, taken from the session.
sc = spark.sparkContext

In [10]:
# Sample rows: (product id, product name, qty)
data = [[1, "product1", 10],
        [2, "product2", 20]]

# Explicit schema for the DataFrame built from `data`.
# StructField(column_name, column_type, nullable)
# Only the names actually used are imported; a star import would dump every
# type class into the notebook namespace and hide where names come from.
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

schema = StructType([
    StructField('product_id', IntegerType(), False),
    StructField('product_name', StringType(), False),
    StructField('qty', IntegerType(), False),
])

In [11]:
# Build a DataFrame from the in-memory rows, applying the explicit schema.
df = spark.createDataFrame(data=data, schema=schema)

In [12]:
# Print the DataFrame's rows as a text table.
df.show()

+----------+------------+---+
|product_id|product_name|qty|
+----------+------------+---+
|         1|    product1| 10|
|         2|    product2| 20|
+----------+------------+---+



In [13]:
# Print the column names, types and nullability taken from the explicit schema.
df.printSchema()

root
 |-- product_id: integer (nullable = false)
 |-- product_name: string (nullable = false)
 |-- qty: integer (nullable = false)



In [14]:
# Schema inference: pass only the column names and let Spark derive the types
# from the data itself.
column_names = ['product_id', 'product_name', 'qty']
df2 = spark.createDataFrame(data, schema=column_names)

In [15]:
# Inspect the inferred schema: the integer columns come out as `long` and every
# column is reported as nullable.
df2.printSchema()

root
 |-- product_id: long (nullable = true)
 |-- product_name: string (nullable = true)
 |-- qty: long (nullable = true)



In [21]:
# Shell escape: print the first 10 lines of the raw CSV (header + data rows).
!head -10 spark_training_baseline/data/flights.csv

DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
United States,Romania,15
United States,Croatia,1
United States,Ireland,344
Egypt,United States,15
United States,India,62
United States,Singapore,1
United States,Grenada,62
Costa Rica,United States,588
Senegal,United States,40


In [22]:
from pyspark.sql.types import StructField, StructType, StringType, LongType

# Explicit schema for flights.csv.
# StructField(name, dataType, nullable, metadata)
csv_schema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    # Declared nullable on purpose: file-based sources treat user-supplied
    # columns as nullable, so nullable=False would not be enforced on read
    # and would only mislead the reader (check with df.printSchema()).
    StructField("count", LongType(), True),
])

# spark.read returns a DataFrameReader; format, options and schema are
# configured on it before load() points it at the file.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')   # first line holds the column names
    .schema(csv_schema)         # skip inference, use the schema above
    .load('spark_training_baseline/data/flights.csv')
)
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [23]:
# Register the DataFrame as a temporary view named `flights` so it can be
# referenced by that name in spark.sql() queries.
df.createOrReplaceTempView('flights')

In [25]:
# Peek at the first ten rows of the `flights` view through the SQL interface.
preview_query = """
SELECT * FROM flights LIMIT 10;
"""
spark.sql(preview_query).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+



In [43]:
# Exercise: find the origin country with the largest number of outbound flights.
top_origin_query = """
SELECT ORIGIN_COUNTRY_NAME, sum(count) as total_outbound
FROM flights
GROUP BY ORIGIN_COUNTRY_NAME
ORDER BY total_outbound DESC
LIMIT 1;
"""
df_spark = spark.sql(top_origin_query)
# Print the physical plan Spark generated for the SQL query.
df_spark.explain()

== Physical Plan ==
TakeOrderedAndProject(limit=1, orderBy=[total_outbound#312L DESC NULLS LAST], output=[ORIGIN_COUNTRY_NAME#45,total_outbound#312L])
+- *(2) HashAggregate(keys=[ORIGIN_COUNTRY_NAME#45], functions=[sum(count#46L)])
   +- Exchange hashpartitioning(ORIGIN_COUNTRY_NAME#45, 200), true, [id=#451]
      +- *(1) HashAggregate(keys=[ORIGIN_COUNTRY_NAME#45], functions=[partial_sum(count#46L)])
         +- FileScan csv [ORIGIN_COUNTRY_NAME#45,count#46L] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/andras/ipython_spark/spark_training_baseline/data/flights.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ORIGIN_COUNTRY_NAME:string,count:bigint>




In [44]:
# Import the functions module under a namespace instead of `import *`: the star
# import rebinds Python built-ins such as sum, min, max, abs and round to Spark
# column functions for the rest of the kernel session.
from pyspark.sql import functions as F

# Same aggregation as the SQL exercise, expressed with the DataFrame API:
# total the flight counts per origin country and keep the largest one.
df_python = (
    df.groupBy('ORIGIN_COUNTRY_NAME')
    .agg(F.sum('count').alias('total_outbound'))
    .sort(F.desc('total_outbound'))
    .limit(1)
)
# Print the physical plan so it can be compared with the SQL version's plan.
df_python.explain()

== Physical Plan ==
TakeOrderedAndProject(limit=1, orderBy=[total_outbound#323L DESC NULLS LAST], output=[ORIGIN_COUNTRY_NAME#45,total_outbound#323L])
+- *(2) HashAggregate(keys=[ORIGIN_COUNTRY_NAME#45], functions=[sum(count#46L)])
   +- Exchange hashpartitioning(ORIGIN_COUNTRY_NAME#45, 200), true, [id=#475]
      +- *(1) HashAggregate(keys=[ORIGIN_COUNTRY_NAME#45], functions=[partial_sum(count#46L)])
         +- FileScan csv [ORIGIN_COUNTRY_NAME#45,count#46L] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/home/andras/ipython_spark/spark_training_baseline/data/flights.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<ORIGIN_COUNTRY_NAME:string,count:bigint>




In [49]:
# 1. How to read an (unstructured) text file into an RDD
# textFile yields one string per line of the file.
rdd = sc.textFile('spark_training_baseline/data/flights.csv')
# Upper-case each line, then split it on commas into a list of string fields.
# NOTE: the CSV header line is not skipped, so it becomes the first data row,
# and every field (including the count) stays a string.
rdd2 = rdd.map(lambda line: line.upper().split(','))
# A bare `rdd2.take(10)` used to sit here; as a non-final expression its result
# was neither displayed nor stored, so it only triggered an extra Spark job.

# 2. How to convert an RDD to a DataFrame
df_from_rdd = rdd2.toDF(['col1', 'col2', 'col3'])
df_from_rdd.show()


+--------------------+-------------------+-----+
|                col1|               col2| col3|
+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|COUNT|
|       UNITED STATES|            ROMANIA|   15|
|       UNITED STATES|            CROATIA|    1|
|       UNITED STATES|            IRELAND|  344|
|               EGYPT|      UNITED STATES|   15|
|       UNITED STATES|              INDIA|   62|
|       UNITED STATES|          SINGAPORE|    1|
|       UNITED STATES|            GRENADA|   62|
|          COSTA RICA|      UNITED STATES|  588|
|             SENEGAL|      UNITED STATES|   40|
|             MOLDOVA|      UNITED STATES|    1|
|       UNITED STATES|       SINT MAARTEN|  325|
|       UNITED STATES|   MARSHALL ISLANDS|   39|
|              GUYANA|      UNITED STATES|   64|
|               MALTA|      UNITED STATES|    1|
|            ANGUILLA|      UNITED STATES|   41|
|             BOLIVIA|      UNITED STATES|   30|
|       UNITED STATE