In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

from datetime import datetime
def toIntSafe(inval):
    """Convert ``inval`` to an ``int``, returning ``None`` when that is impossible.

    Args:
        inval: Value to convert; in this notebook a raw CSV field (string).

    Returns:
        The integer value of ``inval``, or ``None`` if it cannot be converted
        (e.g. an empty string, non-numeric text, ``None``, or an infinite float).
    """
    try:
        return int(inval)
    except (ValueError, TypeError, OverflowError):
        # ValueError: malformed text such as "" or "abc".
        # TypeError: objects int() does not accept at all, such as None.
        # OverflowError: infinite floats.
        return None

def toTimeSafe(inval):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` timestamp, returning ``None`` on failure.

    Args:
        inval: Timestamp text, optionally wrapped in double quotation marks.

    Returns:
        A ``datetime`` for a well-formed timestamp, or ``None`` when ``inval``
        does not match the expected format or is not a string at all
        (e.g. ``None``).
    """
    try:
        # Timestamps start and end with a double quotation mark; drop them first.
        # The strip happens inside the try so non-string input is handled too.
        return datetime.strptime(inval.strip("\""), "%Y-%m-%d %H:%M:%S")
    except (ValueError, AttributeError, TypeError):
        # ValueError: text that does not match the format.
        # AttributeError / TypeError: input that is not a str (None, int, bytes...).
        return None

# Obtain a Spark session (reusing an existing one if present) and its context.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

# Read the CSV as lines of text, split every line on commas and convert the
# first four fields to (station_id, bikes available, docks available, timestamp).
# NOTE: a plain split(",") does not understand commas inside quoted fields.
status = (
    sc.textFile("../Data/bike_share/status_million.csv")
      .map(lambda line: line.split(","))
      .map(lambda fields: (int(fields[0]),
                           toIntSafe(fields[1]),
                           toIntSafe(fields[2]),
                           toTimeSafe(fields[3])))
)

# (column name, Spark type, nullable). Only station_id is declared non-nullable;
# the other columns may hold None when toIntSafe / toTimeSafe fail to parse.
_status_columns = [
    ("station_id", IntegerType(), False),
    ("num_bikes_available", IntegerType(), True),
    ("num_docks_available", IntegerType(), True),
    ("timestamp", TimestampType(), True),
]
schema = StructType([StructField(col_name, col_type, nullable)
                     for col_name, col_type, nullable in _status_columns])

status_df = ss.createDataFrame(status, schema)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/17 04:48:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Calculate the min, max, and average num_bikes_available per station_id.


In [2]:
# Show the minimum, average and maximum of num_bikes_available per station,
# one table per statistic, all derived from the same grouping.
bikes_col = 'num_bikes_available'
by_station = status_df.groupBy('station_id')

by_station.min(bikes_col).show()
by_station.avg(bikes_col).show()
by_station.max(bikes_col).show()

                                                                                

+----------+------------------------+
|station_id|min(num_bikes_available)|
+----------+------------------------+
|        10|                       0|
|        11|                       0|
+----------+------------------------+



                                                                                

+----------+------------------------+
|station_id|avg(num_bikes_available)|
+----------+------------------------+
|        10|       5.931166125246599|
|        11|       7.768450198057421|
+----------+------------------------+





+----------+------------------------+
|station_id|max(num_bikes_available)|
+----------+------------------------+
|        10|                      15|
|        11|                      19|
+----------+------------------------+



                                                                                

In [3]:
# Same three statistics as above, computed in a single aggregation so they
# appear side by side in one table.
# min / avg / max here are the Spark column functions brought in by
# `from pyspark.sql.functions import *`, not the Python builtins.
bikes_available = 'num_bikes_available'
station_stats = status_df.groupBy('station_id').agg(
    min(bikes_available),
    avg(bikes_available),
    max(bikes_available),
)
station_stats.show()



+----------+------------------------+------------------------+------------------------+
|station_id|min(num_bikes_available)|avg(num_bikes_available)|max(num_bikes_available)|
+----------+------------------------+------------------------+------------------------+
|        10|                       0|       5.931166125246599|                      15|
|        11|                       0|       7.768450198057421|                      19|
+----------+------------------------+------------------------+------------------------+



                                                                                

In [4]:
# Analysis finished: stop the Spark session obtained in the first cell.
ss.stop()