In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import asc, desc


In [32]:
input_file = "../Data/FoodTruckSF_Narrow.csv"

In [18]:
schema = StructType([ StructField("DayOrder", IntegerType(), True),
                      StructField("DayOfWeek", StringType(), False),
                      StructField("Starttime", StringType(), True),
                      StructField("Endtime", StringType(), True),
                      StructField("Permit", StringType(), False),
                      StructField("LocationID", StringType(), True),
                      StructField("Start_24", StringType(), True),
                      StructField("End_24", StringType(), True),
                      StructField("Lat", FloatType(), True),
                      StructField("Lon", FloatType(), True)
                    ])


In [19]:
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

foodtruck_rdd = sc.textFile(input_file)\
                  .map(lambda x : x.split(','))\
                  .map(lambda x : [int(x[0]), x[1], x[2], x[3], x[4], x[5], x[6], x[7], float(x[8]), float(x[9])])

foodtruck = ss.createDataFrame(foodtruck_rdd, schema = schema)


root
 |-- DayOrder: integer (nullable = true)
 |-- DayOfWeek: string (nullable = false)
 |-- Starttime: string (nullable = true)
 |-- Endtime: string (nullable = true)
 |-- Permit: string (nullable = false)
 |-- LocationID: string (nullable = true)
 |-- Start_24: string (nullable = true)
 |-- End_24: string (nullable = true)
 |-- Lat: float (nullable = true)
 |-- Lon: float (nullable = true)



## Print the schema of the data frame

In [22]:
foodtruck.printSchema()

root
 |-- DayOrder: integer (nullable = true)
 |-- DayOfWeek: string (nullable = false)
 |-- Starttime: string (nullable = true)
 |-- Endtime: string (nullable = true)
 |-- Permit: string (nullable = false)
 |-- LocationID: string (nullable = true)
 |-- Start_24: string (nullable = true)
 |-- End_24: string (nullable = true)
 |-- Lat: float (nullable = true)
 |-- Lon: float (nullable = true)



### print the number of unique lines.

In [23]:
foodtruck.distinct().count()

3141

## print 10 records which opens earliest but closes latest on Sunday (DayOrder : 0).

In [33]:
foodtruck.filter("DayOrder == 0").orderBy([foodtruck["Starttime"], foodtruck["Endtime"]], ascending = [True, False])\
                                .show(10)

+--------+---------+---------+-------+----------+----------+--------+------+---------+-----------+
|DayOrder|DayOfWeek|Starttime|Endtime|    Permit|LocationID|Start_24|End_24|      Lat|        Lon|
+--------+---------+---------+-------+----------+----------+--------+------+---------+-----------+
|       0|   Sunday|     10AM|    9PM|13MFF-0112|    453012|   10:00| 21:00|37.730232|-122.402756|
|       0|   Sunday|     10AM|    9PM|18MFF-0011|   1122750|   10:00| 21:00| 37.77591| -122.39006|
|       0|   Sunday|     10AM|    8PM|11MFF-0175|    334914|   10:00| 20:00|37.783947|-122.408005|
|       0|   Sunday|     10AM|    8PM|17MFF-0177|    959619|   10:00| 20:00|37.776405| -122.39411|
|       0|   Sunday|     10AM|    8PM|18MFF-0104|   1219114|   10:00| 20:00| 37.79126| -122.39866|
|       0|   Sunday|     10AM|    7PM|18MFF-0005|   1090081|   10:00| 19:00|37.769344| -122.41333|
|       0|   Sunday|     10AM|    6PM|18MFF-0064|   1186247|   10:00| 18:00|37.784527| -122.40727|
|       0|