In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

def IntegerSafe(value):
    """Convert ``value`` to ``int``, returning ``None`` when it cannot be converted.

    Used while parsing CSV fields, where a column that should hold an integer
    may contain arbitrary text.

    Args:
        value: Anything accepted by ``int()`` (e.g. a numeric string or a number).

    Returns:
        The integer value, or ``None`` if ``int(value)`` raises a conversion
        error.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text; TypeError: None or an unsupported type;
        # OverflowError: float infinity. Anything else (e.g. KeyboardInterrupt)
        # is deliberately allowed to propagate instead of being swallowed.
        return None

# Obtain the Spark session (creating it if none exists) and its SparkContext.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

# Load the CSV as plain text, split every line on commas, and keep the first
# five fields; the first field is converted with IntegerSafe so that
# non-integer values become None.
raw_lines = sc.textFile("../Data/SF_business/filtered_registered_business_sf.csv")
split_rows = raw_lines.map(lambda line : line.split(','))
business = split_rows.map(
    lambda fields : (IntegerSafe(fields[0]), fields[1], fields[2], fields[3], fields[4])
)
              

# Column layout of the business records as (name, Spark type, nullable).
schema = StructType([
    StructField(column_name, column_type, nullable)
    for column_name, column_type, nullable in (
        ("zip", IntegerType(), True),
        ("name", StringType(), False),
        ("street", StringType(), True),
        ("city", StringType(), True),
        ("state", StringType(), True),
    )
])

# Attach the schema to the parsed RDD to obtain a DataFrame.
business_df = ss.createDataFrame(business, schema)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/17 04:40:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Preview the first five rows of the DataFrame.
business_df.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows



                                                                                

## Print the 5 zip codes with the most businesses

In [3]:
# Count businesses per zip code, then list the five largest counts first.
zip_counts = business_df.groupBy("zip").count()
zip_counts_desc = zip_counts.orderBy("count", ascending=False)
zip_counts_desc.show(5)

[Stage 1:>                                                          (0 + 2) / 2]

+-----+-----+
|  zip|count|
+-----+-----+
|94110|12459|
|94103|10919|
|94109| 9623|
|94107| 9394|
|94102| 7962|
+-----+-----+
only showing top 5 rows



                                                                                

## Create a column named "OnHoward" indicating whether the business is on Howard Street

In [4]:
# Boolean column: True when the street text contains the substring 'Howard'.
is_on_howard = business_df['street'].contains('Howard')

# Add the flag as a new column, keep only the flagged rows, and show five.
flagged_df = business_df.withColumn('OnHoward', is_on_howard)
flagged_df.filter("OnHoward == True").show(5)

+-----+--------------------+--------------+-------------+-----+--------+
|  zip|                name|        street|         city|state|OnHoward|
+-----+--------------------+--------------+-------------+-----+--------+
|94105|Stephens Institut...| 631 Howard St|San Francisco|   CA|    true|
|94103|Anderson Enterpri...|1525 Howard St|San Francisco|   CA|    true|
|94103|Avis Rent A Car S...| 821 Howard St|San Francisco|   CA|    true|
|94103|German Motors Cor...|1675 Howard St|San Francisco|   CA|    true|
|94103|German Motors Cor...|1675 Howard St|San Francisco|   CA|    true|
+-----+--------------------+--------------+-------------+-----+--------+
only showing top 5 rows



In [5]:
# Same selection without the helper column: filter directly on the predicate.
street_has_howard = business_df['street'].contains('Howard')
business_df.filter(street_has_howard).show(5)

+-----+--------------------+--------------+-------------+-----+
|  zip|                name|        street|         city|state|
+-----+--------------------+--------------+-------------+-----+
|94105|Stephens Institut...| 631 Howard St|San Francisco|   CA|
|94103|Anderson Enterpri...|1525 Howard St|San Francisco|   CA|
|94103|Avis Rent A Car S...| 821 Howard St|San Francisco|   CA|
|94103|German Motors Cor...|1675 Howard St|San Francisco|   CA|
|94103|German Motors Cor...|1675 Howard St|San Francisco|   CA|
+-----+--------------------+--------------+-------------+-----+
only showing top 5 rows



In [6]:
# Stop the Spark session now that the notebook's queries have run.
ss.stop()