In [115]:
from pyspark import SparkContext 
sc = SparkContext.getOrCreate()

In [116]:
business = sc.textFile('../Data/filtered_registered_business_sf.csv').map(lambda x: x.split(','))

In [117]:
business.first()

['94123', 'Tournahu George L', '3301 Broderick St', 'San Francisco', 'CA']

In [118]:
from pyspark.sql import SparkSession
ss = SparkSession.builder.getOrCreate()

In [119]:
# Step 2. (option 1) Convert RDD to DataFrame using .toDF()

In [120]:
business_df = business.toDF()

In [121]:
business_df.show(5)

+-----+--------------------+--------------------+-------------+---+
|   _1|                  _2|                  _3|           _4| _5|
+-----+--------------------+--------------------+-------------+---+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco| CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco| CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco| CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco| CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco| CA|
+-----+--------------------+--------------------+-------------+---+
only showing top 5 rows



In [122]:
business_df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: string (nullable = true)
 |-- _4: string (nullable = true)
 |-- _5: string (nullable = true)




### Step 2. (option 2) Add column names using a list of strings when creating a dataframe


In [123]:
business_df = business.toDF(['zip', 'name', 'street', 'city', 'state' ])

In [124]:
business_df.show(5)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows




### Extra - create a data frame with name and zip code only.


In [125]:
business_df_2 = business.map(lambda x: (x[1], x[0])).toDF([ 'name','zip'])
# toDF(['zip', 'name', 'street', 'city', 'state' ])
business_df_2.show(5)

+--------------------+-----+
|                name|  zip|
+--------------------+-----+
|   Tournahu George L|94123|
|Stephens Institut...|94124|
|Stephens Institut...|94105|
|Stephens Institut...|94108|
|Stephens Institut...|94107|
+--------------------+-----+
only showing top 5 rows



# Create an RDD using filtered_registered_business_sf.csv seperated by commas 1. Using createDataFrame(), create a dataframe

In [126]:
business_df3 = ss.createDataFrame(business)

In [127]:
business_df3.show(5)

+-----+--------------------+--------------------+-------------+---+
|   _1|                  _2|                  _3|           _4| _5|
+-----+--------------------+--------------------+-------------+---+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco| CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco| CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco| CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco| CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco| CA|
+-----+--------------------+--------------------+-------------+---+
only showing top 5 rows



In [128]:
business_df3.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: string (nullable = true)
 |-- _4: string (nullable = true)
 |-- _5: string (nullable = true)



# 2. Adding Schema

In [129]:
from pyspark.sql.types import *

In [130]:
schema = StructType([StructField('zip', LongType(), True),
                    StructField('name', StringType(), False),
                    StructField("street", StringType(), True),
                    StructField("city", StringType(), True),
                    StructField("state", StringType(), True)
                    ])

In [131]:
schema

StructType(List(StructField(zip,LongType,true),StructField(name,StringType,false),StructField(street,StringType,true),StructField(city,StringType,true),StructField(state,StringType,true)))

In [132]:
business_df_3 = ss.createDataFrame(business, schema)

In [133]:
business_df_3.printSchema()

root
 |-- zip: long (nullable = true)
 |-- name: string (nullable = false)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)



In [134]:
def IntegerSafe(value):
    try:
        return int(value)
    except ValueError:
        return None 
           

In [135]:
business = business.map(lambda x: (IntegerSafe(x[0]), x[1], x[2], x[3], x[4]))

In [136]:
business_df_3 = ss.createDataFrame(business, schema)

In [137]:
business_df_3.show(5)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows



# .select() - Choose name and zip

In [138]:
business_df_3.select('name', 'zip').show(5)

+--------------------+-----+
|                name|  zip|
+--------------------+-----+
|   Tournahu George L|94123|
|Stephens Institut...|94124|
|Stephens Institut...|94105|
|Stephens Institut...|94108|
|Stephens Institut...|94107|
+--------------------+-----+
only showing top 5 rows



In [139]:
business_df_3.select(business_df_3['name'], business_df_3['zip']).show(5)

+--------------------+-----+
|                name|  zip|
+--------------------+-----+
|   Tournahu George L|94123|
|Stephens Institut...|94124|
|Stephens Institut...|94105|
|Stephens Institut...|94108|
|Stephens Institut...|94107|
+--------------------+-----+
only showing top 5 rows




# .drop() - select except for state and city

In [140]:
business_df_3.drop('state', 'city').show(5) # .drop doesn't delete the dropped columns from the original rdd 

+-----+--------------------+--------------------+
|  zip|                name|              street|
+-----+--------------------+--------------------+
|94123|   Tournahu George L|   3301 Broderick St|
|94124|Stephens Institut...|    2225 Jerrold Ave|
|94105|Stephens Institut...|180 New Montgomer...|
|94108|Stephens Institut...|       540 Powell St|
|94107|Stephens Institut...|     460 Townsend St|
+-----+--------------------+--------------------+
only showing top 5 rows




# .filter() and .where() - Filter where city is San Francisco but state is not CA

In [141]:
business_df_3.where("city == 'San Francisco' and state != 'CA'").show(10)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94134|         Kugay Faruk|301 Executive Par...|San Francisco|     |
|94114|     Crystal Way Inc|      2335 Market St|San Francisco|     |
|94109|          Protopopov|        1190 Pine St|San Francisco|   CO|
|94134|301 Rolph Street ...|       65 Leland Ave|San Francisco|     |
|94127|Nu Greek Wine Co Inc|     225 Moncada Way|San Francisco|   SC|
|94124|Calvary Hill Soci...|   141 Industrial St|San Francisco|     |
|94118|       Leung Clifton|         320 6th Ave|San Francisco|     |
|94109| Chun Jimmy & Shuk Y|  1529 Sacramento St|San Francisco|     |
|94109| Chun Jimmy & Shuk Y|  1354 Sacramento St|San Francisco|     |
|94118|        Leung Olivia|         320 6th Ave|San Francisco|     |
+-----+--------------------+--------------------+-------------+-----+
only showing top 10 


# .withColumnRenamed(existing_col_name, new_col_name) - Change zip to zip code

In [142]:
business_df_3.withColumnRenamed('zip', 'Zip Code').show(5)

+--------+--------------------+--------------------+-------------+-----+
|Zip Code|                name|              street|         city|state|
+--------+--------------------+--------------------+-------------+-----+
|   94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|
|   94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|
|   94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|
|   94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|
|   94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|
+--------+--------------------+--------------------+-------------+-----+
only showing top 5 rows



# .withColumn(columnName, columnExpression) - Add the "odd_zip" column to see whether zip is an odd number

In [190]:
business_df_3.withColumn('odd_zip', (business_df_3['zip'] % 2 ==1)).show(5)

+-----+--------------------+--------------------+-------------+-----+-------+
|  zip|                name|              street|         city|state|odd_zip|
+-----+--------------------+--------------------+-------------+-----+-------+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|   true|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|  false|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|   true|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|  false|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|   true|
+-----+--------------------+--------------------+-------------+-----+-------+
only showing top 5 rows



# .orderBy(columns, ascending = True) - Order it by name in ascending/descending order

In [154]:
business_df_3.orderBy('name').show(5)

+-----+-----------------+------+--------------------+-----------------+
|  zip|             name|street|                city|            state|
+-----+-----------------+------+--------------------+-----------------+
|94103|               "1|     2|3 Express Moving ...|31 Duboce Ave 31a|
|94105|      "1-2-3 Deli| Inc."|      123 Mission St|    San Francisco|
|94105|"1055 Pine Street|  Llc"|79 New Montgomery St|    San Francisco|
|94105|"1069 Pine Street|  Llc"|79 New Montgomery St|    San Francisco|
|94109|"1080 Bush Street|  Llc"|        1080 Bush St|    San Francisco|
+-----+-----------------+------+--------------------+-----------------+
only showing top 5 rows



In [155]:
business_df_3.orderBy('name', ascending=False).show(5)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94521|     Zzr Enterprises|5520 Pennsylvania...|      Concord|   CA|
|94121|Zzgor Entertainme...|     447 22nd Ave #3|San+francisco|   CA|
|94104|         Zyzzyva Inc|      57 Post St 604|San Francisco|   CA|
|94118|         Zyzzyva Inc|       44 Almaden Ct|San Francisco|   CA|
|53226|          Zywave Inc|10700 W Research ...|    Milwaukee|   WI|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows



In [156]:
# .orderBy(columns, ascending = True) - Order it by name in ascending and zip descending order

In [159]:
business_df_3.orderBy(['zip', 'name'], ascending=[True, False]).show(5)

+----+--------------------+--------------------+------------------+---------+
| zip|                name|              street|              city|    state|
+----+--------------------+--------------------+------------------+---------+
|null|         Woo H Woo S|     155-57 Clara St|     San Francisco|       CA|
|null|Willis Supply Cor...|     1149 Pioneer Rd|       "Burlington|+ontario"|
|null|Vip Plumbing And ...|       4020 Payne Rd|                  |         |
|null|     Vieira Reynaldo|  3584 San Bruno Ave|     San Francisco|       CA|
|null|Viavid Broadcasti...|998 Harbourside D...|North+vancouver+bc|         |
+----+--------------------+--------------------+------------------+---------+
only showing top 5 rows



# Create a dataframe using supervisor_sf.csv and where zip and id are both non-nullable and integertypes.

In [180]:
supervisor = sc.textFile("../Data/supervisor_sf.csv")\
            .map(lambda x: x.split(',')).map(lambda x: (IntegerSafe(x[0]), IntegerSafe(x[1])))

In [181]:
supervisor.first()

(94102, 8)

In [183]:
schema = StructType([StructField('zip', IntegerType(), False),
                        StructField('id', IntegerType(), False)])

In [184]:
supervisor_df = ss.createDataFrame(supervisor, schema)

In [185]:
supervisor_df.show(5)

+-----+---+
|  zip| id|
+-----+---+
|94102|  8|
|94102|  6|
|94102|  3|
|94102|  5|
|94103|  8|
+-----+---+
only showing top 5 rows



# Find supervisor ids for zipcode, 94123

In [186]:
supervisor_df.filter('zip == 94123').select('id').show(5)

+---+
| id|
+---+
|  2|
+---+

