In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

def IntegerSafe(value): # In case there are non-integer type to be converted.
    try:
        return int(value)
    except:
        return None

ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

business = sc.textFile("../Data/SF_business/filtered_registered_business_sf.csv")\
             .map(lambda x : x.split(','))\
             .map(lambda x : (IntegerSafe(x[0]), x[1], x[2], x[3], x[4]))
              

schema = StructType([ StructField("zip", IntegerType(), True),
                      StructField("name", StringType(), False),
                      StructField("street", StringType(), True),
                      StructField("city", StringType(), True),
                      StructField("state", StringType(), True)
                    ])

business_df = ss.createDataFrame(business, schema)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/15 11:52:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/15 11:52:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/02/15 11:52:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## .select() - Choose name and zip

In [2]:
business_df.select("name", "zip").show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+-----+
|                name|  zip|
+--------------------+-----+
|   Tournahu George L|94123|
|Stephens Institut...|94124|
|Stephens Institut...|94105|
|Stephens Institut...|94108|
|Stephens Institut...|94107|
+--------------------+-----+
only showing top 5 rows



                                                                                

In [3]:
business_df.select(business_df["name"], business_df["zip"]).show(5)

+--------------------+-----+
|                name|  zip|
+--------------------+-----+
|   Tournahu George L|94123|
|Stephens Institut...|94124|
|Stephens Institut...|94105|
|Stephens Institut...|94108|
|Stephens Institut...|94107|
+--------------------+-----+
only showing top 5 rows



#### does the original dataframe change?

In [4]:
business_df.show(10)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|
|94102|Stephens Institut...|       620 Sutter St|San Francisco|   CA|
|94102|Stephens Institut...|       655 Sutter St|San Francisco|   CA|
|94109|Stephens Institut...|        1055 Pine St|San Francisco|   CA|
|94107|Stephens Institut...|    121 Wisconsin St|San Francisco|   CA|
+-----+--------------------+--------------------+-------------+-----+
only showing top 10 

## .drop() - select except for state and city

In [5]:
business_df.drop("state").show(5)

+-----+--------------------+--------------------+-------------+
|  zip|                name|              street|         city|
+-----+--------------------+--------------------+-------------+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|
|94108|Stephens Institut...|       540 Powell St|San Francisco|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|
+-----+--------------------+--------------------+-------------+
only showing top 5 rows



In [4]:
business_df.drop("state","city").show(5)

+-----+--------------------+--------------------+
|  zip|                name|              street|
+-----+--------------------+--------------------+
|94123|   Tournahu George L|   3301 Broderick St|
|94124|Stephens Institut...|    2225 Jerrold Ave|
|94105|Stephens Institut...|180 New Montgomer...|
|94108|Stephens Institut...|       540 Powell St|
|94107|Stephens Institut...|     460 Townsend St|
+-----+--------------------+--------------------+
only showing top 5 rows



In [7]:
business_df.show()

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|
|94102|Stephens Institut...|       620 Sutter St|San Francisco|   CA|
|94102|Stephens Institut...|       655 Sutter St|San Francisco|   CA|
|94109|Stephens Institut...|        1055 Pine St|San Francisco|   CA|
|94107|Stephens Institut...|    121 Wisconsin St|San Francisco|   CA|
|94102|Stephens Institut...|        150 Hayes St|San Francisco|   CA|
|94133|Stephens Inst

## .filter() and .where() - Filter where city is San Francisco but state is not CA

In [8]:
business_df.filter("zip == 94123").show(5)

+-----+--------------------+-----------------+-------------+-----+
|  zip|                name|           street|         city|state|
+-----+--------------------+-----------------+-------------+-----+
|94123|   Tournahu George L|3301 Broderick St|San Francisco|   CA|
|94123|        Amore Robert|    1958 Union St|San Francisco|   CA|
|94123|Aunt Anns Corp He...|    2722 Gough St|San Francisco|   CA|
|94123|Barbagelata & Co Inc| 2381 Chestnut St|San Francisco|   CA|
|94123|Boas Internationa...|  2098 Lombard St|San Francisco|   CA|
+-----+--------------------+-----------------+-------------+-----+
only showing top 5 rows



In [6]:
business_df.filter("city == 'San Francisco' and state != 'CA'").show(10)

[Stage 5:>                                                          (0 + 1) / 1]                                                                                

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94134|         Kugay Faruk|301 Executive Par...|San Francisco|     |
|94114|     Crystal Way Inc|      2335 Market St|San Francisco|     |
|94109|          Protopopov|        1190 Pine St|San Francisco|   CO|
|94134|301 Rolph Street ...|       65 Leland Ave|San Francisco|     |
|94127|Nu Greek Wine Co Inc|     225 Moncada Way|San Francisco|   SC|
|94124|Calvary Hill Soci...|   141 Industrial St|San Francisco|     |
|94118|       Leung Clifton|         320 6th Ave|San Francisco|     |
|94109| Chun Jimmy & Shuk Y|  1529 Sacramento St|San Francisco|     |
|94109| Chun Jimmy & Shuk Y|  1354 Sacramento St|San Francisco|     |
|94118|        Leung Olivia|         320 6th Ave|San Francisco|     |
+-----+--------------------+--------------------+-------------+-----+
only showing top 10 

## .withColumnRenamed(existing_col_name, new_col_name)  - Change zip to zip code

In [10]:
business_df.withColumnRenamed('zip','zip code').show(5)

+--------+--------------------+--------------------+-------------+-----+
|zip code|                name|              street|         city|state|
+--------+--------------------+--------------------+-------------+-----+
|   94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|
|   94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|
|   94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|
|   94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|
|   94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|
+--------+--------------------+--------------------+-------------+-----+
only showing top 5 rows



## .withColumn(columnName, columnExpression)  - Add the "odd_zip" column to see whether zip is an odd number

In [11]:
business_df.withColumn('odd_zip', business_df['zip'] % 2 ).show(5)

+-----+--------------------+--------------------+-------------+-----+-------+
|  zip|                name|              street|         city|state|odd_zip|
+-----+--------------------+--------------------+-------------+-----+-------+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|      1|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|      0|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|      1|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|      0|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|      1|
+-----+--------------------+--------------------+-------------+-----+-------+
only showing top 5 rows



In [12]:
business_df.withColumn('odd_zip', business_df['zip'] % 2 == 1 ).show(5)

+-----+--------------------+--------------------+-------------+-----+-------+
|  zip|                name|              street|         city|state|odd_zip|
+-----+--------------------+--------------------+-------------+-----+-------+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|   true|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|  false|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|   true|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|  false|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|   true|
+-----+--------------------+--------------------+-------------+-----+-------+
only showing top 5 rows



## .orderBy(columns, ascending = True) - Order it by name in ascending/descending order

In [13]:
business_df.orderBy("name").show(5)

[Stage 12:>                                                         (0 + 2) / 2]

+-----+-----------------+------+--------------------+-----------------+
|  zip|             name|street|                city|            state|
+-----+-----------------+------+--------------------+-----------------+
|94103|               "1|     2|3 Express Moving ...|31 Duboce Ave 31a|
|94105|      "1-2-3 Deli| Inc."|      123 Mission St|    San Francisco|
|94105|"1055 Pine Street|  Llc"|79 New Montgomery St|    San Francisco|
|94105|"1069 Pine Street|  Llc"|79 New Montgomery St|    San Francisco|
|94109|"1080 Bush Street|  Llc"|        1080 Bush St|    San Francisco|
+-----+-----------------+------+--------------------+-----------------+
only showing top 5 rows



                                                                                

In [14]:
business_df.orderBy("name", ascending = False).show(5)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94521|     Zzr Enterprises|5520 Pennsylvania...|      Concord|   CA|
|94121|Zzgor Entertainme...|     447 22nd Ave #3|San+francisco|   CA|
|94104|         Zyzzyva Inc|      57 Post St 604|San Francisco|   CA|
|94118|         Zyzzyva Inc|       44 Almaden Ct|San Francisco|   CA|
|53226|          Zywave Inc|10700 W Research ...|    Milwaukee|   WI|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows



[Stage 13:>                                                         (0 + 2) / 2]                                                                                

In [15]:
business_df.sort("name").show(5)

[Stage 14:>                                                         (0 + 2) / 2]

+-----+-----------------+------+--------------------+-----------------+
|  zip|             name|street|                city|            state|
+-----+-----------------+------+--------------------+-----------------+
|94103|               "1|     2|3 Express Moving ...|31 Duboce Ave 31a|
|94105|      "1-2-3 Deli| Inc."|      123 Mission St|    San Francisco|
|94105|"1055 Pine Street|  Llc"|79 New Montgomery St|    San Francisco|
|94105|"1069 Pine Street|  Llc"|79 New Montgomery St|    San Francisco|
|94109|"1080 Bush Street|  Llc"|        1080 Bush St|    San Francisco|
+-----+-----------------+------+--------------------+-----------------+
only showing top 5 rows



                                                                                

In [16]:
ss.stop()