In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

def IntegerSafe(value):
    """Convert ``value`` to an ``int``, or return ``None`` if it cannot be converted.

    Used while parsing CSV fields so that non-integer values (for example a
    header cell or an empty string) become ``None`` instead of aborting the job.

    Args:
        value: Any object accepted by ``int()``; typically a string field.

    Returns:
        The integer value, or ``None`` when ``int(value)`` raises a
        conversion error.
    """
    try:
        return int(value)
    # Catch only conversion failures; a bare ``except`` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated bugs.
    except (TypeError, ValueError, OverflowError):
        return None

# Obtain a Spark session (getOrCreate reuses an existing one if present) and
# its SparkContext, which is used for the RDD-based file loading below.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

# Business records: split each line on commas and keep the first five fields.
# The first field (zip) goes through IntegerSafe, so unparseable values
# become None.
# NOTE(review): a plain split(',') does not honor quoted fields that contain
# commas -- confirm the input file has none.
business = (
    sc.textFile("../Data/SF_business/filtered_registered_business_sf.csv")
    .map(lambda line: line.split(','))
    .map(lambda cols: (IntegerSafe(cols[0]), cols[1], cols[2], cols[3], cols[4]))
)

# Supervisor records: two integer fields per line (zip, supervisor id).
supervisor = (
    sc.textFile("../Data/SF_business/supervisor_sf.csv")
    .map(lambda line: line.split(","))
    .map(lambda cols: (IntegerSafe(cols[0]), IntegerSafe(cols[1])))
)

# Column layout for the business tuples; only "name" is declared non-nullable.
business_schema = StructType([
    StructField("zip", IntegerType(), True),
    StructField("name", StringType(), False),
    StructField("street", StringType(), True),
    StructField("city", StringType(), True),
    StructField("state", StringType(), True),
])

# Column layout for the supervisor tuples; both fields are declared non-nullable.
# NOTE(review): IntegerSafe can yield None, which conflicts with nullable=False
# -- confirm the file contains only integer values.
supervisor_schema = StructType([
    StructField("zip", IntegerType(), False),
    StructField("id", IntegerType(), False),
])

# Turn the RDDs of tuples into DataFrames with the explicit schemas above.
business_df = ss.createDataFrame(business, business_schema)
supervisor_df = ss.createDataFrame(supervisor, supervisor_schema)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/17 05:11:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/17 05:11:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## inner joins

In [2]:
# Join on the shared column name with the default join type; passing the
# name (rather than an expression) yields a single "zip" column in the result.
business_df.join(supervisor_df, on='zip').show(5)



+-----+--------------------+--------------------+-------------+-----+---+
|  zip|                name|              street|         city|state| id|
+-----+--------------------+--------------------+-------------+-----+---+
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|  2|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|  6|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|  3|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|  5|
|94109|Stephens Institut...|        1055 Pine St|San Francisco|   CA|  2|
+-----+--------------------+--------------------+-------------+-----+---+
only showing top 5 rows



                                                                                

In [3]:
# Same join expressed as a column-equality condition; with an expression
# both sides' "zip" columns are kept, so the result shows "zip" twice.
business_df.join(
    supervisor_df,
    on=business_df['zip'] == supervisor_df['zip'],
).show(5)

[Stage 5:>                                                          (0 + 2) / 2]

+-----+--------------------+--------------------+-------------+-----+-----+---+
|  zip|                name|              street|         city|state|  zip| id|
+-----+--------------------+--------------------+-------------+-----+-----+---+
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|94109|  2|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|94109|  6|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|94109|  3|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|94109|  5|
|94109|Stephens Institut...|        1055 Pine St|San Francisco|   CA|94109|  2|
+-----+--------------------+--------------------+-------------+-----+-----+---+
only showing top 5 rows





In [4]:
# Explicitly request an inner join: only zips present in both frames survive.
business_df.join(supervisor_df, on='zip', how='inner').show(5)

[Stage 10:>                                                         (0 + 2) / 2]

+-----+--------------------+--------------------+-------------+-----+---+
|  zip|                name|              street|         city|state| id|
+-----+--------------------+--------------------+-------------+-----+---+
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|  2|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|  6|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|  3|
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|  5|
|94109|Stephens Institut...|        1055 Pine St|San Francisco|   CA|  2|
+-----+--------------------+--------------------+-------------+-----+---+
only showing top 5 rows



                                                                                

## outer joins

In [5]:
# Left outer join: every business row is kept; businesses whose zip has no
# supervisor entry get a null "id".
business_df.join(supervisor_df, on='zip', how='left_outer').show(5)

+-----+--------------------+--------------------+-------------+-----+----+
|  zip|                name|              street|         city|state|  id|
+-----+--------------------+--------------------+-------------+-----+----+
| 6002|      Integralis Inc|310 West Newberry Rd|   Bloomfield|   CT|null|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|   6|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|   3|
|94105| Barney & Barney Llc|1 Market St Steua...|San Francisco|   CA|   6|
|94105| Barney & Barney Llc|1 Market St Steua...|San Francisco|   CA|   3|
+-----+--------------------+--------------------+-------------+-----+----+
only showing top 5 rows



In [6]:
# Right outer join: every supervisor row is kept; business columns are null
# for zips that have no matching business.
business_df.join(supervisor_df, on='zip', how='right_outer').show(5)

[Stage 20:>                                                         (0 + 2) / 2]

+-----+--------------------+------------------+-------------+-----+---+
|  zip|                name|            street|         city|state| id|
+-----+--------------------+------------------+-------------+-----+---+
|94115|Okamura Ricky & R...|  1747 Buchanan St|San Francisco|   CA|  5|
|94115| Brooks Brothers Inc|  2223 Fillmore St|San Francisco|   CA|  5|
|94115|      Cournale Clyde|1755 Mcallister St|San Francisco|   CA|  5|
|94115|          Jo Pos Inc|  2200 Fillmore St|San Francisco|   CA|  5|
|94115|Del Camp Investme...|1544 Mcallister St|San Francisco|   CA|  5|
+-----+--------------------+------------------+-------------+-----+---+
only showing top 5 rows



                                                                                

## left semi join (`leftsemi`)

In [7]:
# Left semi join: keeps business rows whose zip appears in supervisor_df,
# returning only the business columns.
business_df.join(supervisor_df, on='zip', how='leftsemi').show(5)

[Stage 25:>                                                         (0 + 2) / 2]

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94109|Stephens Institut...|1835-49 Van Ness Ave|San Francisco|   CA|
|94109|Stephens Institut...|        1055 Pine St|San Francisco|   CA|
|94109|     Alioto F Co Inc|    440 Jefferson St|San Francisco|   CA|
|94109|     Haines Robert D|   786-792 Sutter St|San Francisco|   CA|
|94109|Avis Rent A Car S...|         675 Post St|San Francisco|   CA|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows



                                                                                

In [8]:
# Shut down the Spark session created in the first cell; cells that use
# `ss`, `sc` or the DataFrames must be re-run from the top afterwards.
ss.stop()