In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

def IntegerSafe(value):
    """Convert ``value`` to ``int``, returning ``None`` when it cannot be converted.

    Used while parsing CSV fields, where a field may hold text that is not an
    integer.

    Args:
        value: Anything accepted by the built-in ``int()`` (string, number, ...).

    Returns:
        The integer value, or ``None`` if ``int(value)`` raises ``TypeError``
        (e.g. ``None``), ``ValueError`` (e.g. ``"abc"`` or ``""``) or
        ``OverflowError`` (e.g. ``float("inf")``).
    """
    try:
        return int(value)
    # Catch only conversion failures. A bare ``except:`` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated errors.
    except (TypeError, ValueError, OverflowError):
        return None

# Obtain (or reuse) the SparkSession and grab its SparkContext for RDD work.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext

# --- Business ---------------------------------------------------------------
# Each text line is split on commas; the first field goes through IntegerSafe
# and the next four fields are kept as strings.
# NOTE(review): a plain split(',') does not understand quoted fields —
# presumably the "filtered" CSV contains none; confirm against the data.
business = (
    sc.textFile("../Data/SF_business/filtered_registered_business_sf.csv")
    .map(lambda line: line.split(','))
    .map(lambda cols: (IntegerSafe(cols[0]), cols[1], cols[2], cols[3], cols[4]))
)

# Column layout for the business rows; only "name" is declared non-nullable.
business_schema = (
    StructType()
    .add("zip", IntegerType(), nullable=True)
    .add("name", StringType(), nullable=False)
    .add("street", StringType(), nullable=True)
    .add("city", StringType(), nullable=True)
    .add("state", StringType(), nullable=True)
)

business_df = ss.createDataFrame(business, business_schema)

# --- Supervisor -------------------------------------------------------------
# Two comma-separated fields per line, both converted with IntegerSafe.
supervisor = (
    sc.textFile("../Data/SF_business/supervisor_sf.csv")
    .map(lambda line: line.split(","))
    .map(lambda cols: (IntegerSafe(cols[0]), IntegerSafe(cols[1])))
)

# NOTE(review): IntegerSafe may yield None, while both columns here are
# declared non-nullable — confirm the input has no unparseable values.
supervisor_schema = (
    StructType()
    .add("zip", IntegerType(), nullable=False)
    .add("id", IntegerType(), nullable=False)
)

supervisor_df = ss.createDataFrame(supervisor, supervisor_schema)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/17 05:16:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/17 05:16:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Save the Supervisor DataFrame as “Supervisor” and the Business DataFrame as “Business”.

In [3]:
# Write the business DataFrame to the given path and register it as the
# table 'Business'.
(
    business_df.write
    .option("path", "/Users/dwoodbridge/Class/2022_MSDS697/Data/Week2/Business")
    .saveAsTable('Business')
)

                                                                                

In [5]:
# Write the supervisor DataFrame to the given path and register it as the
# table 'Supervisor'.
(
    supervisor_df.write
    .option("path", "/Users/dwoodbridge/Class/2022_MSDS697/Data/Week2/Supervisor")
    .saveAsTable('Supervisor')
)

## Stop the Spark session

In [6]:
# Shut down the current SparkSession; a new one is created further down
# before the saved data is read back.
ss.stop()

## Re-read the saved tables from their Parquet files

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# The previous session was stopped, so obtain a session again before querying.
ss = SparkSession.builder.getOrCreate()

22/02/17 05:18:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [8]:
# Query the Business output directly by file path (Spark SQL's
# parquet.`<path>` form) rather than by table name, and print the first rows.
ss.sql("select * from parquet.`/Users/dwoodbridge/Class/2022_MSDS697/Data/Week2/Business`").show()

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94105| Barney & Barney Llc|1 Market St Steua...|San Francisco|   CA|
|94109|   Holbert Deneice M|  1426 California St|San Francisco|   CA|
| 6002|      Integralis Inc|310 West Newberry Rd|   Bloomfield|   CT|
|95603|       Mcadams Pat G|  10279 Mt Vernon Rd|       Auburn|   CA|
|95685|Young Gregory You...|14508 Shake Ridge Rd| Sutter+creek|   CA|
|95037|The Backflow Guy Inc|305 Vineyard Town...|  Morgan+hill|   CA|
|94103|    W J Britton & Co|     1345 Mission St|San Francisco|   CA|
|94112|    W J Britton & Co|     1000 Geneva Ave|San Francisco|   CA|
|94103|    W J Britton & Co|      1618 Howard St|San Francisco|   CA|
|94114|    W J Britton & Co|      2378 Market St|San Francisco|   CA|
|94110|    W J Britton & Co|3351 Cesar Chavez St|San Francisco|   CA|
|94112|    W J Britt

In [9]:
# Query the Supervisor output directly by file path (Spark SQL's
# parquet.`<path>` form) rather than by table name, and print the first rows.
ss.sql("select * from parquet.`/Users/dwoodbridge/Class/2022_MSDS697/Data/Week2/Supervisor`").show()

+-----+---+
|  zip| id|
+-----+---+
|94115|  5|
|94116|  7|
|94116|  4|
|94117|  1|
|94117|  7|
|94117|  8|
|94117|  5|
|94118|  2|
|94118|  1|
|94118|  5|
|94121|  2|
|94121|  1|
|94122|  1|
|94122|  7|
|94122|  5|
|94122|  4|
|94123|  2|
|94124|  9|
|94124| 10|
|94127|  7|
+-----+---+
only showing top 20 rows



In [10]:
# Shut down the SparkSession now that both outputs have been displayed.
ss.stop()