## Step 1.  Create an RDD from filtered_registered_business_sf.csv, with fields separated by commas

In [1]:
import csv

from pyspark import SparkContext

# Reuse the running SparkContext if there is one; otherwise start a new one.
sc = SparkContext.getOrCreate()


def parse_csv_line(line):
    """Split one line of the CSV file into a list of field strings.

    The standard-library CSV parser is used instead of ``line.split(',')`` so
    that a quoted field containing a comma (e.g. ``"Acme, Inc."``) stays one
    field rather than being broken into extra columns.
    """
    return next(csv.reader([line]))


# textFile yields one element per line; each line is parsed into its fields.
business = sc.textFile("../Data/SF_business/filtered_registered_business_sf.csv").map(parse_csv_line)
business.first()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/14 15:18:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

['94123', 'Tournahu George L', '3301 Broderick St', 'San Francisco', 'CA']

## Step 2.  (option 1) Convert RDD to DataFrame using .toDF()

In [2]:
from pyspark.sql import SparkSession
# Get the active SparkSession, or build one if none exists yet.
# NOTE: in PySpark, RDD.toDF() only becomes available once a SparkSession
# has been created, so this cell must run before the conversion cells below.
ss = SparkSession.builder.getOrCreate()

In [3]:
# Display the session object to confirm it was created.
ss

In [4]:
# Display the SparkContext that this session runs on.
ss.sparkContext

In [5]:
# No column names are supplied here, so the columns get the default names
# _1 ... _5 (visible in the show/printSchema output that follows).
business_df = business.toDF()

In [None]:
# This cell has no execution count, i.e. it was not run in the saved notebook.
# show() with no argument prints the first 20 rows by default.
business_df.show()

In [6]:
# Preview only the first five rows (long values are truncated in the display).
business_df.show(n=5)

+-----+--------------------+--------------------+-------------+---+
|   _1|                  _2|                  _3|           _4| _5|
+-----+--------------------+--------------------+-------------+---+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco| CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco| CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco| CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco| CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco| CA|
+-----+--------------------+--------------------+-------------+---+
only showing top 5 rows



In [7]:
# Every column is reported as string: each parsed field is a Python str.
business_df.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: string (nullable = true)
 |-- _4: string (nullable = true)
 |-- _5: string (nullable = true)



## Step 2.  (option 2)  Add column names using a list of strings when creating a DataFrame

In [8]:
# Column names, listed in the same order as the fields of each RDD row.
names = ['zip', 'name', 'street', 'city', 'state']

In [9]:
# Rebuild the DataFrame from the RDD, this time passing the column names
# as the schema. This replaces the earlier business_df with default names.
business_df = business.toDF(schema=names)

In [10]:
# Preview the first five rows; the header now shows the supplied column names.
business_df.show(n=5)

+-----+--------------------+--------------------+-------------+-----+
|  zip|                name|              street|         city|state|
+-----+--------------------+--------------------+-------------+-----+
|94123|   Tournahu George L|   3301 Broderick St|San Francisco|   CA|
|94124|Stephens Institut...|    2225 Jerrold Ave|San Francisco|   CA|
|94105|Stephens Institut...|180 New Montgomer...|San Francisco|   CA|
|94108|Stephens Institut...|       540 Powell St|San Francisco|   CA|
|94107|Stephens Institut...|     460 Townsend St|San Francisco|   CA|
+-----+--------------------+--------------------+-------------+-----+
only showing top 5 rows



In [11]:
# Column names changed, but every column is still typed as string.
business_df.printSchema()

root
 |-- zip: string (nullable = true)
 |-- name: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)



## Extra - create a DataFrame with name and zip code only.

In [12]:
# Keep only two fields of each row: index 1 (name) and index 0 (zip),
# emitted in that order so they line up with the column names below.
# Note that this overwrites the five-column business_df from Step 2.
business_df = (
    business
    .map(lambda fields: (fields[1], fields[0]))
    .toDF(['name', 'zip'])
)

In [13]:
# Preview the first five rows of the two-column (name, zip) DataFrame.
business_df.show(n=5)

+--------------------+-----+
|                name|  zip|
+--------------------+-----+
|   Tournahu George L|94123|
|Stephens Institut...|94124|
|Stephens Institut...|94105|
|Stephens Institut...|94108|
|Stephens Institut...|94107|
+--------------------+-----+
only showing top 5 rows



In [14]:
# Confirm the reduced schema: just name and zip, both strings.
business_df.printSchema()

root
 |-- name: string (nullable = true)
 |-- zip: string (nullable = true)



In [15]:
# Shut down the session. Per the PySpark docs this also stops the underlying
# SparkContext, so nothing Spark-related can run after this cell until a new
# context/session is created.
ss.stop()