In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

def IntegerSafe(value): # In case there are non-integer type to be converted.
    try:
        return int(value)
    except:
        return None

ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext


business = sc.textFile("../Data/SF_business/filtered_registered_business_sf.csv")\
             .map(lambda x : x.split(','))\
             .map(lambda x : (IntegerSafe(x[0]), x[1], x[2], x[3], x[4]))
              

schema = StructType([ StructField("zip", IntegerType(), True),
                      StructField("name", StringType(), False),
                      StructField("street", StringType(), True),
                      StructField("city", StringType(), True),
                      StructField("state", StringType(), True)
                    ])

business_df = ss.createDataFrame(business, schema)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/17 05:00:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Create a UDF called check_sf which checks whether a given value contains “San Francisco” or “SF”.


In [2]:
from pyspark.sql.functions import *
check_sf = udf(lambda x : ("San Francisco" in x) or ("SF" in x))

In [3]:
business_df.select('name', check_sf('city')).show()

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+--------------+
|                name|<lambda>(city)|
+--------------------+--------------+
|   Tournahu George L|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
|Stephens Institut...|          true|
+--------------------+--------------+
only showing top 20 rows



                                                                                

In [4]:
business_df.select('name', check_sf('city')).printSchema()

root
 |-- name: string (nullable = false)
 |-- <lambda>(city): string (nullable = true)



In [5]:
def sf(x):
    if (("San Francisco" in x) or ("SF" in x)):
        return True
    else :
        return False

In [6]:
business_df.select('name', sf('city')).show()

TypeError: Invalid argument, not a string or column: False of type <class 'bool'>. For column literals, use 'lit', 'array', 'struct' or 'create_map' function.

In [7]:
check_sf_using_sf = udf(sf, BooleanType())
business_df.select('name', check_sf_using_sf('city')).show()

+--------------------+--------+
|                name|sf(city)|
+--------------------+--------+
|   Tournahu George L|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
|Stephens Institut...|    true|
+--------------------+--------+
only showing top 20 rows



In [8]:
business_df.select('name', check_sf_using_sf('city')).printSchema()

root
 |-- name: string (nullable = false)
 |-- sf(city): boolean (nullable = true)



## Apply check_sf to business_df to check whether city is San Francisco.


In [9]:
business_df.select('name', 'city', check_sf('city').alias('San Francisco')).orderBy('zip').show()

[Stage 2:>                                                          (0 + 2) / 2]

+--------------------+-------------+-------------+
|                name|         city|San Francisco|
+--------------------+-------------+-------------+
|Vip Plumbing And ...|             |        false|
|C Fischer And Son...|San Francisco|         true|
|Intelex Technolog...|     "Toronto|        false|
|Miniclip America Inc|      "London|        false|
|East & West Alum ...|      Burnaby|        false|
|        Ortiz Jose E|San Francisco|         true|
|Law Office Of Sco...|             |        false|
|          Htut Chris|             |        false|
|         Lexa Mary C|      Oakland|        false|
|       Act Fuels Inc|   "Amsterdam|        false|
|     Vieira Reynaldo|San Francisco|         true|
|      Pointclickcare| "Mississauga|        false|
|Margaret Apartmen...|San Francisco|         true|
|          Malik Alia|San Francisco|         true|
|       Torres Alvaro|    Daly+city|        false|
|         Odotech Inc|    "Montreal|        false|
|   Magdaluyo Melecio|        9

                                                                                

In [10]:
ss.stop()