# Read and Store Data

## Using Spark on Yelp Data

In [16]:
import pyspark as ps    # for the pyspark suite
# Start (or reuse) a Spark session named 'yelp' on the local master with
# 4 worker threads, then keep a handle on its underlying SparkContext.
spark = ps.sql.SparkSession.builder.master('local[4]').appName('yelp').getOrCreate()
sc = spark.sparkContext

### Get Yelp business.json Schema

In [17]:
# Load the Yelp business records into a Spark DataFrame.
yelp_business_df = spark.read.json(
    'data/yelp_academic_dataset_business.json'
)

# Display the column tree of the loaded DataFrame.
yelp_business_df.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [22]:
# Count how many distinct values appear in the city column.
(
    yelp_business_df
    .select("city")
    .distinct()
    .count()
)

1251

In [None]:
# Scratch cell: keep only the rows of `dx` whose "keyword" column matches
# the regular expression in `expr`.
# NOTE(review): `dx` is not defined in any visible cell above and this cell has
# no execution count, so it would presumably fail with NameError on a fresh
# Restart & Run All -- confirm whether this cell belongs in the notebook.
expr = "Arizona.*hot"
dk = dx.filter(dx["keyword"].rlike(expr))

In [30]:
# Convert the Spark business DataFrame into a pandas DataFrame
# (toPandas() collects every row into the driver's memory).
yelp_business_pd_df = yelp_business_df.toPandas()

In [31]:
# List the distinct city values; the output shows untidy entries such as
# 'Tempe ' (trailing space), so city names need cleaning before exact matching.
yelp_business_pd_df['city'].unique()

array(['Cornelius', 'Scottsdale', 'Montreal', ..., 'ARSENAL', 'Chander',
       'Tempe '], dtype=object)

### Get Yelp review.json Schema

In [18]:
# Load the Yelp review records into a Spark DataFrame.
yelp_review_df = spark.read.json(
    'data/yelp_academic_dataset_review.json'
)

# Display the column tree of the loaded DataFrame.
yelp_review_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- cool: long (nullable = true)
 |-- date: string (nullable = true)
 |-- funny: long (nullable = true)
 |-- review_id: string (nullable = true)
 |-- stars: double (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- user_id: string (nullable = true)



## Join Yelp business and review data, filtered for SF

In [19]:
# Join every review to its business record on business_id, then keep only
# San Francisco businesses. The city is trimmed and lower-cased before the
# comparison because the city column contains untidy values (e.g. 'Tempe '
# with a trailing space). `city` exists only on the business side of the join,
# so the SQL expression is unambiguous.
# NOTE(review): the original cell ended in a bare `.filter` with no predicate;
# 'san francisco' is assumed from the section title and the variable name -- confirm.
yelp_joined_sf_df = (
    yelp_business_df
    .join(yelp_review_df,
          yelp_business_df.business_id == yelp_review_df.business_id)
    .filter("lower(trim(city)) = 'san francisco'")
)

# printSchema() returns None, so it is called as its own statement instead of
# being chained onto the assignment (which would bind None to the variable).
yelp_joined_sf_df.printSchema()

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

## Using Pandas on SF Restaurant Health Inspection Data

In [20]:
import pandas as pd

### Get SF Restaurant Scores .csv Schema

In [11]:
# Load the SF restaurant inspection scores CSV into a pandas DataFrame.
scores_df = pd.read_csv(
    'data/Restaurant_Scores_-_LIVES_Standard.csv',
)

In [15]:
# Print a summary of the frame: row count, per-column non-null counts and dtypes.
scores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53973 entries, 0 to 53972
Data columns (total 23 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   business_id                53973 non-null  int64  
 1   business_name              53973 non-null  object 
 2   business_address           53973 non-null  object 
 3   business_city              53973 non-null  object 
 4   business_state             53973 non-null  object 
 5   business_postal_code       52787 non-null  object 
 6   business_latitude          27475 non-null  float64
 7   business_longitude         27475 non-null  float64
 8   business_location          27475 non-null  object 
 9   business_phone_number      17672 non-null  float64
 10  inspection_id              53973 non-null  object 
 11  inspection_date            53973 non-null  object 
 12  inspection_score           39541 non-null  float64
 13  inspection_type            53973 non-null  obj