# Michelin Restaurant Dataset: Preparation

The data is the [Michelin Guide Restaurants 2021 dataset](https://www.kaggle.com/datasets/ngshiheng/michelin-guide-restaurants-2021), published on Kaggle.

In [0]:
import pandas as pd
import os
from pyspark import SparkFiles
from pyspark.sql.functions import *

### Catalog, Schema Creation

In [0]:
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            clear message instead of a later `TypeError` when the value is
            concatenated into a SQL statement or path.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} must be set before running this notebook"
        )
    return value


# Notebook configuration, read from the process environment.
catalog_ = _require_env('CATALOG_NAME')   # catalog that holds the schema
schema_ = _require_env('SCHEMA_NAME')     # schema that holds the volume and tables
data_path_ = _require_env('DATA_PATH')    # source passed to urlretrieve for the CSV download

In [0]:
# Create the catalog first, then the schema inside it.
# Both statements are no-ops when the object already exists.
create_catalog_stmt = "CREATE CATALOG IF NOT EXISTS " + catalog_
spark.sql(create_catalog_stmt)

qualified_schema = catalog_ + "." + schema_
spark.sql("CREATE SCHEMA IF NOT EXISTS " + qualified_schema)

In [0]:
# Make the configured catalog and schema the session defaults, so the
# following cells can refer to tables and views by unqualified name.
use_catalog_stmt = "USE CATALOG " + catalog_
spark.sql(use_catalog_stmt)

use_schema_stmt = "USE SCHEMA " + schema_
spark.sql(use_schema_stmt)

### Data Ingestion from Volume [Bronze]

In [0]:
# Download the source CSV into a volume so that Spark can read it.
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so it is imported explicitly.
import urllib.request

# Three-part name of the landing volume: <catalog>.<schema>.init
volume_ = catalog_+'.'+schema_+'.init'
spark.sql("CREATE VOLUME IF NOT EXISTS "+volume_)

# Path of the CSV inside the volume, as written by urlretrieve.
volume_file_ = "/Volumes/"+catalog_+"/"+schema_+"/init/michelin_data.csv"
urllib.request.urlretrieve(data_path_, volume_file_)

# The same file addressed with the dbfs: scheme; this is the path handed to the Spark reader.
filepath_ = "dbfs:"+volume_file_


In [0]:
# Read the downloaded CSV as-is: first row is the header, comma-delimited,
# UTF-8 encoded. No schema is supplied to the reader.
bronze_ = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("delimiter", ",")
    .option("encoding", "utf-8")
    .load(filepath_)
)

# createOrReplaceTempView returns None, so it is called as a separate statement;
# chaining it into the assignment would leave `bronze_` bound to None instead
# of the DataFrame.
bronze_.createOrReplaceTempView("bronze_v")

# Preview the first rows through the temporary view.
display(spark.sql("SELECT * FROM bronze_v LIMIT 10;"))


In [0]:
%sql
-- Persist the temporary view bronze_v as the table bronze_data:
-- drop any previous copy, then re-create it from the view.
DROP TABLE IF EXISTS bronze_data;

CREATE TABLE bronze_data AS
SELECT *
FROM bronze_v;

### Data Exploration

In [0]:
# Load the persisted bronze table and pass it to the Databricks data-summary utility.
bronze_df = spark.sql("SELECT * FROM bronze_data")
bronze_summary = dbutils.data.summarize(bronze_df)
display(bronze_summary)

In [0]:
%sql
-- Price: row count per distinct Price value, alongside the length of the
-- Price string (used as a numeric price score).
SELECT
  Price,
  len(Price)::int AS price_score,
  count(*)
FROM bronze_data
GROUP BY 1, 2
ORDER BY 1, 2;

In [0]:
%sql
-- Inspect the rows that have no Price value.
SELECT *
FROM bronze_data
WHERE Price IS NULL;

In [0]:
%sql
-- Award: row count per Award value, with a numeric star score taken from the
-- first character of the Award string.
-- try_cast returns NULL (mapped to 0 by coalesce) when that character is not a
-- digit; a plain cast raises an error for such values when ANSI mode is enabled.
-- substring positions are 1-based, so position 1 is the first character.
SELECT
  Award,
  coalesce(try_cast(substring(Award, 1, 1) AS int), 0) AS Stars_score,
  count(*)
FROM bronze_data
GROUP BY 1, 2
ORDER BY 1, 2;


In [0]:
%sql
-- Quality ratio: award score divided by price score, one row per distinct
-- (award score, price score) combination. Rows without a Price are excluded.
-- Awards whose first character is not a digit get the sentinel -1 in this
-- exploration query. try_cast returns NULL for them instead of raising an
-- error under ANSI mode, and coalesce supplies the sentinel.
SELECT
  coalesce(try_cast(substring(Award, 1, 1) AS float), cast(-1 AS float)) AS Award_score,
  len(Price)::float AS Price_score,
  coalesce(try_cast(substring(Award, 1, 1) AS float), cast(-1 AS float)) / len(Price)::float AS Quality
FROM bronze_data
WHERE Price IS NOT NULL
GROUP BY 1, 2
ORDER BY 1, 2;

In [0]:
%sql
-- Cuisine: number of rows per cuisine value, ordered by cuisine.
SELECT
  cuisine,
  count(*)
FROM bronze_data
GROUP BY 1
ORDER BY 1;

### Data Cleaning [Silver]

In [0]:
%sql
-- Build the cleaned table silver_data from bronze_data.
--   Res_ID        : synthetic key 'res-id-<100000 + n>', where n is the row number
--                   when ordering by longitude, latitude, location and name.
--   Price_score   : length of the Price string.
--   Stars_score   : first character of Award as an integer, 0 when it is not a digit.
--   Quality_ratio : Stars_score (as float) divided by Price_score.
-- try_cast is used for the Award prefix so that a non-numeric first character
-- becomes NULL (then 0) instead of raising an error when ANSI mode is enabled.
-- substring positions are 1-based, so position 1 is the first character.
DROP TABLE IF EXISTS silver_data;
CREATE TABLE silver_data AS
  SELECT
    concat(
      'res-id-',
      cast(100000 + row_number() OVER (ORDER BY Longitude::float, Latitude::float, `Location`, `Name`) AS string)
    ) AS Res_ID,
    `Name`, Address, `Location`, Longitude::float, Latitude::float, PhoneNumber, `Url` AS MichelineUrl, WebsiteUrl,
    Cuisine, Price, len(Price)::int AS Price_score,
    Award, coalesce(try_cast(substring(Award, 1, 1) AS int), 0) AS Stars_score,
    coalesce(try_cast(substring(Award, 1, 1) AS float), cast(0 AS float)) / len(Price)::float AS Quality_ratio,
    GreenStar::int,
    FacilitiesAndServices, Description
  FROM bronze_data
  WHERE
    Price IS NOT NULL -- Removing one Japanese restaurant
  ;
