# AirBnb NY Listing Price Prediction: Data Preparation | Step 3

In [0]:
import pandas as pd
import os
from pyspark.sql.functions import *

### Catalog, Schema

In [0]:
## Catalog and schema names, read from cluster environment variables.
## Each value is None when the corresponding variable is not set.
catalog_, schema_ = (os.environ.get(env_key) for env_key in ('CATALOG_NAME', 'SCHEMA_NAME'))

## Alternative source: parameters passed in by the job as widgets
#catalog_ = dbutils.widgets.get("CATALOG_NAME")
#schema_ = dbutils.widgets.get("SCHEMA_NAME")

In [0]:
# Point the Spark session at the configured catalog and schema so that the
# unqualified table names used in the rest of the notebook resolve there.
# Fail fast with an actionable message: a missing parameter would otherwise
# surface as an opaque "can only concatenate str" TypeError on the lines below.
for param_name, param_value in (("CATALOG_NAME", catalog_), ("SCHEMA_NAME", schema_)):
  if not param_value:
    raise ValueError(f"{param_name} is not set; provide it as a cluster environment variable or a job widget")

spark.sql("USE CATALOG "+catalog_)
spark.sql("USE SCHEMA "+schema_)

### [Gold data] Finalize data preparation for ML

Steps required:

* Remove uninformative columns (same value on all rows or different values on every row, columns with >50% Null values)
* Remove NAs
* Encode required categorical columns


#### 1-Remove Uninformative Columns & NAs

In [0]:
# Read the whole silver table as a Spark DataFrame; this is the frame the
# following cells progressively clean and encode.
silver_query = "SELECT * from airbnb_ny_silver_data"
gold1_sdf = spark.sql(silver_query)
# Pandas copy of the same rows, collected to the driver for ad-hoc inspection.
gold1 = gold1_sdf.toPandas()
#display(gold1.info())

In [0]:
#gold1.describe(percentiles=[.25, .5, .75])

In [0]:
# Columns removed before modelling, with the reason for each group
to_drop = [
  # Too many unique values
  'name', 'host_id', 'house_rules', 'license', 'host_name',
  # Too many null values
  'last_review', 'reviews_per_month',
  # Too few unique values
  'country', 'country_code',
  # Unlikely this info is available when we predict the price
  'service_fee',
]
# DataFrame.drop takes several column names in one call
gold1_sdf = gold1_sdf.drop(*to_drop)

# Pandas snapshot after the column removal, for ad-hoc inspection
gold_step1 = gold1_sdf.toPandas()
#display(gold_step1.info())

In [0]:
# Drop NAs
# Remove every row that has a null in any of the remaining columns.
# One dropna() over all columns keeps exactly the rows that survive dropping
# nulls column by column, without triggering a Spark count() action (a full
# scan) before and after each column.
gold1_sdf = gold1_sdf.dropna(how='any')
# To log how many rows were removed, compare gold1_sdf.count() taken before
# and after the line above.

#### 2-Encode Categorical Columns

Since we will be using tree-based algorithms, we will produce both an **Ordinal Encoding** and a **One-Hot Encoding** of the categorical columns.

In [0]:
# Keep a handle on the cleaned, not-yet-encoded data: gold1_sdf is reassigned
# with "*_encoded" columns further down, while this backup is what the
# one-hot encoding query reads later.
gold1_sdf_bkp = gold1_sdf.select("*")
#display(gold1_sdf_bkp.head(1))

In [0]:
# Pandas copy of the cleaned data, used only to enumerate category values
gold_step2 = gold1_sdf.toPandas()
col_to_exclude = ['id']
col_ordinal_encoding = {}

## Build, for every categorical (object dtype) column except the excluded
## ones, a mapping {category value -> 1-based rank in sorted order}.
for col_ in gold_step2.columns:
  if gold_step2[col_].dtype == 'object' and col_ not in col_to_exclude:
    ## Sorted distinct values of the column; nulls are skipped
    values_ = sorted(gold_step2[col_].dropna().unique())
    #print(":: %s has %2d unique values" % (col_, len(values_)))
    ## Optional exploration: price statistics per category
    #display(gold_step2.groupby(col_).agg(records=('price','count'), avg_price=('price', 'mean'), std_price=('price', 'std')))
    ## Encode unique values
    col_ordinal_encoding[col_] = {
      categ_value: str(rank_)  # Make str to avoid mixed type replacement errors
      for rank_, categ_value in enumerate(values_, start=1)
    }
    #print("\n")

##### 2.1 Ordinal Encoding

In [0]:
## Add a float "<column>_encoded" companion for every column that has an ordinal mapping
for source_col, value_to_code in col_ordinal_encoding.items():
  encoded_col = f"{source_col}_encoded"
  # Start from a copy of the raw category values
  with_copy = gold1_sdf.withColumn(encoded_col, col(source_col))
  # Swap each category for its (string) ordinal code, only in the new column
  with_codes = with_copy.replace(to_replace=value_to_code, subset=[encoded_col])
  # The codes are strings at this point; store them as numbers
  gold1_sdf = with_codes.withColumn(encoded_col, col(encoded_col).cast('float'))

In [0]:
## Persist encoded column info
# Expose the encoded DataFrame to the SQL cells under the temp view "gold_v".
# NOTE: the same view name is re-registered later in this notebook with the
# un-encoded backup, so the SQL cell that reads the "*_encoded" columns has to
# run before that re-registration.
gold1_sdf.createOrReplaceTempView("gold_v")

In [0]:
%sql
-- Ordinal-encoded gold table, rebuilt from the gold_v temp view.
-- Every column except id is stored as FLOAT; price_log is log(price).
DROP TABLE IF EXISTS airbnb_ny_gold_data;

CREATE TABLE airbnb_ny_gold_data AS
SELECT
  id,
  CAST(host_identity_verified_encoded AS FLOAT) AS host_identity_verified_encoded,
  CAST(neighbourhood_group_encoded AS FLOAT) AS neighbourhood_group_encoded,
  CAST(neighbourhood_encoded AS FLOAT) AS neighbourhood_encoded,
  CAST(lat AS FLOAT) AS lat,
  CAST(long AS FLOAT) AS long,
  CAST(instant_bookable_encoded AS FLOAT) AS instant_bookable_encoded,
  CAST(cancellation_policy_encoded AS FLOAT) AS cancellation_policy_encoded,
  CAST(room_type_encoded AS FLOAT) AS room_type_encoded,
  CAST(construction_year AS FLOAT) AS construction_year,
  CAST(minimum_nights AS FLOAT) AS minimum_nights,
  CAST(number_of_reviews AS FLOAT) AS number_of_reviews,
  CAST(review_rate_number AS FLOAT) AS review_rate_number,
  CAST(calculated_host_listings_count AS FLOAT) AS calculated_host_listings_count,
  CAST(availability_365 AS FLOAT) AS availability_365,
  CAST(price AS FLOAT) AS price,
  CAST(LOG(price) AS FLOAT) AS price_log
FROM gold_v;

In [0]:
%sql
-- SELECT * from airbnb_ny_gold_data limit 10;

##### 2.2 One-Hot Encoding

In [0]:
# Re-point the "gold_v" temp view at the backup taken before ordinal encoding,
# so the one-hot query below reads the original categorical values.
gold1_sdf_bkp.createOrReplaceTempView("gold_v")

In [0]:
%sql
-- One-hot encoded gold table, rebuilt from the gold_v temp view.
-- Each indicator is 1.0 when the category matches and 0.0 otherwise; one
-- category per multi-valued column has no indicator of its own.
DROP TABLE IF EXISTS airbnb_ny_gold_data_ohe;

CREATE TABLE airbnb_ny_gold_data_ohe AS
SELECT
  id,
  CAST(CASE WHEN host_identity_verified = 'verified' THEN 1 ELSE 0 END AS FLOAT) AS is_host_identity_verified,
  /* Neighborhood groups = Bronx, Brooklyn, Manhattan, Queens, Staten Island */
  CAST(CASE WHEN neighbourhood_group = 'Bronx' THEN 1 ELSE 0 END AS FLOAT) AS is_neighbourhood_group_Bronx,
  CAST(CASE WHEN neighbourhood_group = 'Brooklyn' THEN 1 ELSE 0 END AS FLOAT) AS is_neighbourhood_group_Brooklyn,
  CAST(CASE WHEN neighbourhood_group = 'Manhattan' THEN 1 ELSE 0 END AS FLOAT) AS is_neighbourhood_group_Manhattan,
  CAST(CASE WHEN neighbourhood_group = 'Queens' THEN 1 ELSE 0 END AS FLOAT) AS is_neighbourhood_group_Queens,
  /* dropping neighbourhood column */
  CAST(lat AS FLOAT) AS lat,
  CAST(long AS FLOAT) AS long,
  /* NOTE(review): string comparison is case-sensitive; confirm the stored value is exactly 'TRUE' */
  CAST(CASE WHEN instant_bookable = 'TRUE' THEN 1 ELSE 0 END AS FLOAT) AS is_instant_bookable,
  /* cancellation_policy = flexible, moderate, strict */
  CAST(CASE WHEN cancellation_policy = 'flexible' THEN 1 ELSE 0 END AS FLOAT) AS is_cancellation_policy_flexible,
  CAST(CASE WHEN cancellation_policy = 'strict' THEN 1 ELSE 0 END AS FLOAT) AS is_cancellation_policy_strict,
  /* room_type = Entire home/apt, Hotel room, Private room, Shared room */
  CAST(CASE WHEN room_type = 'Entire home/apt' THEN 1 ELSE 0 END AS FLOAT) AS is_room_type_Entire,
  CAST(CASE WHEN room_type = 'Private room' THEN 1 ELSE 0 END AS FLOAT) AS is_room_type_Privateroom,
  CAST(CASE WHEN room_type = 'Shared room' THEN 1 ELSE 0 END AS FLOAT) AS is_room_type_Sharedroom,
  CAST(construction_year AS FLOAT) AS construction_year,
  CAST(minimum_nights AS FLOAT) AS minimum_nights,
  CAST(number_of_reviews AS FLOAT) AS number_of_reviews,
  CAST(review_rate_number AS FLOAT) AS review_rate_number,
  CAST(calculated_host_listings_count AS FLOAT) AS calculated_host_listings_count,
  CAST(availability_365 AS FLOAT) AS availability_365,
  CAST(price AS FLOAT) AS price,
  CAST(LOG(price) AS FLOAT) AS price_log
FROM gold_v;

### Final counts

In [0]:
# Show the row count of each pipeline table, in order:
# bronze, silver, gold (ordinal encoded), gold (one-hot encoded)
for table_name in ("airbnb_ny_bronze_data", "airbnb_ny_silver_data",
                   "airbnb_ny_gold_data", "airbnb_ny_gold_data_ohe"):
  display(spark.sql("select count(*) from " + table_name + ";"))