# AirBnb NY Listing Price Prediction: Data Preparation | Step 2

In [0]:
import pandas as pd
import os
from pyspark.sql.functions import *

### Catalog, Schema

In [0]:
## Catalog and schema names arrive as job parameters and are read from the
## CATALOG_NAME / SCHEMA_NAME notebook widgets.
## Disabled alternative: take them from cluster environment variables via
## os.getenv('CATALOG_NAME') and os.getenv('SCHEMA_NAME').
catalog_, schema_ = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("CATALOG_NAME", "SCHEMA_NAME")
)

In [0]:
# Make the configured catalog and schema the session defaults so that the
# unqualified table names used in later cells resolve against them.
spark.sql(f"USE CATALOG {catalog_}")
spark.sql(f"USE SCHEMA {schema_}")

### Data Cleaning [Silver]

Cleaning steps required:
* Remove duplicates
* Remove invalid characters from the price and service_fee columns (e.g., `$` and `,`)
* Normalize invalid availabilities (rows with a missing value are removed; negative values are made positive; values above 365 are capped at 365)

#### 1,2-Remove Duplicates & Convert Column Formats

-- Duplicates
`with d0 (
select distinct * from bronze_data)
select "Tot Records" as count, count(*) value_ from bronze_data
  union select "Unique Records" as count, count(*) value_ from d0;`

In [0]:
%sql
-- Builds airbnb_ny_silver_data from airbnb_ny_bronze_data in three stages:
--   d0           : drops exact-duplicate rows and rows with missing or malformed values
--   d1           : corrects two misspelled neighbourhood_group values and caps day counts at 365
--   final select : converts columns to their target types
-- CREATE OR REPLACE is used instead of DROP + CREATE so the table is swapped by a
-- single statement; on Delta tables a failed run then leaves the previous version
-- in place rather than leaving no table at all.
CREATE OR REPLACE TABLE airbnb_ny_silver_data AS
with d0 as (
  -- Remove duplicates and initial cleaning
  select distinct *
  from airbnb_ny_bronze_data
  where (host_identity_verified in ('verified', 'unconfirmed') or host_identity_verified is null)
  and availability_365 is not null
  -- One id value holds free text instead of an identifier (presumably a mis-parsed row)
  and id not in (' Laundry in building  8 mints from A C trains')
  and construction_year is not null
  -- The LIKE filters below drop values that start with '$' or contain '/' or '.';
  -- presumably these come from rows whose fields are shifted -- TODO confirm against bronze data
  and minimum_nights is not null and minimum_nights not like '$%'
  and number_of_reviews is not null and number_of_reviews not like '$%'
  and review_rate_number is not null and review_rate_number not like '$%' and review_rate_number not like '%/%'
  and calculated_host_listings_count is not null and calculated_host_listings_count not like '%.%'
), d1 as (
  -- Clean up some fields
  select id, `name`, host_id, host_identity_verified, host_name
  -- Correct the two misspelled values; every other value passes through unchanged
  ,case
     when neighbourhood_group = 'brookln' then 'Brooklyn'
     when neighbourhood_group = 'manhatan' then 'Manhattan'
     else neighbourhood_group
   end as neighbourhood_group
  , neighbourhood, lat, long, country, country_code, instant_bookable, cancellation_policy, room_type, construction_year, price, service_fee
  -- Negative values are made positive and anything above 365 is capped at 365
  ,cast(case when abs(minimum_nights)>365 then 365 else abs(minimum_nights) end as int) as minimum_nights
  , number_of_reviews, last_review, reviews_per_month, review_rate_number, calculated_host_listings_count
  -- Same normalization as minimum_nights: absolute value, capped at 365
  ,cast(case when abs(availability_365)>365 then 365 else abs(availability_365) end as int) as availability_365
  , house_rules, license
  from d0
)
-- Clean and convert numerical fields
  select
    cast(id as string) id
    ,`name`
    ,cast(trim(host_id) as string) host_id
    ,host_identity_verified, host_name, neighbourhood_group, neighbourhood
    ,cast(lat as float) lat
    ,cast(long as float) long
    ,country, country_code, instant_bookable, cancellation_policy, room_type
    ,cast(construction_year as int) construction_year
    -- minimum_nights is already int from d1; the cast here is a no-op
    ,cast(minimum_nights as int) minimum_nights
    ,cast(number_of_reviews as int) number_of_reviews
    -- last_review is parsed with the month/day/year pattern after trimming whitespace
    ,to_timestamp(trim(last_review), 'M/d/yyyy') last_review
    ,cast(reviews_per_month as float) reviews_per_month
    ,cast(review_rate_number as int) review_rate_number
    ,cast(calculated_host_listings_count as int) calculated_host_listings_count
    -- availability_365 is already int from d1; the cast here is a no-op
    ,cast(availability_365 as int) availability_365
    ,house_rules, license
    -- Strip the '$' sign and ',' separators before the numeric cast
    ,cast(
      replace(
        replace(price, '$', ''),
        ',', '')
      as float) price
    -- Same stripping as price
    ,cast(
      replace(
        replace(service_fee, '$', ''),
        ',', '')
        as float) service_fee
from d1;