# Capstone: Airbnb Price Listing Prediction
## Part 5 Production Model

_Authors: Evonne Tham_

___Example___

_In this notebook, I will calculate the R² score and RMSE of the production model on the recombined train and test sets. I will then fit the same model to obtain the predicted listing prices. I will also state the insights gleaned from the model and the subsequent business recommendations._

## Contents of this notebook
- [1. Import Necessary Libraries and Load Data](#1.-Import-Necessary-Libraries-and-Load-Data)
- [2. Model Prep](#2.-Model-Prep)
- [3. Fitting and Predicting XGB Production Model](#3.-Fitting-and-Predicting-XGB-Production-Model)
- [4. Model Evaluation](#4.-Model-Evaluation) 


## 1. Import Necessary Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
from geopy.distance import great_circle
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# modelling
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression, ElasticNetCV
from sklearn.svm import SVR
from xgboost import XGBRegressor

#Hide warnings
import warnings
warnings.filterwarnings('ignore')

---
##  2. Function for Data Cleaning and Feature Engineering

In [None]:
def distance_to(from_lat, from_lon, to_lat, to_lon):
    """Return the great-circle distance, in kilometres, between two points.

    Parameters
    ----------
    from_lat, from_lon : latitude / longitude of the starting point.
    to_lat, to_lon : latitude / longitude of the destination point.
    """
    # The destination is passed first, exactly as the geopy call was ordered
    # before; each point is a (latitude, longitude) tuple.
    return great_circle((to_lat, to_lon), (from_lat, from_lon)).km

In [4]:
# Bin edges / labels shared by the percentage-style columns
# (host_response_rate, host_acceptance_rate, review_scores_rating).
_RATE_BINS = [0, 50, 90, 99, 100]
_RATE_LABELS = ['0-49%', '50-89%', '90-99%', '100%']

# Bin edges / labels for the 0-10 review sub-scores (every review_scores_*
# column except review_scores_rating).
_SUBSCORE_BINS = [0, 3, 7, 8, 9, 10]
_SUBSCORE_LABELS = ['0-3', '4-7', '8', '9', '10']

# Raw property_type value -> coarse group.
_PROPERTY_TYPE_GROUPS = {
    'Apartment': 'apartment',
    'Hotel': 'hotel',
    'House': 'house',
    'Hostel': 'hostel',
    'Condominium': 'apartment',
    'Aparthotel': 'apartment',
    'Boutique hotel': 'hotel',
    'Villa': 'house',
    'Serviced apartment': 'apartment',
    'Ryokan (Japan)': 'unique_space',
    'Hut': 'house',
    'Loft': 'apartment',
    'Guesthouse': 'house',
    'Bed and breakfast': 'house',
    'Townhouse': 'house',
    'Guest suite': 'apartment',
    'Tiny house': 'house',
    'Other': 'unique_space',
    'Cabin': 'house',
    'Bungalow': 'house',
    'Tent': 'unique_space',
    'Camper/RV': 'unique_space',
    'Dome house': 'unique_space',
    'Nature lodge': 'house',
    'Resort': 'house',
    'Dorm': 'hostel',
    'Cottage': 'house',
    'Earth house': 'house',
    'Castle': 'unique_space',
}

# Raw room_type value -> identifier-friendly name.
_ROOM_TYPE_NAMES = {
    'Entire home/apt': 'entire_home_apt',
    'Private room': 'private_room',
    'Hotel room': 'hotel_room',
    'Shared room': 'shared_room',
}

# Raw cancellation_policy value -> coarse group.
_CANCELLATION_GROUPS = {
    'strict_14_with_grace_period': 'strict',
    'super_strict_30': 'super_strict',
    'super_strict_60': 'super_strict',
    'luxury_moderate': 'moderate',
}

# Engineered distance column -> (latitude, longitude) of the reference point.
_LANDMARKS = {
    'tokyo_dist': (35.652832, 139.839478),
    'shinjuku_dist': (35.6938, 139.7034),
    'toshima_dist': (35.7263, 139.7168),
    'taito_dist': (35.7126, 139.7802),
    'narita_dist': (35.7720, 140.3929),
    'hanida_dist': (35.5494, 139.7798),
}

# Engineered amenity column -> raw (lower-cased) amenity labels summed into it.
# Column names (including the spellings "toiletties" / "parking_spaaces") and
# the raw labels (including the leading space in " toilet") are kept exactly as
# they were, because downstream consumers may look the columns up by name.
# NOTE(review): "carbon monoxide alarm" is grouped under "entertainment";
# kept as-is — confirm whether that grouping is intentional.
_AMENITY_GROUPS = {
    "24_hour_checkin": ["24-hour check-in"],
    "air_conditioning": ["air conditioning"],
    "entertainment": ["cable tv", "carbon monoxide alarm", "dvd player",
                      "game console", "netflix", "projector and screen",
                      "smart tv", "tv"],
    "bbq": ["barbecue utensils", "bbq grill"],
    "balcony": ["balcony", "patio or balcony"],
    "nature_and_views": ["beachfront", "garden or backyard", "lake access",
                         "waterfront"],
    "bedroom_essentials": ["bed linens", "bedroom comforts",
                           "extra pillows and blankets", "firm mattress",
                           "memory foam mattress", "room-darkening shades"],
    "bathroom_amenities": [" toilet", "handheld shower head",
                           "heated towel rack", "hot water"],
    "toiletties": ["bath towel", "bathroom essentials", "body soap",
                   "essentials", "shampoo", "shower gel", "toilet paper"],
    "essentials_amenities": ["bottled water", "first aid kit", "slippers",
                             "trash can"],
    "breakfast": ["bread maker", "breakfast"],
    "coffee": ["coffee maker"],
    "kitchen_amenities": ["baking sheet", "convection oven", "cooking basics",
                          "dishes and silverware", "full kitchen", "gas oven",
                          "hot water kettle", "kitchen", "kitchenette",
                          "microwave", "oven", "refrigerator", "stove"],
    "white_goods": ["dishwasher", "dryer"],
    "elevator": ["elevator"],
    "gym": ["gym"],
    "child_friendly": ["baby bath", "baby monitor",
                       "babysitter recommendations", "changing table",
                       "children’s books and toys", "children’s dinnerware",
                       "crib", "family/kid friendly", "high chair",
                       "pack ’n play/travel crib", "table corner guards"],
    "tubs_and_pools": ["bathtub", "bathtub with bath chair", "hot tub", "pool",
                       "pool with pool hoist", "soaking tub"],
    "internet": ["ethernet connection", "internet", "pocket wifi", "wifi"],
    "long_term_stays": ["long term stays allowed"],
    "pets_friendly": ["cat(s)", "dog(s)", "other pet(s)", "pets allowed",
                      "pets live on this property"],
    "safe_and_security": ["fire extinguisher", "lock on bedroom door",
                          "lockbox", "safety card", "smoke alarm",
                          "smart lock"],
    "self_check_in": ["self check-in"],
    "smoking_allowed": ["smoking allowed"],
    "accessibility": ["disabled parking spot", "fixed grab bars for shower",
                      "fixed grab bars for toilet",
                      "no stairs or steps to enter", "roll-in shower",
                      "roll-in shower with chair", "wheelchair accessible"],
    "parking_spaaces": ["free parking on premises", "free street parking",
                        "paid parking off premises",
                        "paid parking on premises"],
    "hospitality_services": ["building staff", "doorman",
                             "luggage dropoff allowed", "host greets you"],
    "laptop-friendly workspace": ["laptop-friendly workspace"],
    "heating_amenities": ["heated floors", "heating"],
    "other_electronics": ["iron", "hair dryer", "air purifier"],
    "suitable for events": ["suitable for events"],
}


def _parse_currency(value):
    """Turn a '$1,234.00'-style string into a number; pass anything else through."""
    if isinstance(value, str) and value.startswith('$'):
        amount = float(value.replace('$', '').replace(',', ''))
        # Whole-dollar amounts become ints; amounts with cents stay floats
        # rather than raising.
        return int(amount) if amount.is_integer() else amount
    return value


def _percent_to_float(values):
    """Convert a Series of '95%'-style strings to floats (unparseable -> NaN)."""
    # astype(str) keeps this working when the column is entirely NaN and so
    # has no .str accessor.
    stripped = values.astype(str).str.replace('%', '', regex=False)
    return pd.to_numeric(stripped, errors='coerce')


def _bin_with_fallback(values, bins, labels, fallback):
    """Bin a numeric Series with pd.cut and label missing values `fallback`."""
    binned = pd.cut(values, bins=bins, labels=labels, include_lowest=True)
    return binned.cat.add_categories(fallback).fillna(fallback)


def predict_tokyo_price_prod(df_in):
    """Clean a raw listings frame and engineer the model's input features.

    Despite its name, this function does not call a model: it selects the
    required columns from ``df_in``, cleans them, adds engineered features and
    returns the resulting frame. ``df_in`` itself is not modified.

    Steps, in order:
      1. select the raw feature columns (plus latitude / longitude);
      2. map 't' / 'f' to 1 / 0 and parse '$...' currency strings;
      3. bin review scores and host response / acceptance rates into
         categories, with an 'unrated' / 'unknown' category for missing values;
      4. impute fees with 0 and bedrooms with the median, drop rows missing
         beds / bathrooms, then drop every row that still has a null;
      5. regroup property type, room type and cancellation policy;
      6. add great-circle distances (via ``distance_to``) to fixed reference
         points;
      7. one-hot encode the categorical columns;
      8. add one count column per amenity group.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Raw listings; must contain every column named in ``include_features``.

    Returns
    -------
    pandas.DataFrame
        A new frame of cleaned and engineered features.

    Raises
    ------
    KeyError
        If ``df_in`` lacks any of the required columns.
    """
    # Features Selection
    # latitude / longitude are required by the distance features below.
    # NOTE(review): they therefore also remain in the returned frame —
    # confirm this matches the feature set the model was trained on.
    include_features = ['host_since', 'host_response_time', 'host_response_rate', 'host_acceptance_rate',
                        'host_is_superhost', 'host_listings_count', 'host_identity_verified', 'neighbourhood_cleansed',
                        'is_location_exact', 'room_type', 'property_type', 'accommodates',
                        'bathrooms', 'bedrooms', 'beds',
                        'amenities', 'security_deposit', 'cleaning_fee', 'guests_included',
                        'extra_people', 'minimum_nights', 'maximum_nights', 'availability_90',
                        'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
                        'review_scores_checkin', 'review_scores_communication', 'review_scores_location',
                        'review_scores_value', 'instant_bookable', 'cancellation_policy',
                        'latitude', 'longitude']
    # Work on an explicit copy so the caller's frame is never touched and no
    # assignment below lands on a view of it.
    df = df_in[include_features].copy()

    ## GENERAL CLEANING PROCESS
    # NOTE(review): host_since is left as the raw value (it is not parsed to a
    # datetime) — confirm that is what downstream code expects.

    # Replace columns with Ts and Fs
    df = df.replace({'f': 0, 't': 1})

    # Clean Currency (applied to every cell; non-'$' values pass through)
    df = df.apply(lambda column: column.map(_parse_currency))

    ## SPECIFIC CLEANING PROCESS
    # Bin every review sub-score (all review_scores_* except the overall rating)
    subscore_cols = [col for col in df.columns
                     if col.startswith('review_scores_') and not col.endswith('_rating')]
    for col in subscore_cols:
        df[col] = _bin_with_fallback(df[col], _SUBSCORE_BINS, _SUBSCORE_LABELS, 'unrated')

    # Create Categories for review_scores_rating
    df['review_scores_rating'] = _bin_with_fallback(
        df['review_scores_rating'], _RATE_BINS, _RATE_LABELS, 'unrated')

    # Impute, clean and categorize host_response_time and host_response_rate
    df['host_response_time'] = df['host_response_time'].fillna("unknown")
    df['host_response_rate'] = _bin_with_fallback(
        _percent_to_float(df['host_response_rate']), _RATE_BINS, _RATE_LABELS, 'unknown')

    # Clean and categorize host_acceptance_rate
    df['host_acceptance_rate'] = _bin_with_fallback(
        _percent_to_float(df['host_acceptance_rate']), _RATE_BINS, _RATE_LABELS, 'unknown')

    # Rename neighbourhood_cleansed
    df = df.rename(columns={'neighbourhood_cleansed': "neighbourhood"})

    # Impute missing fees with 0 and store them as integers
    df['security_deposit'] = df['security_deposit'].fillna(0).astype('int64')
    df['cleaning_fee'] = df['cleaning_fee'].fillna(0).astype('int64')

    # Impute beds, bedrooms and bathrooms: rows without beds / bathrooms are
    # dropped, missing bedrooms take the median of the remaining rows.
    df = df.dropna(subset=['beds', 'bathrooms'])
    df['bedrooms'] = df['bedrooms'].fillna(df['bedrooms'].median())
    # Convert continuous to discrete variable
    for col in ('beds', 'bathrooms', 'bedrooms'):
        df[col] = df[col].astype('int64')

    # Drop any remaining rows with nulls
    df = df.dropna()

    # listing_count and boolean columns
    for col in ('host_listings_count', 'host_is_superhost', 'host_identity_verified'):
        df[col] = df[col].astype('int64')

    # Regroup and rename property_type / room_type / cancellation_policy
    df['property_type'] = df['property_type'].replace(_PROPERTY_TYPE_GROUPS)
    df['room_type'] = df['room_type'].replace(_ROOM_TYPE_NAMES)
    df['cancellation_policy'] = df['cancellation_policy'].replace(_CANCELLATION_GROUPS)

    # Clear unnecessary characters in amenities (literal, not regex, matches)
    amenities = df['amenities']
    for char in ('{', '}', '"'):
        amenities = amenities.str.replace(char, '', regex=False)
    df['amenities'] = amenities.str.lower()

    ## FEATURE ENGINEER
    # Distance from each listing to each reference point
    for dist_col, (ref_lat, ref_lon) in _LANDMARKS.items():
        df[dist_col] = [distance_to(lat, lon, ref_lat, ref_lon)
                        for lat, lon in zip(df['latitude'], df['longitude'])]

    # get_dummies
    # NOTE(review): host_response_rate is binned above but is not in this
    # list, so it stays a categorical column — confirm that is intended.
    df = pd.get_dummies(df, columns=['property_type',
                                     'room_type',
                                     'cancellation_policy',
                                     'host_response_time',
                                     'host_acceptance_rate',
                                     'review_scores_rating',
                                     'review_scores_accuracy',
                                     'review_scores_cleanliness',
                                     'review_scores_checkin',
                                     'review_scores_communication',
                                     'review_scores_location',
                                     'review_scores_value',
                                     'instant_bookable',
                                     'is_location_exact'
                                    ],
                        drop_first=False)

    # Amenities: one indicator column per raw amenity label...
    all_amenities = df['amenities'].str.get_dummies(sep=',')
    # ...then one count per group. reindex() supplies an all-zero column for
    # any label that does not occur in this batch, instead of a KeyError.
    for group_col, raw_labels in _AMENITY_GROUPS.items():
        df[group_col] = all_amenities.reindex(columns=raw_labels, fill_value=0).sum(axis=1)

    return df