# Modeling

This script reads in the training and test data, creates the modeling infrastructure via pipelines, and finally trains models to be saved to disk.

The models will be regression focused including:
1. Median (baseline)
2. General Regression with PCA
3. Lasso Regression
4. 

In [85]:
# Your imports
import os

%matplotlib inline
import string
import sys
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

# data
from sklearn.compose import ColumnTransformer, make_column_transformer

# Dummy Regressor
from sklearn.dummy import DummyRegressor

# models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# other
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor, export_graphviz
from sklearn.impute import SimpleImputer

# Stats
from scipy.stats import lognorm, loguniform, randint, uniform

# Shap
import shap

# Metrics
from sklearn.metrics import plot_roc_curve, plot_confusion_matrix, classification_report, accuracy_score, confusion_matrix, f1_score

# Feature Selection
from sklearn.feature_selection import RFE, RFECV

# Show every column when a DataFrame is displayed (no truncation)
pd.set_option("display.max_columns", None)

## Load in the Data

In [6]:
# Import Data
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is what triggers the
# "Columns (20) have mixed types" DtypeWarning on these files.
training_df = pd.read_csv("../data/ready_for_modeling/training_data.csv", low_memory=False)
test_df = pd.read_csv("../data/ready_for_modeling/test_data.csv", low_memory=False)

Columns (20) have mixed types.Specify dtype option on import or set low_memory=False.


In [47]:
# Get on listing basis
def _mean_by_listing(frame):
    """Return the per-listing_id mean of *frame*, with listing_id as a regular column."""
    grouped = frame.groupby('listing_id')
    return grouped.mean().reset_index()


training_df = _mean_by_listing(training_df)
test_df = _mean_by_listing(test_df)

## Quick EDA

In [48]:
# (rows, columns) of the training frame.
training_df.shape

(4025, 213)

In [49]:
# Number of distinct index labels in the training frame.
# NOTE(review): if the intent is to confirm one row per listing, counting
# distinct training_df['listing_id'] values would test that directly - confirm.
training_df.index.nunique()

4025

In [89]:
# Preview the first rows of the training frame.
training_df.head()

Unnamed: 0,listing_id,price,review_count,word_clean_use,word_place_use,word_stay_use,word_would_use,word_nice_use,word_great_use,word_easy_use,word_host_use,word_location_use,word_everything_use,word_comfortable_use,word_recommend_use,word_room_use,positive_score_mean,negative_score_mean,neutral_score_mean,average_comment_length,review_frequency,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,week_of_year_1,week_of_year_2,week_of_year_3,week_of_year_4,week_of_year_5,week_of_year_6,week_of_year_7,week_of_year_8,week_of_year_9,week_of_year_10,week_of_year_11,week_of_year_12,week_of_year_13,week_of_year_14,week_of_year_15,week_of_year_16,week_of_year_17,week_of_year_18,week_of_year_19,week_of_year_20,week_of_year_21,week_of_year_22,week_of_year_23,week_of_year_24,week_of_year_25,week_of_year_26,week_of_year_27,week_of_year_28,week_of_year_29,week_of_year_30,week_of_year_31,week_of_year_32,week_of_year_33,week_of_year_34,week_of_year_35,week_of_year_36,week_of_year_37,week_of_year_38,week_of_year_39,week_of_year_40,week_of_year_41,week_of_year_42,week_of_year_43,week_of_year_44,week_of_year_45,week_of_year_46,week_of_year_47,week_of_year_48,week_of_year_49,week_of_year_50,week_of_year_51,week_of_year_52,month_of_year_1,month_of_year_2,month_of_year_3,month_of_year_4,month_of_year_5,month_of_year_6,month_of_year_7,month_of_year_8,month_of_year_9,month_of_year_10,month_of_year_11,month_of_year_12,year_2021,year_2022,holiday_False,holiday_True,available,minimum_nights_x,maximum_nights_x,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,latitude,longitude,accommodates,bedrooms,beds,listing_price,minimum_nights_y,maximum_nights_y,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availabilit
y_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,desc_apartment,desc_located,desc_space,desc_home,desc_bed,desc_room,desc_kitchen,desc_access,desc_one,desc_private,desc_san,desc_francisco,desc_bathroom,desc_bedroom,desc_living,host_in_sf,host_verifications_email,host_verifications_facebook,host_verifications_google,host_verifications_government_id,host_verifications_identity_manual,host_verifications_jumio,host_verifications_kba,host_verifications_manual_offline,host_verifications_manual_online,host_verifications_offline_government_id,host_verifications_phone,host_verifications_reviews,host_verifications_selfie,host_verifications_sent_id,host_verifications_work_email,host_verifications_zhima_selfie,bathroom_private,bathroom_shared,bathroom_half,bathroom_count,amenities_Wifi,amenities_Smoke alarm,amenities_Essentials,amenities_Heating,amenities_Hangers,amenities_Carbon monoxide alarm,amenities_Hair dryer,amenities_Iron,amenities_Long term stays allowed,amenities_Kitchen,amenities_Shampoo,amenities_Dedicated workspace,amenities_Hot water,amenities_Washer,amenities_Fire extinguisher,amenities_Dryer,amenities_Coffee maker,amenities_Refrigerator,amenities_Microwave,amenities_Dishes and silverware,amenities_Bed linens,amenities_TV,amenities_Cooking basics,amenities_First aid kit,amenities_Private entrance,amenities_Free street parking,amenities_Oven,amenities_Stove,amenities_Extra pillows and blankets,amenities_Dishwasher,review_span,t_since_last_review,t_as_host,has_license
0,958,165.131507,286,0.475524,0.129371,0.15035,0.199301,0.206294,0.562937,0.171329,0.304196,0.195804,0.199301,0.013986,0.493007,0.48951,0.472266,0.015028,0.50921,30.727273,0.109915,162.192308,162.980769,163.153846,163.538462,170.096154,171.169811,162.673077,168.428571,172.0,172.0,173.428571,166.0,174.285714,176.0,173.571429,163.142857,161.428571,161.428571,161.428571,162.285714,162.0,161.285714,162.285714,162.428571,162.428571,162.285714,163.0,163.285714,163.142857,163.0,164.428571,164.285714,165.714286,167.714286,167.428571,171.285714,172.857143,172.857143,172.285714,171.428571,169.857143,168.142857,166.142857,164.857143,163.857143,163.142857,162.714286,162.285714,161.714286,161.428571,161.142857,160.857143,160.571429,160.285714,160.5,161.428571,161.428571,160.0,156.285714,169.0,172.107143,161.548387,162.4,162.483871,163.733333,169.903226,170.741935,165.433333,162.225806,160.666667,161.741935,161.857143,165.403561,165.028818,167.111111,0.345205,2.0,1125.0,1169,1.0,0.92,1.0,1.0,1.0,1.0,1.0,,37.77028,-122.43317,3,1.0,2.0,161.0,2,30,2.0,2.0,1125.0,1125.0,2.0,1125.0,1,0,5,16,126,314,43,2,4.87,4.94,4.95,4.95,4.9,4.98,4.78,0,1,1,0,0,3.6,2,2,0,2,0,0,1,1,0,1,0,0,2,0,1,1,1,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0.0,0.0,0.0,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0,2602.0,188.0,5047.0,1
1,5858,235.0,106,0.377358,0.075472,0.103774,0.103774,0.216981,0.386792,0.113208,0.235849,0.216981,0.084906,0.179245,0.179245,0.471698,0.492509,0.009821,0.497679,31.537736,0.050404,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,235.0,1.0,30.0,60.0,8904,1.0,0.68,0.0,2.0,2.0,1.0,1.0,,37.74474,-122.42089,5,2.0,3.0,235.0,30,60,30.0,30.0,60.0,60.0,30.0,60.0,1,30,60,90,365,111,0,0,4.88,4.85,4.87,4.89,4.85,4.77,4.68,0,1,1,0,0,0.76,1,0,0,0,0,0,0,2,0,0,1,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0.0,0.0,0.0,1.0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,2103.0,2463.0,4833.0,0
2,7918,62.065753,17,0.588235,0.470588,0.0,0.235294,0.117647,0.588235,0.0,0.058824,0.352941,0.117647,0.529412,0.294118,0.352941,0.391647,0.056176,0.552059,30.470588,0.006186,62.057692,62.057692,62.057692,62.057692,62.057692,62.113208,62.057692,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,58.571429,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,62.75,56.0,56.0,56.0,56.0,56.0,56.0,56.0,64.7,65.0,65.0,65.0,65.0,65.0,65.0,65.0,56.870968,56.0,62.569733,62.069164,62.0,1.0,32.0,60.0,21994,1.0,1.0,0.0,10.0,10.0,1.0,1.0,,37.76555,-122.45213,2,1.0,1.0,56.0,32,60,32.0,32.0,60.0,60.0,32.0,60.0,1,30,60,90,365,19,0,0,4.2,3.73,3.87,4.67,4.6,4.73,4.0,0,9,0,9,0,0.17,0,1,0,1,0,0,0,1,0,1,0,3,6,0,1,1,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0.0,1.0,0.0,4.0,1,1,0,1,1,1,0,0,1,1,0,0,1,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,2748.0,811.0,4726.0,0
3,8142,62.065753,6,0.5,0.333333,0.166667,0.166667,0.166667,0.666667,0.166667,0.333333,0.5,0.333333,0.0,0.166667,0.5,0.4515,0.027,0.521667,25.0,0.006719,62.057692,62.057692,62.057692,62.057692,62.057692,62.113208,62.057692,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,56.0,58.571429,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,65.0,62.75,56.0,56.0,56.0,56.0,56.0,56.0,56.0,64.7,65.0,65.0,65.0,65.0,65.0,65.0,65.0,56.870968,56.0,62.569733,62.069164,62.0,1.0,32.0,90.0,21994,1.0,1.0,0.0,10.0,10.0,1.0,1.0,,37.76555,-122.45213,2,1.0,1.0,56.0,32,90,32.0,32.0,90.0,90.0,32.0,90.0,1,30,60,90,365,8,0,0,4.63,4.38,4.38,4.75,4.75,4.63,4.63,0,9,0,9,0,0.1,0,1,0,1,0,0,0,1,0,1,0,3,6,0,1,1,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0.0,1.0,0.0,4.0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,893.0,1748.0,4726.0,0
4,8339,895.0,28,0.535714,0.25,0.25,0.071429,0.428571,0.678571,0.321429,0.285714,0.535714,0.214286,0.178571,0.321429,0.642857,0.441857,0.022679,0.535429,60.607143,0.007859,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,895.0,0.99726,7.0,111.0,24215,1.0,0.0,0.0,2.0,2.0,1.0,1.0,,37.77564,-122.43642,4,2.0,2.0,895.0,7,111,7.0,7.0,111.0,111.0,7.0,111.0,1,29,59,89,364,28,0,0,4.87,4.88,5.0,4.94,5.0,4.94,4.75,0,2,2,0,0,0.19,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0.0,0.0,0.0,1.5,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,3563.0,1063.0,4711.0,1


In [51]:
# (rows, columns) of the test frame.
test_df.shape

(1006, 213)

In [52]:
# Number of distinct index labels in the test frame.
# NOTE(review): if the intent is to confirm one row per listing, counting
# distinct test_df['listing_id'] values would test that directly - confirm.
test_df.index.nunique()

1006

In [90]:
# Preview the first rows of the test frame.
test_df.head()

Unnamed: 0,listing_id,price,review_count,word_clean_use,word_place_use,word_stay_use,word_would_use,word_nice_use,word_great_use,word_easy_use,word_host_use,word_location_use,word_everything_use,word_comfortable_use,word_recommend_use,word_room_use,positive_score_mean,negative_score_mean,neutral_score_mean,average_comment_length,review_frequency,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,week_of_year_1,week_of_year_2,week_of_year_3,week_of_year_4,week_of_year_5,week_of_year_6,week_of_year_7,week_of_year_8,week_of_year_9,week_of_year_10,week_of_year_11,week_of_year_12,week_of_year_13,week_of_year_14,week_of_year_15,week_of_year_16,week_of_year_17,week_of_year_18,week_of_year_19,week_of_year_20,week_of_year_21,week_of_year_22,week_of_year_23,week_of_year_24,week_of_year_25,week_of_year_26,week_of_year_27,week_of_year_28,week_of_year_29,week_of_year_30,week_of_year_31,week_of_year_32,week_of_year_33,week_of_year_34,week_of_year_35,week_of_year_36,week_of_year_37,week_of_year_38,week_of_year_39,week_of_year_40,week_of_year_41,week_of_year_42,week_of_year_43,week_of_year_44,week_of_year_45,week_of_year_46,week_of_year_47,week_of_year_48,week_of_year_49,week_of_year_50,week_of_year_51,week_of_year_52,month_of_year_1,month_of_year_2,month_of_year_3,month_of_year_4,month_of_year_5,month_of_year_6,month_of_year_7,month_of_year_8,month_of_year_9,month_of_year_10,month_of_year_11,month_of_year_12,year_2021,year_2022,holiday_False,holiday_True,available,minimum_nights_x,maximum_nights_x,host_id,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_group_cleansed,latitude,longitude,accommodates,bedrooms,beds,listing_price,minimum_nights_y,maximum_nights_y,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availabilit
y_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,desc_apartment,desc_located,desc_space,desc_home,desc_bed,desc_room,desc_kitchen,desc_access,desc_one,desc_private,desc_san,desc_francisco,desc_bathroom,desc_bedroom,desc_living,host_in_sf,host_verifications_email,host_verifications_facebook,host_verifications_google,host_verifications_government_id,host_verifications_identity_manual,host_verifications_jumio,host_verifications_kba,host_verifications_manual_offline,host_verifications_manual_online,host_verifications_offline_government_id,host_verifications_phone,host_verifications_reviews,host_verifications_selfie,host_verifications_sent_id,host_verifications_work_email,host_verifications_zhima_selfie,bathroom_private,bathroom_shared,bathroom_half,bathroom_count,amenities_Wifi,amenities_Smoke alarm,amenities_Essentials,amenities_Heating,amenities_Hangers,amenities_Carbon monoxide alarm,amenities_Hair dryer,amenities_Iron,amenities_Long term stays allowed,amenities_Kitchen,amenities_Shampoo,amenities_Dedicated workspace,amenities_Hot water,amenities_Washer,amenities_Fire extinguisher,amenities_Dryer,amenities_Coffee maker,amenities_Refrigerator,amenities_Microwave,amenities_Dishes and silverware,amenities_Bed linens,amenities_TV,amenities_Cooking basics,amenities_First aid kit,amenities_Private entrance,amenities_Free street parking,amenities_Oven,amenities_Stove,amenities_Extra pillows and blankets,amenities_Dishwasher,review_span,t_since_last_review,t_as_host,has_license
0,24723,189.876712,323,0.349845,0.19195,0.160991,0.182663,0.247678,0.560372,0.25387,0.287926,0.219814,0.213622,0.027864,0.52322,0.4613,0.460347,0.016907,0.522762,34.405573,0.07997,189.865385,189.884615,189.884615,189.884615,189.884615,189.867925,189.865385,189.0,189.0,189.0,189.0,189.857143,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,189.75,189.0,189.285714,190.0,189.714286,189.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,190.0,189.548387,189.5,189.908012,189.876081,189.888889,0.087671,4.0,1125.0,100800,1.0,1.0,1.0,3.0,3.0,1.0,1.0,,37.78897,-122.43376,4,1.0,1.0,189.0,4,31,4.0,4.0,1125.0,1125.0,4.0,1125.0,1,1,16,32,32,359,27,5,4.82,4.87,4.91,4.91,4.93,4.9,4.81,1,2,2,0,0,2.66,0,1,0,4,5,1,0,0,2,1,3,0,2,1,3,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0.0,0.0,0.0,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,0,1,1,1,0,0,0,0,1,4039.0,177.0,4441.0,1
1,25094,160.0,49,0.285714,0.122449,0.122449,0.163265,0.22449,0.367347,0.142857,0.163265,0.326531,0.204082,0.183673,0.204082,0.632653,0.473388,0.012592,0.514061,38.816327,0.056845,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,160.0,0.967123,5.0,45.0,103339,1.0,0.94,1.0,1.0,1.0,1.0,1.0,,37.74159,-122.43159,2,1.0,1.0,160.0,5,45,5.0,5.0,45.0,45.0,5.0,45.0,1,18,48,78,353,50,8,1,4.96,5.0,5.0,4.98,5.0,4.98,4.9,0,1,0,1,0,1.68,0,0,0,0,1,1,0,0,0,0,0,1,0,2,0,1,1,1,0,0,0,0,1,0,0,0,1,1,0,0,1,0,1.0,0.0,0.0,1.0,1,1,0,1,0,1,1,1,0,0,0,0,0,1,1,1,1,0,0,0,0,1,0,0,0,1,0,0,0,1,862.0,205.0,4435.0,1
2,32414,150.0,89,0.460674,0.314607,0.224719,0.280899,0.258427,0.483146,0.303371,0.370787,0.325843,0.247191,0.022472,0.202247,0.573034,0.431022,0.021933,0.547079,42.022472,0.044257,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,0.342466,30.0,30.0,140251,,,0.0,1.0,1.0,1.0,0.0,,37.74038,-122.41327,2,1.0,2.0,150.0,30,30,30.0,30.0,30.0,30.0,30.0,30.0,1,0,0,0,125,94,0,0,4.81,4.94,4.96,4.96,4.94,4.48,4.61,0,1,1,0,0,1.0,1,0,0,0,0,2,0,0,0,2,0,0,0,2,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0.0,0.0,0.0,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,2011.0,970.0,4371.0,1
3,39418,78.868493,21,0.47619,0.333333,0.142857,0.190476,0.428571,0.428571,0.190476,0.142857,0.190476,0.238095,0.0,0.142857,0.571429,0.429,0.017238,0.553714,34.714286,0.013419,78.788462,78.923077,78.923077,78.923077,78.923077,78.811321,78.788462,73.0,73.0,73.0,73.0,79.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,78.25,73.0,73.0,73.0,73.0,73.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,73.677419,73.0,79.356083,78.870317,78.833333,0.090411,60.0,730.0,169184,,0.0,0.0,2.0,2.0,1.0,1.0,,37.741,-122.42018,2,,1.0,73.0,60,730,60.0,60.0,730.0,730.0,60.0,730.0,1,0,0,0,33,22,0,0,4.9,4.77,4.82,4.82,4.91,4.86,4.67,0,2,2,0,0,0.27,1,2,0,0,2,3,3,0,0,1,1,0,0,4,1,1,1,0,0,1,0,1,0,0,0,1,1,1,0,0,0,0,0.0,0.0,0.0,1.0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1565.0,1010.0,4331.0,0
4,41172,125.0,238,0.546218,0.436975,0.12605,0.231092,0.222689,0.504202,0.277311,0.176471,0.285714,0.159664,0.344538,0.130252,0.457983,0.472189,0.012962,0.514819,38.382353,0.064151,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0,0.923288,3.0,30.0,53314,,,1.0,1.0,1.0,1.0,1.0,,37.74132,-122.41818,2,1.0,1.0,125.0,3,30,3.0,3.0,30.0,30.0,3.0,30.0,1,22,46,62,337,277,14,2,4.9,4.93,4.84,4.96,4.97,4.85,4.84,0,1,0,1,0,2.24,0,0,1,0,1,1,1,0,0,0,0,2,0,1,1,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1.0,0.0,0.0,1.0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,3710.0,180.0,4579.0,1


## Processing and Transformations

In [156]:
# Preview the numeric and binary columns that are fed to the models.
# NOTE(review): X_train, numeric_features and binary_features are assigned in
# cells that appear further down this file, so this cell only works after
# those cells have been run.
X_train[numeric_features + binary_features].head()

Unnamed: 0,review_count,word_clean_use,word_place_use,word_stay_use,word_would_use,word_nice_use,word_great_use,word_easy_use,word_host_use,word_location_use,word_everything_use,word_comfortable_use,word_recommend_use,word_room_use,positive_score_mean,negative_score_mean,neutral_score_mean,average_comment_length,available,host_response_rate,host_acceptance_rate,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_30,availability_60,availability_90,availability_365,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,desc_apartment,desc_located,desc_space,desc_home,desc_bed,desc_room,desc_kitchen,desc_access,desc_one,desc_private,desc_san,desc_francisco,desc_bathroom,desc_bedroom,desc_living,review_span,t_since_last_review,t_as_host,bathroom_private,bathroom_half,bathroom_count,bathroom_shared,host_is_superhost,host_has_profile_pic,host_identity_verified,has_availability,instant_bookable,host_in_sf,host_verifications_email,host_verifications_facebook,host_verifications_google,host_verifications_government_id,host_verifications_identity_manual,host_verifications_jumio,host_verifications_kba,host_verifications_manual_offline,host_verifications_manual_online,host_verifications_offline_government_id,host_verifications_phone,host_verifications_reviews,host_verifications_selfie,host_verifications_sent_id,host_verifications_work_email,host_verifications_zhima_selfie,amenities_Wifi,amenities_Smoke alarm,amenities_Essentials,amenities_Heating,amenities_Hangers,amenities_Carbon monoxide alarm,amenities_Hair 
dryer,amenities_Iron,amenities_Long term stays allowed,amenities_Kitchen,amenities_Shampoo,amenities_Dedicated workspace,amenities_Hot water,amenities_Washer,amenities_Fire extinguisher,amenities_Dryer,amenities_Coffee maker,amenities_Refrigerator,amenities_Microwave,amenities_Dishes and silverware,amenities_Bed linens,amenities_TV,amenities_Cooking basics,amenities_First aid kit,amenities_Private entrance,amenities_Free street parking,amenities_Oven,amenities_Stove,amenities_Extra pillows and blankets,amenities_Dishwasher,has_license
0,286,0.475524,0.129371,0.15035,0.199301,0.206294,0.562937,0.171329,0.304196,0.195804,0.199301,0.013986,0.493007,0.48951,0.472266,0.015028,0.50921,30.727273,0.345205,1.0,0.92,1.0,1.0,37.77028,-122.43317,3,1.0,2.0,2.0,1125.0,0,5,16,126,314,43,2,4.87,4.94,4.95,4.95,4.9,4.98,4.78,1,1,0,0,3.6,2,2,0,2,0,0,1,1,0,1,0,0,2,0,1,2602.0,188.0,5047.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1,0,1,1,1,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0,1
1,106,0.377358,0.075472,0.103774,0.103774,0.216981,0.386792,0.113208,0.235849,0.216981,0.084906,0.179245,0.179245,0.471698,0.492509,0.009821,0.497679,31.537736,1.0,1.0,0.68,2.0,2.0,37.74474,-122.42089,5,2.0,3.0,30.0,60.0,30,60,90,365,111,0,0,4.88,4.85,4.87,4.89,4.85,4.77,4.68,1,1,0,0,0.76,1,0,0,0,0,0,0,2,0,0,1,0,1,0,1,2103.0,2463.0,4833.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
2,17,0.588235,0.470588,0.0,0.235294,0.117647,0.588235,0.0,0.058824,0.352941,0.117647,0.529412,0.294118,0.352941,0.391647,0.056176,0.552059,30.470588,1.0,1.0,1.0,10.0,10.0,37.76555,-122.45213,2,1.0,1.0,32.0,60.0,30,60,90,365,19,0,0,4.2,3.73,3.87,4.67,4.6,4.73,4.0,9,0,9,0,0.17,0,1,0,1,0,0,0,1,0,1,0,3,6,0,1,2748.0,811.0,4726.0,0.0,0.0,4.0,1.0,0.0,1.0,1.0,1,0,1,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,1,1,1,0,0,1,1,0,0,1,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0
3,6,0.5,0.333333,0.166667,0.166667,0.166667,0.666667,0.166667,0.333333,0.5,0.333333,0.0,0.166667,0.5,0.4515,0.027,0.521667,25.0,1.0,1.0,1.0,10.0,10.0,37.76555,-122.45213,2,1.0,1.0,32.0,90.0,30,60,90,365,8,0,0,4.63,4.38,4.38,4.75,4.75,4.63,4.63,9,0,9,0,0.1,0,1,0,1,0,0,0,1,0,1,0,3,6,0,1,893.0,1748.0,4726.0,0.0,0.0,4.0,1.0,0.0,1.0,1.0,1,0,1,1,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,1,1,0,1,0,1,0,0,1,1,0,0,1,1,1,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0
4,28,0.535714,0.25,0.25,0.071429,0.428571,0.678571,0.321429,0.285714,0.535714,0.214286,0.178571,0.321429,0.642857,0.441857,0.022679,0.535429,60.607143,0.99726,1.0,0.0,2.0,2.0,37.77564,-122.43642,4,2.0,2.0,7.0,111.0,29,59,89,364,28,0,0,4.87,4.88,5.0,4.94,5.0,4.94,4.75,2,2,0,0,0.19,0,1,0,0,0,1,0,0,0,1,0,0,0,1,1,3563.0,1063.0,4711.0,0.0,0.0,1.5,0.0,0.0,1.0,1.0,1,0,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,0,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1


In [161]:
# Numeric Columns
# Families that share a naming pattern are generated from their varying part;
# the resulting order is the order the columns are handed to the transformer.
numeric_features = [
    "review_count",
    # word_<token>_use columns
    *(f"word_{token}_use" for token in (
        "clean", "place", "stay", "would", "nice", "great", "easy", "host",
        "location", "everything", "comfortable", "recommend", "room",
    )),
    "positive_score_mean",
    "negative_score_mean",
    "neutral_score_mean",
    "average_comment_length",
    "available",
    "host_listings_count",
    "host_total_listings_count",
    "latitude",
    "longitude",
    "accommodates",
    "bedrooms",
    "beds",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    # availability_<days> columns
    *(f"availability_{days}" for days in (30, 60, 90, 365)),
    "number_of_reviews",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    # review_scores_<aspect> columns
    *(f"review_scores_{aspect}" for aspect in (
        "rating", "accuracy", "cleanliness", "checkin", "communication",
        "location", "value",
    )),
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "reviews_per_month",
    # desc_<token> columns
    *(f"desc_{token}" for token in (
        "apartment", "located", "space", "home", "bed", "room", "kitchen",
        "access", "one", "private", "san", "francisco", "bathroom",
        "bedroom", "living",
    )),
    "review_span",
    "t_since_last_review",
    "t_as_host",
    "bathroom_private",
    "bathroom_half",
    "bathroom_count",
    "bathroom_shared",
]

# Binary Columns
# The host_verifications_* and amenities_* families are generated from their
# varying part; order matches the order handed to the transformer.
binary_features = [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "has_availability",
    "instant_bookable",
    "host_in_sf",
    *(f"host_verifications_{method}" for method in (
        "email", "facebook", "google", "government_id", "identity_manual",
        "jumio", "kba", "manual_offline", "manual_online",
        "offline_government_id", "phone", "reviews", "selfie", "sent_id",
        "work_email", "zhima_selfie",
    )),
    *(f"amenities_{amenity}" for amenity in (
        "Wifi", "Smoke alarm", "Essentials", "Heating", "Hangers",
        "Carbon monoxide alarm", "Hair dryer", "Iron",
        "Long term stays allowed", "Kitchen", "Shampoo",
        "Dedicated workspace", "Hot water", "Washer", "Fire extinguisher",
        "Dryer", "Coffee maker", "Refrigerator", "Microwave",
        "Dishes and silverware", "Bed linens", "TV", "Cooking basics",
        "First aid kit", "Private entrance", "Free street parking", "Oven",
        "Stove", "Extra pillows and blankets", "Dishwasher",
    )),
    "has_license",
]

# Drop Columns
# Columns handed to the preprocessor's "drop" rule. The day/week/month
# indicator families are generated from their index ranges rather than typed
# out by hand, so an entry cannot be silently skipped or misspelt.
drop_features = [
    *(f"day_of_week_{day}" for day in range(7)),          # day_of_week_0 .. day_of_week_6
    *(f"week_of_year_{week}" for week in range(1, 53)),   # week_of_year_1 .. week_of_year_52
    *(f"month_of_year_{month}" for month in range(1, 13)),  # month_of_year_1 .. month_of_year_12
    'year_2021',
    'year_2022',
    'holiday_False',
    'holiday_True',
    'minimum_nights_x',
    'maximum_nights_x',
    'minimum_nights_y',
    'maximum_nights_y',
    'host_id',
    'listing_price',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'neighbourhood_group_cleansed',
    'listing_id',
    'review_frequency',
    'host_response_rate',
    'host_acceptance_rate',
]

# Target Columns
target_features = ['price']

# Check to make sure all columns included: every column of training_df must
# appear in exactly one of the numeric / binary / drop / target groups.
# Comparing names (not just counts) catches a misspelt name that happens to be
# offset by a forgotten column, and an explicit raise still runs under -O.
_assigned_columns = pd.Index(numeric_features + binary_features + drop_features + target_features)
_duplicated_columns = sorted(set(_assigned_columns[_assigned_columns.duplicated()]))
_unassigned_columns = sorted(set(training_df.columns) - set(_assigned_columns))
_unknown_columns = sorted(set(_assigned_columns) - set(training_df.columns))
if _duplicated_columns or _unassigned_columns or _unknown_columns:
    raise ValueError(
        "Feature groups do not match training_df columns: "
        f"listed more than once={_duplicated_columns}, "
        f"not assigned to any group={_unassigned_columns}, "
        f"not present in training_df={_unknown_columns}"
    )

In [162]:
def _downcast_integer_if_numeric(column):
    """Return *column* downcast to the smallest integer dtype that holds it.

    A column that pandas cannot convert to numbers is returned unchanged. This
    is the explicit form of ``pd.to_numeric(..., errors='ignore')``, which is
    deprecated in pandas 2.2.
    """
    try:
        return pd.to_numeric(column, downcast='integer')
    except (ValueError, TypeError):
        return column


training_df[binary_features] = training_df[binary_features].apply(_downcast_integer_if_numeric)

In [189]:
# Define Pipelines
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
)

# Binary columns: fill missing values with the most frequent value; no scaling.
binary_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
)

# Define Preprocessor
# Each rule pairs a transformer (or the "drop" keyword) with the columns it
# applies to.
_column_rules = [
    ("drop", drop_features),
    (numeric_transformer, numeric_features),
    (binary_transformer, binary_features),
]
preprocessor = make_column_transformer(*_column_rules)

In [190]:
# Define X and y
# Features are every column except the target; the target is the price column.
X_train, y_train = training_df.drop(columns="price"), training_df['price']
X_test, y_test = test_df.drop(columns="price"), test_df['price']

In [205]:
# Borrowed from lecture 5!  (Code written not by me)
def mean_std_cross_val_scores(model,
                              X_train,
                              y_train,
                              scoring_metric="r2"):
    """
    Summarise 5-fold cross-validation as "mean (+/- std)" strings.

    Parameters
    ----------
    model :
        Estimator or pipeline passed to ``cross_validate``.
    X_train, y_train :
        Features and target passed to ``cross_validate``.
    scoring_metric :
        Forwarded as ``scoring`` to ``cross_validate``; callers in this file
        pass either a single name or a list of names.

    Returns
    -------
    pandas.Series
        One entry per key returned by ``cross_validate`` (fit/score times plus
        test and train scores), each formatted to three decimals as
        ``"mean (+/- std)"`` across the folds.
    """
    fold_scores = pd.DataFrame(
        cross_validate(model,
                       X_train,
                       y_train,
                       return_train_score=True,
                       scoring=scoring_metric,
                       cv=5)
    )

    mean_scores = fold_scores.mean()
    std_scores = fold_scores.std()

    # Pair the values by position with zip instead of indexing the
    # string-labelled Series with integers, which relies on a deprecated
    # positional fallback in pandas.
    out_col = [
        f"{mean:0.3f} (+/- {std:0.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [206]:
# Cross-validation summaries keyed by model name; filled in by the cells below.
results = {}

In [207]:
# Baseline: always predict the median of the training target.
dummy_pipeline = make_pipeline(
    preprocessor,
    DummyRegressor(strategy="median"),
)
dummy_scores = mean_std_cross_val_scores(
    dummy_pipeline,
    X_train,
    y_train,
    scoring_metric=['r2', 'neg_median_absolute_error'],
)
results['DummyRegressor'] = dummy_scores
pd.DataFrame(results)

Unnamed: 0,DummyRegressor
fit_time,0.047 (+/- 0.002)
score_time,0.009 (+/- 0.000)
test_r2,-0.072 (+/- 0.037)
train_r2,-0.039 (+/- 0.014)
test_neg_median_absolute_error,-61.962 (+/- 5.892)
train_neg_median_absolute_error,-61.419 (+/- 1.708)


In [211]:
# Regularised linear regressors to compare against the baseline, keyed by the
# label used for them in the results table. Each is given max_iter=10_000;
# every other hyperparameter is left at its scikit-learn default.
models = {
    "Ridge": Ridge(max_iter=10_000),
    "Lasso": Lasso(max_iter=10_000),
    "Elastic Net": ElasticNet(max_iter=10_000)
}

In [212]:
# Cross-validate every candidate behind the shared preprocessor and record its
# formatted scores under the model's label.
for model, estimator in models.items():
    pipe = make_pipeline(preprocessor, estimator)
    results[model] = mean_std_cross_val_scores(
        pipe,
        X_train,
        y_train,
        scoring_metric=['r2', 'neg_median_absolute_error'],
    )

In [213]:
# Transposed so each model is a row and each cross-validation statistic a column.
pd.DataFrame(results).T

Unnamed: 0,fit_time,score_time,test_r2,train_r2,test_neg_median_absolute_error,train_neg_median_absolute_error
DummyRegressor,0.047 (+/- 0.002),0.009 (+/- 0.000),-0.072 (+/- 0.037),-0.039 (+/- 0.014),-61.962 (+/- 5.892),-61.419 (+/- 1.708)
Ridge,0.067 (+/- 0.008),0.014 (+/- 0.001),-0.075 (+/- 0.120),0.407 (+/- 0.238),-108.180 (+/- 17.480),-97.815 (+/- 16.394)
Lasso,1.592 (+/- 0.883),0.014 (+/- 0.000),-0.035 (+/- 0.083),0.402 (+/- 0.240),-99.497 (+/- 13.693),-90.547 (+/- 16.115)
Elastic Net,0.076 (+/- 0.000),0.014 (+/- 0.000),-0.026 (+/- 0.082),0.277 (+/- 0.149),-94.481 (+/- 15.123),-89.032 (+/- 12.828)
