In [1]:
import numpy as np
import pandas as pd
import ast
import math
# sklearn tools
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

from sklearn import metrics, impute
from sklearn.impute import KNNImputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

In [2]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

test.head()

Unnamed: 0,id,listing_location,description,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_neighbourhood,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,917611,chicago,Solo Hyde Park visitors are invited stay in th...,2008-08-29,,My apartment is a 2nd floor walk-up in a vinta...,within an hour,100%,81%,Hyde Park,...,4.99,4.98,4.95,4.94,False,1,0,1,0,2.02
1,329273,chicago,A very small studio in a wonderful neighborhood.,2009-05-19,"Chicago, IL",We live in Chicago. We love to travel and see ...,within an hour,100%,100%,Ukrainian Village,...,4.91,4.88,4.9,4.76,False,1,1,0,0,2.98
2,133046,chicago,Beautiful first floor apartment in Historic Ol...,2009-08-21,"Chicago, IL","Hi, we're Bob and Liz Biddle, long time Chicag...",within an hour,100%,97%,Old Town,...,4.83,4.87,4.97,4.72,True,6,6,0,0,0.88
3,960346,chicago,Located on a peaceful treelined street in ener...,2009-07-25,Italy,i've owned the building since 1993 and been ho...,within a few hours,100%,89%,Bucktown,...,4.94,4.88,4.93,4.72,False,1,1,0,0,1.4
4,386102,chicago,We offer the highest standards of cleanliness....,2011-07-31,"Chicago, IL","I have a small family (partner, Dave and daugh...",within an hour,100%,100%,Logan Square,...,4.97,4.95,4.91,4.88,False,1,1,0,0,4.19


In [3]:
# cols to DROP
    # has_availability -- all True
    # host_neighborhood - idk seems irrelevant
    # neighbourhood_cleansed
    # first_review, last_review
    # room_type
    # 

cols_to_drop = ['has_availability', 'host_neighbourhood', 'neighbourhood_cleansed', 'first_review',
               'last_review', 'room_type', 'host_response_time', 'host_location', 'host_about']

train = train.drop(columns = cols_to_drop)
test = test.drop(columns=cols_to_drop)

In [4]:
def clean_cols(val):
    if isinstance(val, str):
        val = val.replace('$', '').replace('%', '').replace(',', '')
    return float(val)

train[['host_acceptance_rate','host_response_rate']] = train[['host_acceptance_rate','host_response_rate']].map(clean_cols)
test[['host_acceptance_rate','host_response_rate']] = test[['host_acceptance_rate','host_response_rate']].map(clean_cols)

In [5]:
# clean bathrooms_text - convert half-bath to 0.5
train['bathrooms_text'] = train['bathrooms_text'].replace('Half-bath', '0.5')
test['bathrooms_text'] = test['bathrooms_text'].replace('Half-bath', '0.5')

# extract numeric part
train['bathrooms_text'] = train['bathrooms_text'].str.extract(r'(\d+\.?\d*)')[0].astype(float)
test['bathrooms_text'] = test['bathrooms_text'].str.extract(r'(\d+\.?\d*)')[0].astype(float)

test = test.rename(columns={'bathrooms_text': 'bathrooms'})
train = train.rename(columns={'bathrooms_text': 'bathrooms'})

In [6]:
# host identity verified - convert to 0/1
# instant bookable - convert to 0/1
# host profile pic - convert to 0/1

train['instant_bookable'] = train['instant_bookable'].fillna(False).astype(int)
test['instant_bookable'] = test['instant_bookable'].fillna(False).astype(int)

train['host_identity_verified'] = train['host_identity_verified'].fillna(False).astype(int)
test['host_identity_verified'] = test['host_identity_verified'].fillna(False).astype(int)

train['host_has_profile_pic'] = train['host_has_profile_pic'].fillna(False).astype(int)
test['host_has_profile_pic'] = test['host_has_profile_pic'].fillna(False).astype(int)

  train['host_identity_verified'] = train['host_identity_verified'].fillna(False).astype(int)
  test['host_identity_verified'] = test['host_identity_verified'].fillna(False).astype(int)
  train['host_has_profile_pic'] = train['host_has_profile_pic'].fillna(False).astype(int)
  test['host_has_profile_pic'] = test['host_has_profile_pic'].fillna(False).astype(int)


In [7]:
# property type -- make new col, 1 if entire property and 0 otherwise

train['is_entire_place'] = train['property_type'].str.contains('entire', case=False, na=False).astype(int)
test['is_entire_place'] = test['property_type'].str.contains('entire', case=False, na=False).astype(int)

train = train.drop('property_type', axis=1)
test = test.drop('property_type', axis=1)

In [8]:
# description -- make new col, 1 if contains 'luxury' words

keywords = ['luxury', 'luxurious', 'penthouse', 'exclusive', 'elegant',
            'premium', 'high-end', 'designer', 'upscale', 'chic',
            'modern', 'deluxe', 'sophisticated', 'breathtaking','custom-built', 'architect-designed',
            'state-of-the-art','prestigious', 'top-tier', '5-star', 'five-star']

pattern = '|'.join(keywords)
train['luxury_description'] = train['description'].str.contains(pattern, case=False, na=False).astype(int)
test['luxury_description'] = test['description'].str.contains(pattern, case=False, na=False).astype(int)

train = train.drop('description', axis=1)
test = test.drop('description', axis=1)

In [9]:
# OHE location

location_dummies = pd.get_dummies(train['listing_location'], prefix='location', drop_first=True)
train = pd.concat([train, location_dummies], axis=1).drop('listing_location', axis=1)

location_dummies = pd.get_dummies(test['listing_location'], prefix='location', drop_first=True)
test = pd.concat([test, location_dummies], axis=1).drop('listing_location', axis=1)

In [10]:
train[['location_chicago', 'location_kauai']] = train[['location_chicago', 'location_kauai']].astype(int)
test[['location_chicago', 'location_kauai']] = test[['location_chicago', 'location_kauai']].astype(int)

In [11]:
# host_verifications, amenities. -- convert to length

def make_len(val):
    if isinstance(val, float) and math.isnan(val):
        return np.nan
    if isinstance(val, str):
        try:
            val = ast.literal_eval(val)
        except (ValueError, SyntaxError):
            return np.nan
    return len(val)
    

train['host_verifications'] = train['host_verifications'].map(make_len)
train['amenities'] = train['amenities'].map(make_len)

test['host_verifications'] = test['host_verifications'].map(make_len)
test['amenities'] = test['amenities'].map(make_len)

In [12]:
from datetime import datetime

today = pd.to_datetime("today")
train['host_since'] = pd.to_datetime(train['host_since'], errors='coerce')
train['host_since'] = (today - train['host_since']).dt.days // 365

test['host_since'] = pd.to_datetime(test['host_since'], errors='coerce')
test['host_since'] = (today - test['host_since']).dt.days // 365

In [13]:
train.dtypes

id                                                int64
host_since                                      float64
host_response_rate                              float64
host_acceptance_rate                            float64
host_is_superhost                                 int64
host_listings_count                             float64
host_total_listings_count                       float64
host_verifications                              float64
host_has_profile_pic                              int64
host_identity_verified                            int64
latitude                                        float64
longitude                                       float64
accommodates                                      int64
bathrooms                                       float64
bedrooms                                        float64
beds                                            float64
amenities                                         int64
minimum_nights                                  

In [14]:
# IMPUTE MISSING VALS

# fit to train data, transform train and test data

imputer = KNNImputer(n_neighbors=3)

train_impute_cols = [col for col in train.columns if col not in ['id', 'host_is_superhost']]

#scale
scaler = MinMaxScaler()
scaled_train_array = scaler.fit_transform(train[train_impute_cols])
scaled_train_df = pd.DataFrame(scaled_train_array, columns=train_impute_cols, index=train.index)

train_imputed_arr = imputer.fit_transform(scaled_train_df) # first scale
train_unscaled_data = scaler.inverse_transform(train_imputed_arr) # then undo scale

train_imputed = train.copy()
train_imputed[train_impute_cols] = pd.DataFrame(train_unscaled_data, columns=train_impute_cols, index=train.index)


# test - use same scale
test_impute_cols = [col for col in test.columns if col != 'id']


scaled_test_array = scaler.transform(test[test_impute_cols])
scaled_test_df = pd.DataFrame(scaled_test_array, columns=test_impute_cols, index=test.index)

test_imputed_arr = imputer.transform(scaled_test_df)
test_unscaled_data = scaler.inverse_transform(test_imputed_arr)

test_imputed = test.copy()
test_imputed[test_impute_cols] = pd.DataFrame(test_unscaled_data, columns = test_impute_cols, index=test.index)

In [38]:
# slice predictors - no id !! but save for predictions

X_train = train_imputed.drop(columns=['host_is_superhost','id'])
y_train = train_imputed['host_is_superhost']

X_test = test_imputed.drop(columns=['id'])
test_ids = test_imputed['id']

In [40]:
# scale

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)