# Importaciones

In [585]:
"""Módulo para crear un modelo predictivo de precios de Airbnb en la Comunidad de Madrid."""

import joblib
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import LabelEncoder

# Descargar los datos

In [11]:
# URL of the Inside Airbnb listings snapshot for Madrid, dated 2024-03-22 (gzip-compressed CSV).
data_location = "https://data.insideairbnb.com/spain/comunidad-de-madrid/madrid/2024-03-22/data/listings.csv.gz"

In [12]:
!wget https://data.insideairbnb.com/spain/comunidad-de-madrid/madrid/2024-03-22/data/listings.csv.gz

--2024-06-01 19:46:38--  https://data.insideairbnb.com/spain/comunidad-de-madrid/madrid/2024-03-22/data/listings.csv.gz
Resolving data.insideairbnb.com (data.insideairbnb.com)... 18.154.48.59, 18.154.48.87, 18.154.48.41, ...
Connecting to data.insideairbnb.com (data.insideairbnb.com)|18.154.48.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13297683 (13M) [application/x-gzip]
Saving to: ‘listings.csv.gz’


2024-06-01 19:46:39 (17.1 MB/s) - ‘listings.csv.gz’ saved [13297683/13297683]



In [558]:
# Inventory of the columns in listings.csv with the planned treatment of each:
# which are removed, which require text processing, which need conversion.
# The cells visible in this notebook do not read this list; they select
# columns through `cols_of_interest` and `cols_text` instead.
columns = [
    'id',                     # Remove - identifier
    'listing_url',            # Remove - URL with the ID at the end
    'scrape_id',              # Remove - detail of the scrape
    'last_scraped',           # Remove - detail of the scrape
    'source',                 # Remove - detail of the scrape
    'name',                   # Do text processing
    'description',            # Do text processing
    'neighborhood_overview',  # Do text processing
    'picture_url',            # Remove - URL
    'host_id',                # Could learn relation between host and price
    'host_url',               # Remove - redundant with host_id
    'host_name',              # Do text processing
    'host_since',             # Convert to number of months or years
    'host_location',          # Do text processing
    'host_about',             # Do text processing
    'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_is_superhost',
    'host_thumbnail_url',     # Remove
    'host_picture_url',       # Remove
    'host_neighbourhood',     # Do text processing
    'host_listings_count', 'host_total_listings_count',
    'host_verifications',     # Expanded into per-method verification flags
    'host_has_profile_pic', 'host_identity_verified',
    'neighbourhood',          # Remove - redundant
    'neighbourhood_cleansed', # Do text processing
    'neighbourhood_group_cleansed',
    'latitude', 'longitude',  # Consider removing - redundant with neighbourhood_cleansed
    'property_type',          # Do text processing
    'room_type', 'accommodates', 'bathrooms',
    'bathrooms_text',         # Remove - largely redundant
    'bedrooms', 'beds', 'amenities',
    'price',                  # Dependent variable
    'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
    'maximum_minimum_nights', 'minimum_maximum_nights',
    'maximum_maximum_nights', 'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'calendar_updated',       # Remove - no non-null values
    'has_availability',
    'availability_30', 'availability_60', 'availability_90',
    'availability_365',
    'calendar_last_scraped',  # Remove - same across all rows
    'number_of_reviews',
    'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'first_review', 'last_review',  # Convert to time elapsed since the review
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    'license',                # Remove
    'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms', 'reviews_per_month'
]

# Columns that must be read as pandas nullable strings rather than letting
# read_csv infer a dtype. The order is kept as originally declared.
string_columns = (
    # Target and flag/percentage/category columns that arrive as text
    'price',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'room_type',
    'has_availability',
    'instant_bookable',
    # Free-text columns
    'name',
    'description',
    'neighborhood_overview',
    'host_name',
    'host_location',
    'host_about',
    'host_neighbourhood',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'property_type',
    # List-like columns stored as strings
    'host_verifications',
    'amenities',
)

# Mapping handed to `pd.read_csv(dtype=...)`: every listed column -> StringDtype.
data_types = {column: pd.StringDtype() for column in string_columns}

# Cargar los datos y crear subconjuntos

In [559]:
# Columns pandas should parse as datetimes while reading the CSV
parse_dates = ['host_since', 'first_review', 'last_review']

data = pd.read_csv('listings.csv.gz', dtype=data_types, parse_dates=parse_dates)

# Keep only the listings that have a price (the dependent variable)
has_price = data['price'].notna()
data = data.loc[has_price]
data.shape

(20688, 75)

In [560]:
# Sanity check: True would mean some rows still have a missing price after the filter above.
data['price'].isnull().values.any()

False

In [561]:
# Columns kept for the numerical feature set. 'price' is the target: a later
# cell splits it off into `y` and drops it from the features. The date columns
# and 'host_verifications' are converted to numeric features further down.
cols_of_interest = [
    'price', 'host_id', 'host_since', 'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
    'host_total_listings_count', 'host_verifications', 'host_has_profile_pic',
    'host_identity_verified', 'latitude', 'longitude', 'room_type', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'minimum_nights', 'maximum_nights',
    'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights',
    'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'has_availability', 'availability_30', 'availability_60', 'availability_90',
    'availability_365', 'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'first_review', 'last_review', 'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value', 'instant_bookable',
    'calculated_host_listings_count', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms', 'calculated_host_listings_count_shared_rooms',
    'reviews_per_month'
                   ]

In [562]:
# Columns that require text processing before they can be used as features.
# They are collected into `data_text`; no cell visible here processes them yet.
cols_text = [
    'name', 'description', 'neighborhood_overview', 'host_name', 
    'host_location', 'host_about', 'host_neighbourhood', 'neighbourhood_cleansed',
    'neighbourhood_group_cleansed', 'property_type', 'amenities'
            ]

In [576]:
# Create data subsets.
# Explicit copies: the cells below add, overwrite and drop columns on these
# frames, and that must neither touch `data` nor depend on the pandas
# copy-on-write option (which is only switched on in a later cell).
data_numerical = data[cols_of_interest].copy()
data_text = data[cols_text].copy()

data_numerical.shape, data_text.shape

((20688, 50), (20688, 11))

# Ingeniería de Características

## Cleaning numerical columns

In [577]:
# TODO: column still to be handled later:
#   'amenities'  - a list of strings stored as text
# Turn on pandas copy-on-write mode for the column assignments in the cells below.
# NOTE(review): notebook-wide options like this usually belong in a config cell at the top.
pd.options.mode.copy_on_write = True

In [578]:
# Build the target vector `y` from the raw price strings.
# - The raw-string regex removes "$", "," and ")" in one pass. The original
#   non-raw "[\$,)]" relied on an invalid escape sequence, and its second
#   pattern "," was already covered by the character class.
# - The source is `data` (same rows and index as `data_numerical`), so the cell
#   can be re-run after 'price' has been dropped from `data_numerical`.
# - pd.to_numeric on a StringDtype column may return a nullable Float64/Int64
#   Series; the cast restores the plain float64 the original cell produced.
price_text = data['price'].str.replace(r"[$,)]", "", regex=True)
y = pd.to_numeric(price_text).astype('float64')

# The target must not remain among the features.
data_numerical = data_numerical.drop(columns=['price'], errors='ignore')

0         31.0
1         92.0
2        180.0
4         65.0
5         32.0
         ...  
26019    108.0
26020     40.0
26021     28.0
26022     28.0
26023    100.0
Name: price, Length: 20688, dtype: float64

In [566]:
# Replace each date column by the number of calendar years between that date
# and 2024 (the year of the data snapshot):
#   host_since   -> host_years        (years hosting)
#   first_review -> first_review_age  (years since first review)
#   last_review  -> last_review_age   (years since last review)
snapshot_year = 2024.0
date_to_age = {
    'host_since': 'host_years',
    'first_review': 'first_review_age',
    'last_review': 'last_review_age',
}

for date_col, age_col in date_to_age.items():
    data_numerical[age_col] = snapshot_year - data_numerical[date_col].dt.year
    data_numerical = data_numerical.drop(columns=[date_col])

In [567]:
# Turn the 't' / 'f' flag columns into numeric 1 / 0
columns_t_f = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
    'instant_bookable',
]
t_f = {'t': '1', 'f': '0'}

# Step 1: swap the letters for digit strings, column by column.
# Step 2: parse the digit strings as numbers.
data_numerical[columns_t_f] = (
    data_numerical[columns_t_f]
    .apply(lambda flags: flags.replace(t_f))
    .apply(pd.to_numeric)
)

In [568]:
# Rates arrive as text such as "95%": drop the percent sign, then parse the number
columns_percent = ['host_response_rate', 'host_acceptance_rate']

for rate_col in columns_percent:
    without_sign = data_numerical[rate_col].str.replace('%', '')
    data_numerical[rate_col] = without_sign.apply(pd.to_numeric)

In [569]:
# Convert category strings to numerical codes
columns_categorical = ['host_response_time', 'room_type']

for cat_col in columns_categorical:
    # Encode only the rows that have a value; missing entries stay NaN and are
    # handled by the null-handling step later on.
    present = data_numerical[cat_col].dropna()
    codes = LabelEncoder().fit_transform(present)

    # Re-attach the ORIGINAL row labels before assigning. A bare pd.Series(codes)
    # would be indexed 0..k-1, and because assignment aligns on index labels
    # (the frame's index has gaps after the price filter) the codes would land
    # on the wrong rows and leave spurious NaNs.
    data_numerical[cat_col] = pd.Series(codes, index=present.index, dtype=float)

In [570]:
# Clean string: "['email', 'phone']" -> "email,phone"
cleaned_verifications = (
    data_numerical['host_verifications']
    .str.replace(r"[\'\[\]]", "", regex=True)
    .str.replace(" ", "", regex=False)
)

# Output column -> verification method it represents
verification_methods = {
    'email_verified': 'email',
    'phone_verified': 'phone',
    'work_email_verified': 'work_email',
}
verification_cols = list(verification_methods)

# One list of methods per host (missing values stay as NA, i.e. not a list)
methods_per_host = cleaned_verifications.str.split(',')

# Flag each method by MEMBERSHIP, not by position. Splitting with expand=True
# puts whichever method comes first into the first column, so a host verified
# only by phone would be recorded as email-verified, and an empty list "[]"
# (which splits to ['']) would count as verified. Exact token matching also
# keeps 'email' from matching inside 'work_email'.
for flag_col, method in verification_methods.items():
    data_numerical[flag_col] = methods_per_host.apply(
        lambda methods, method=method: float(isinstance(methods, list) and method in methods)
    )

# Drop original column
data_numerical = data_numerical.drop(['host_verifications'], axis=1)

# Data exploration

# Final data transformations

In [571]:
# Handle nulls using linear interpolation.
# NOTE(review): interpolating between neighbouring rows assumes adjacent
# listings are related; a median/mode imputation may be more defensible.
for col in data_numerical.columns:
    # Every feature ends up as plain float so the later arithmetic is uniform.
    data_numerical[col] = data_numerical[col].astype(float)

    # Test THIS column. The original condition checked the whole frame, so it
    # was the same for every column and re-scanned all the data on each pass.
    if data_numerical[col].isnull().any():
        # limit_direction='both' also fills NaNs that precede the first valid
        # value; the default (forward only) leaves those as NaN.
        data_numerical[col] = data_numerical[col].interpolate(
            method='linear', limit_direction='both'
        )

In [572]:
# Normalize data using min-max normalization: (x - min) / (max - min) per column.
# NOTE(review): min/max are computed on the full data set before the
# train/test split, so test rows influence the scaling - confirm this is intended.
column_min = data_numerical.min()
column_range = data_numerical.max() - column_min

# A constant column has a range of 0 and would turn into 0/0 = NaN for every
# row. Dividing by 1 instead maps such a column to all zeros.
column_range = column_range.where(column_range != 0, 1.0)

normalized_numerical_data = (data_numerical - column_min) / column_range

# Train test splits

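Reference for the hyperparameter tuning approach used below: [Hyperparameter Tuning the Random Forest in Python Using Scikit-Learn](https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74)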

In [579]:
# Hold out 33% of the listings for the final evaluation; the fixed seed keeps
# the split identical between runs.
split_options = dict(test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    normalized_numerical_data,
    y,
    **split_options,
)

In [582]:
#np.random.seed(42)
#from sklearn import linear_model
#br = linear_model.BayesianRidge()
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based accuracy for a fitted regression model.

    Parameters
    ----------
    model
        Any object with a ``predict(test_features)`` method.
    test_features
        Passed unchanged to ``model.predict``.
    test_labels
        Array-like of true target values, one per prediction.

    Returns
    -------
    float
        ``100 - MAPE`` where MAPE is the mean absolute percentage error.
        Rows whose true value is 0 are left out of the MAPE because the
        percentage error is undefined there (the original divided by zero and
        returned ``-inf``). NaN when every true value is 0.
    """
    predictions = np.asarray(model.predict(test_features), dtype=float)
    actual = np.asarray(test_labels, dtype=float)
    errors = np.abs(predictions - actual)

    # Percentage error only where it is defined (true value != 0)
    nonzero = actual != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / np.abs(actual[nonzero]))
    else:
        mape = float('nan')
    accuracy = 100 - mape

    print('Model Performance')
    # The error is in the units of the target (price), not "degrees".
    print('Average Error: {:0.4f} (price units).'.format(np.mean(errors)))
    print('Accuracy: {:0.2f}%.'.format(accuracy))

    return accuracy

In [None]:
# Baseline
base_model = RandomForestRegressor(random_state=42)
base_model.fit(X_train, y_train)

In [588]:
# Persist the fitted baseline so it can be reloaded without retraining.
# (`joblib` is imported in the import cell at the top of the notebook.)
joblib.dump(base_model, 'base_model.pkl', compress=True)

['base_model.pkl']

In [None]:
# Score the baseline on the held-out test split; `evaluate` prints a summary and returns the accuracy.
base_accuracy = evaluate(base_model, X_test, y_test)

In [587]:
# Instantiate model (the seed is fixed here, so it is not part of the search space)
model = RandomForestRegressor(random_state=42)

# Dictionary of hyperparameters to sample from
parameters = {
    'n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
    'max_features': ['sqrt', 'log2', None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10],
    'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
}

# Hyperparameter tuning with 5-fold cross-validation.
# The exhaustive grid is 9 * 3 * 3 * 3 * 11 = 2,673 combinations, i.e. 13,365
# forest fits with 5 folds, and had to be interrupted. A randomized search over
# the same space evaluates a fixed number of sampled combinations (n_iter) and
# still exposes `best_estimator_` / `best_params_` for the cells below.
cv = RandomizedSearchCV(
    model,
    parameters,
    n_iter=50,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
cv.fit(X_train, y_train)


KeyboardInterrupt



In [None]:
# Persist the whole search object and, separately, the refitted best estimator.
# (`joblib` is imported in the import cell at the top of the notebook.)
joblib.dump(cv, 'cv.pkl', compress=True)
joblib.dump(cv.best_estimator_, 'cv_best_model.pkl')

In [None]:
# Reload the tuned model from disk instead of repeating the search.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files produced by this notebook.
cv_best_model = joblib.load('cv_best_model.pkl')

In [None]:
# Take the best estimator from the in-memory search object (this replaces any
# value of `cv_best_model` loaded from disk) and score it on the test split.
# Requires that `cv.fit(...)` ran to completion; otherwise `best_estimator_` does not exist.
cv_best_model = cv.best_estimator_
cv_accuracy = evaluate(cv_best_model, X_test, y_test)

In [None]:
# Create k-fold object
k_folds = KFold(n_splits=5)

# `rf` was not defined anywhere in the notebook, so this cell raised NameError
# on a fresh kernel. cross_val_score clones and refits the estimator on every
# fold, so an unfitted forest with the baseline settings is all that is needed.
rf = RandomForestRegressor(random_state=42)

# Get cross validation scores (one score per fold, using the estimator's default scorer)
scores = cross_val_score(rf, X_train, y_train, cv=k_folds)
scores
