## Importaciones

In [2]:
"""Módulo para crear un modelo predictivo de precios de Airbnb en la Comunidad de Madrid."""

import pandas as pd

## Descargar los datos

In [11]:
# Source URL of the gzip-compressed Madrid listings CSV (snapshot path dated 2024-03-22).
data_location = (
    "https://data.insideairbnb.com"
    "/spain/comunidad-de-madrid/madrid"
    "/2024-03-22/data/listings.csv.gz"
)

In [12]:
!wget https://data.insideairbnb.com/spain/comunidad-de-madrid/madrid/2024-03-22/data/listings.csv.gz

--2024-06-01 19:46:38--  https://data.insideairbnb.com/spain/comunidad-de-madrid/madrid/2024-03-22/data/listings.csv.gz
Resolving data.insideairbnb.com (data.insideairbnb.com)... 18.154.48.59, 18.154.48.87, 18.154.48.41, ...
Connecting to data.insideairbnb.com (data.insideairbnb.com)|18.154.48.59|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13297683 (13M) [application/x-gzip]
Saving to: ‘listings.csv.gz’


2024-06-01 19:46:39 (17.1 MB/s) - ‘listings.csv.gz’ saved [13297683/13297683]



In [None]:
# Triage of the raw listing columns: which ones get dropped (and why) and which
# ones need text processing first. Entries without a note were left unannotated.
columns = [
    "id",                                            # Drop: identifier
    "listing_url",                                   # Drop: URL ending in the ID
    "scrape_id",                                     # Drop: scrape metadata
    "last_scraped",                                  # Drop: scrape metadata
    "source",                                        # Drop: scrape metadata
    "name",                                          # Needs text processing
    "description",                                   # Needs text processing
    "neighborhood_overview",                         # Needs text processing
    "picture_url",                                   # Drop: URL
    "host_id",                                       # Might reveal a host/price relationship
    "host_url",                                      # Drop: redundant with host_id
    "host_name",                                     # Needs text processing
    "host_since",                                    # Convert to a number of months or years
    "host_location",                                 # Needs text processing
    "host_about",                                    # Needs text processing
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_thumbnail_url",                            # Drop
    "host_picture_url",                              # Drop
    "host_neighbourhood",                            # Needs text processing
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified",
    "neighbourhood",                                 # Drop: redundant
    "neighbourhood_cleansed",                        # Needs text processing
    "neighbourhood_group_cleansed",
    "latitude",
    "longitude",
    "property_type",                                 # Needs text processing
    "room_type",
    "accommodates",
    "bathrooms",
    "bathrooms_text",                                # Drop: largely redundant
    "bedrooms",
    "beds",
    "amenities",
    "price",
    "minimum_nights",
    "maximum_nights",
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    "calendar_updated",                              # Drop: no non-null values
    "has_availability",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    "calendar_last_scraped",                         # Drop: identical in every row
    "number_of_reviews",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "first_review",                                  # Convert to "days ago"
    "last_review",                                   # Convert to "days ago"
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    "license",                                       # Drop
    "instant_bookable",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "reviews_per_month",
]

In [37]:
# Read the downloaded listings archive (path relative to the working directory)
# into a DataFrame, then show its (rows, columns) shape as the cell output.
data = pd.read_csv(filepath_or_buffer="listings.csv.gz")
data.shape

(26024, 75)

In [94]:
# Columns of the raw listings frame that are kept for further processing.
cols_of_interest = [
    # Host
    "host_id",
    "host_since",
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_is_superhost",
    "host_listings_count",
    "host_total_listings_count",
    "host_verifications",
    "host_has_profile_pic",
    "host_identity_verified",
    # Location
    "latitude",
    "longitude",
    # Listing characteristics and price
    "room_type",
    "accommodates",
    "bathrooms",
    "bedrooms",
    "beds",
    "price",
    # Stay-length limits
    "minimum_nights",
    "maximum_nights",
    "minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights",
    "maximum_maximum_nights",
    "minimum_nights_avg_ntm",
    "maximum_nights_avg_ntm",
    # Availability
    "has_availability",
    "availability_30",
    "availability_60",
    "availability_90",
    "availability_365",
    # Review counts and dates
    "number_of_reviews",
    "number_of_reviews_ltm",
    "number_of_reviews_l30d",
    "first_review",
    "last_review",
    # Review scores
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    # Booking and per-host listing counts
    "instant_bookable",
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "reviews_per_month",
]

In [95]:
# Columns that require text processing before they can be used.
cols_text = [
    "name",
    "description",
    "neighborhood_overview",
    "host_name",
    "host_location",
    "host_about",
    "host_neighbourhood",
    "neighbourhood_cleansed",
    "neighbourhood_group_cleansed",
    "property_type",
    "amenities",
]

In [101]:
# Select only the columns of interest. The explicit .copy() makes the result an
# independent DataFrame, so assigning to its columns later does not trigger
# SettingWithCopyWarning and clearly leaves `data` untouched.
data_numerical = data[cols_of_interest].copy()

In [100]:
# Print the per-column non-null counts and dtypes of the selected columns.
# Despite the frame's name, several columns (e.g. host_since, host_response_rate)
# are still object dtype at this point.
data_numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024 entries, 0 to 26023
Data columns (total 50 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   host_id                                       26024 non-null  int64  
 1   host_since                                    26020 non-null  object 
 2   host_response_time                            20664 non-null  object 
 3   host_response_rate                            20664 non-null  object 
 4   host_acceptance_rate                          21947 non-null  object 
 5   host_is_superhost                             25897 non-null  object 
 6   host_listings_count                           26020 non-null  float64
 7   host_total_listings_count                     26020 non-null  float64
 8   host_verifications                            26020 non-null  object 
 9   host_has_profile_pic                          26020 non-null 