In [None]:
# Rooms CSV ETL
import ast


In [None]:
import pandas as pd

# Mount Google Drive so the dataset stored under "My Drive" is reachable from the runtime
from google.colab import drive
drive.mount('/content/drive')

# Load the CSV file into a pandas DataFrame.
# Each handler prints a readable hint and then re-raises: every later cell needs
# `df`, so carrying on after a failed load would only resurface as a NameError
# (or silently reuse a stale `df` left in the kernel by an earlier run).
try:
  df = pd.read_csv('/content/drive/My Drive/data/rooms_data.csv')
except FileNotFoundError:
  print("Error: rooms_data.csv not found in /content/drive/My Drive/data/")
  raise
except pd.errors.EmptyDataError:
  print("Error: rooms_data.csv is empty")
  raise
except pd.errors.ParserError:
  print("Error: Could not parse rooms_data.csv. Check the file format.")
  raise
except Exception as e:
  print(f"An unexpected error occurred: {e}")
  raise
else:
  # Runs only when the read succeeded, so the preview is never mistaken for a load error.
  print("Successfully loaded rooms_data.csv")
  print(df.head()) # Example: Print the first few rows


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Successfully loaded rooms_data.csv
                                                 url  \
0  https://www.airbnb.mx/rooms/921201790656718448...   
1  https://www.airbnb.mx/rooms/20591959?adults=1&...   
2  https://www.airbnb.mx/rooms/31534932?adults=1&...   
3  https://www.airbnb.mx/rooms/100217845881523184...   
4  https://www.airbnb.mx/rooms/584270446306412779...   

                                      title  price_per_night currency  \
0             Cómodo cuarto en la Del Valle            550.0      MXN   
1               Habitacion en Casa Sabina!!            428.0      MXN   
2  Punta Alta, Refugio dentro de la Ciudad.            742.0      MXN   
3                Habitación en coto privado            230.0      MXN   
4    Mini suites impresionantes | Col. Roma            657.0      MXN   

               city             state country  \
0  Ciudad d

## Overview of the dataset

In [None]:
# Preview the first five rows (the default for head) using the notebook's rich table display.
df.head(n=5)

Unnamed: 0,url,title,price_per_night,currency,city,state,country,amenities,rating_cleanliness,rating_accuracy,rating_check_in,rating_communication,rating_location,rating_value,rating_overall,total_reviews,created_at
0,https://www.airbnb.mx/rooms/921201790656718448...,Cómodo cuarto en la Del Valle,550.0,MXN,Ciudad de México,Distrito Federal,México,"[""Ropa de cama incluida"", ""Caja de seguridad"",...",4.9,4.9,5.0,4.9,4.9,4.9,4.92,88,2025-01-27T13:39:48.130023
1,https://www.airbnb.mx/rooms/20591959?adults=1&...,Habitacion en Casa Sabina!!,428.0,MXN,Guadalajara,Jalisco,México,"[""Televisión"", ""Caja de seguridad"", ""Lavadora ...",4.7,4.8,4.9,4.8,4.9,4.7,4.71,717,2025-01-27T13:39:54.046532
2,https://www.airbnb.mx/rooms/31534932?adults=1&...,"Punta Alta, Refugio dentro de la Ciudad.",742.0,MXN,Guadalajara,Jalisco,México,"[""Aire acondicionado y calefacción"", ""Espacios...",,,,,,,4.81,258,2025-01-27T13:40:32.813918
3,https://www.airbnb.mx/rooms/100217845881523184...,Habitación en coto privado,230.0,MXN,Guadalajara,Jalisco,México,"[""Estacionamiento gratuito en las instalaciones""]",4.6,4.7,4.9,5.0,4.8,4.9,4.81,106,2025-01-27T13:40:39.807681
4,https://www.airbnb.mx/rooms/584270446306412779...,Mini suites impresionantes | Col. Roma,657.0,MXN,Ciudad de México,Distrito Federal,México,"[""Aire acondicionado y calefacción"", ""Lavadora...",,,,,,,4.44,586,2025-01-27T13:40:46.039264


In [None]:
# Display basic information about the DataFrame (columns, non-null counts, dtypes).
# info() writes the report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71 entries, 0 to 70
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   url                   71 non-null     object 
 1   title                 71 non-null     object 
 2   price_per_night       71 non-null     float64
 3   currency              71 non-null     object 
 4   city                  71 non-null     object 
 5   state                 71 non-null     object 
 6   country               71 non-null     object 
 7   amenities             71 non-null     object 
 8   rating_cleanliness    35 non-null     float64
 9   rating_accuracy       35 non-null     float64
 10  rating_check_in       35 non-null     float64
 11  rating_communication  35 non-null     float64
 12  rating_location       35 non-null     float64
 13  rating_value          35 non-null     float64
 14  rating_overall        71 non-null     float64
 15  total_reviews         71 

In [None]:
# Report the (rows, columns) tuple of the loaded frame.
frame_shape = df.shape
print(f"Shape of the DataFrame: {frame_shape}")


Shape of the DataFrame: (71, 17)


In [None]:

# Count the missing entries in every column.
null_mask = df.isnull()
missing_per_column = null_mask.sum()
print(missing_per_column)

url                      0
title                    0
price_per_night          0
currency                 0
city                     0
state                    0
country                  0
amenities                0
rating_cleanliness      36
rating_accuracy         36
rating_check_in         36
rating_communication    36
rating_location         36
rating_value            36
rating_overall           0
total_reviews            0
created_at               0
dtype: int64


We can see that several of the category-specific ratings (e.g. "rating_cleanliness") are null, which might mean that these specific ratings were not mandatory and were therefore left empty when a review was submitted. When using this dataset in our application, we need to treat these ratings as optional values rather than required ones.

Checking values

In [None]:
# Summarise the distinct cities present in the listings.
city_column = df['city']

unique_cities = city_column.nunique()
# The variable name below keeps its original spelling in case other cells read it.
names_of_unqiue_cities = city_column.unique()

print(f"Number of unique cities: {unique_cities}")
print(f"Names of unique cities: {names_of_unqiue_cities}")

Number of unique cities: 16
Names of unique cities: ['Ciudad de México' 'Guadalajara' 'Monterrey' 'Morelia' 'Acapulco'
 'Puerto Vallarta' 'León' 'Tijuana' 'Santiago de Querétaro'
 'San Luis Potosí' 'Mazatlán' 'Guanajuato' 'Puebla' 'Aguascalientes'
 'Toluca' 'San Miguel de Allende']


In [None]:
# Inspect the raw amenities value of the row labelled 0 with a single label-based lookup.
df.loc[0, 'amenities']

'["Ropa de cama incluida", "Caja de seguridad", "Wi-Fi"]'

In [None]:
# Convert each amenities cell from its string form (e.g. '["Wi-Fi", "Televisión"]')
# into a real Python list. Only values that are still strings are parsed, so this
# cell can be re-run safely: ast.literal_eval raises ValueError when handed an
# already-parsed list, which is what a second execution would otherwise pass in.
df['amenities'] = df['amenities'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Merge every listing's amenities list into one set of distinct amenity names.
unique_amenities = set().union(*df['amenities'])

# Output the result
print(unique_amenities)

{'Aire acondicionado y calefacción', 'Estacionamiento gratuito en las instalaciones', 'Caja de seguridad', 'Ropa de cama incluida', 'Lavadora y secadora', 'Televisión', 'Wi-Fi', 'Cocina equipada', 'Baño privado', 'Espacios al aire libre'}
