In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw listings; the path is relative to the notebook's working directory.
df = pd.read_csv("./Airbnb-US-2023.csv")
# Preview the first rows to check that the file parsed as expected.
df.head()

## Visualizing the locations
### Use the output of the function below with Google My Maps to plot a number of locations onto a map

In [None]:
def export_locations_to_google_my_maps(dataframe, number_of_locations = 10000, random_state = None):
    """Export a down-sampled set of listing locations as CSV files for Google My Maps.

    Only the latitude, longitude, name and room_type columns of ``dataframe`` are used.
    The rows are split by room type and every group is randomly reduced by (roughly)
    the same proportion, so that the total number of exported rows is close to
    ``number_of_locations`` while the room-type mix is preserved. Each group is then
    written to the ``google-my-maps-csvs`` directory (created if missing) as one or
    more files named ``<prefix><i>.csv``, each holding at most 2000 rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns "latitude", "longitude", "name" and "room_type".
        Rows whose room_type is not one of the four exported types are not written,
        but they still count towards the total used to compute the group ratios.
    number_of_locations : int, default 10000
        Approximate number of rows to keep in total. If it is greater than or equal
        to the number of rows available, nothing is eliminated (the rows are only
        shuffled).
    random_state : int, numpy Generator/RandomState or None, default None
        Forwarded to ``DataFrame.sample`` for every group; pass a value to make the
        export reproducible. ``None`` keeps the previous, unseeded behaviour.

    Returns
    -------
    None
        The result is the set of CSV files written to disk.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``dataframe``.
    """
    import os  # local import: the notebook's import cell does not provide it

    output_dir = "google-my-maps-csvs"
    max_rows_per_file = 2000
    # room_type value in the data -> prefix of the exported file names
    file_prefix_by_room_type = {
        "Entire home/apt": "apartment",
        "Private room": "private",
        "Shared room": "shared",
        "Hotel room": "hotel",
    }

    dataframe = dataframe[["latitude", "longitude", "name", "room_type"]]
    total_locations = len(dataframe)

    # Clamp to [0, total]: asking for more rows than exist (or for a negative
    # amount) must not produce a sample size outside the population.
    locations_to_eliminate = min(max(0, total_locations - number_of_locations), total_locations)

    # to_csv does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)

    for room_type, file_prefix in file_prefix_by_room_type.items():
        group_df = dataframe[dataframe["room_type"] == room_type].drop("room_type", axis=1)

        # Each group gives up a share of rows proportional to its share of the data.
        group_ratio = len(group_df) / total_locations if total_locations else 0
        locations_to_delete = int(group_ratio * locations_to_eliminate)
        locations_to_keep = len(group_df) - locations_to_delete
        group_df_shortened = group_df.sample(locations_to_keep, random_state=random_state)

        # Same chunk count as before (n // 2000 + 1), but the split is done on row
        # positions rather than on the DataFrame itself, which avoids the
        # deprecated DataFrame code path inside np.array_split.
        number_of_chunks = len(group_df_shortened) // max_rows_per_file + 1
        chunk_positions = np.array_split(np.arange(len(group_df_shortened)), number_of_chunks)
        for i, positions in enumerate(chunk_positions):
            df_chunk = group_df_shortened.iloc[positions]
            df_chunk.to_csv(os.path.join(output_dir, f"{file_prefix}{i}.csv"))

## Cleaning the data
### Removing duplicates and irrelevant columns

In [None]:
# The id column is removed first so that it cannot make otherwise identical
# rows look different; duplicates are then dropped while the descriptive
# columns are still present, and only afterwards are those columns removed.
df = (
    df.drop(columns=["id"])
    .drop_duplicates()
    .drop(columns=["name", "host_id", "host_name", "latitude", "longitude"])
)

### Dealing with missing values and outliers

In [None]:
# Summary statistics of the frame, shown before any outlier filtering is applied.
df.describe()

In [None]:
# Keep only the rows that satisfy all three upper bounds at once
# (price, minimum stay and number of listings per host).
df = df[
    (df.price <= 1250)
    & (df.minimum_nights <= 62)
    & (df.calculated_host_listings_count <= 150)
]
# Number of rows left after filtering.
len(df)

In [None]:
# Show the remaining rows whose host listing count is above 100.
df[df.calculated_host_listings_count > 100]

In [None]:
# Summary statistics again, to compare with the ones shown before filtering.
df.describe()

In [None]:
# Summary statistics of the number_of_reviews column on its own.
df.number_of_reviews.describe()