In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the Airbnb listings; later cells derive relevant_df and filtered_df from this frame.
df = pd.read_csv(filepath_or_buffer="./Airbnb-US-2023.csv")
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

## Visualizing the locations
### Use the output of the function below with Google My Maps to plot a number of locations onto a map

In [None]:
def export_locations_to_google_my_maps(dataframe, number_of_locations = 10000, output_dir = "google-my-maps-csvs", max_rows_per_file = 2000, random_state = None):
    """Export a down-sampled set of listing locations as CSV files for Google My Maps.

    Latitude, longitude and name are extracted from the dataframe provided and
    split by room_type. Every room type is randomly down-sampled in proportion
    to its share of the listings, so that the total number of exported rows is
    close to number_of_locations. Each room type is then written to one or more
    CSV files of at most max_rows_per_file rows each, named
    "<prefix><chunk index>.csv" inside output_dir.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns "latitude", "longitude", "name" and "room_type".
    number_of_locations : int, default 10000
        Approximate total number of rows to export. If it is greater than or
        equal to the number of rows available, every row is exported.
    output_dir : str or path-like, default "google-my-maps-csvs"
        Directory the CSV files are written to; created if it does not exist.
    max_rows_per_file : int, default 2000
        Upper bound on the number of rows written to a single CSV file.
    random_state : optional
        Forwarded to DataFrame.sample; pass a seed for reproducible exports.

    Returns
    -------
    None
        The function is called for its side effect of writing CSV files.
    """
    # room_type value in the data -> prefix of the exported file names.
    room_type_file_prefixes = {
        "Entire home/apt": "apartment",
        "Private room": "private",
        "Shared room": "shared",
        "Hotel room": "hotel",
    }

    dataframe = dataframe[["latitude", "longitude", "name", "room_type"]]
    total_locations = len(dataframe)

    # Clamp to [0, total]: asking for more rows than exist must keep everything
    # rather than produce a sample size larger than the group (which raises).
    locations_to_eliminate = min(max(total_locations - number_of_locations, 0), total_locations)

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for room_type, file_prefix in room_type_file_prefixes.items():
        group_df = dataframe[dataframe["room_type"] == room_type].drop("room_type", axis=1)

        # Each room type gives up rows in proportion to its share of all listings.
        group_ratio = len(group_df) / total_locations if total_locations else 0.0
        rows_to_delete = int(group_ratio * locations_to_eliminate)
        sampled_df = group_df.sample(len(group_df) - rows_to_delete, random_state=random_state)

        # "+ 1" guarantees at least one (possibly empty) file per room type and
        # keeps every chunk at or below max_rows_per_file rows.
        number_of_chunks = len(sampled_df) // max_rows_per_file + 1

        # Split row positions instead of the frame itself: same chunk sizes as
        # np.array_split on the DataFrame, without relying on that deprecated path.
        row_positions = np.array_split(np.arange(len(sampled_df)), number_of_chunks)
        for i, positions in enumerate(row_positions):
            sampled_df.iloc[positions].to_csv(output_path / f"{file_prefix}{i}.csv")

## Cleaning the data
### Removing duplicates and irrelevant columns

In [None]:
# The id column is removed before de-duplicating, so rows that differ only by id collapse into one.
# We drop neighbourhood_group because we already have the neighbourhood column which provides more information
relevant_df = (
    df.drop(columns=["id"])
    .drop_duplicates()
    .drop(columns=["name", "host_id", "host_name", "latitude", "longitude", "neighbourhood_group"])
)

### Dealing with missing values and outliers

In [None]:
# Per-column flag: True where the column contains at least one missing value.
relevant_df.isna().any()

In [None]:
# The last_review dates are converted to unix timestamps (seconds since 1970-01-01 UTC).
# Since a timestamp of 0 isn't a real review date, a has_reviews column is added to mark listings with no review.
# The guard keeps the cell safe to re-run: once last_review is numeric, parsing it as dates again would corrupt it.
if "has_reviews" not in relevant_df.columns:
    # utc=True treats naive dates as UTC, matching what Timestamp.timestamp() does for naive values.
    last_review_dates = pd.to_datetime(relevant_df["last_review"], utc=True)
    relevant_df["has_reviews"] = last_review_dates.notna().astype(int)
    # Vectorized conversion; the result is a float64 column (missing dates -> 0.0)
    # instead of an object column that mixes floats with the integer 0.
    relevant_df["last_review"] = (
        (last_review_dates - pd.Timestamp("1970-01-01", tz="UTC")) / pd.Timedelta(seconds=1)
    ).fillna(0.0)

# Missing reviews_per_month values are replaced with 0.
relevant_df["reviews_per_month"] = relevant_df["reviews_per_month"].fillna(0)

In [None]:
# Inspect the first 50 rows of the cleaned frame.
relevant_df.iloc[:50]

In [None]:
columns = relevant_df.columns

# Only numeric columns are plotted; text columns are skipped because a
# 20-bin histogram is not meaningful for them.
numeric_columns = relevant_df.select_dtypes(include="number").columns

# Grid layout: two plots per row, rounding the row count up so an odd number
# of columns still gets an axis each (and at least one row always exists).
num_cols = 2
num_rows = max(-(-len(numeric_columns) // num_cols), 1)

# The figure height grows with the number of rows so the plots stay readable;
# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(12, 3 * num_rows), squeeze=False)

# Flatten the axes array for ease of iteration
axes = axes.flatten()

# One histogram per numeric column, read from the same frame the column names come from.
for ax, column in zip(axes, numeric_columns):
    ax.hist(relevant_df[column].dropna(), bins=20, rwidth=0.8)

    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    ax.set_title(f'Histogram of {column}')
    # Set the scale on this axis explicitly; plt.yscale would only affect the current (last) axes.
    ax.set_yscale("log")

# Hide any axes left over when the number of plots does not fill the grid.
for unused_ax in axes[len(numeric_columns):]:
    unused_ax.set_visible(False)

# Adjust the spacing between subplots
fig.tight_layout()

# Display the plot
plt.show()

In [None]:
from matplotlib import pyplot as plt

# Histogram of the price column of the cleaned frame, counts on a log scale.
fig, ax = plt.subplots(figsize=(15, 4))
ax.hist(relevant_df["price"], bins=20, rwidth=0.8)

# Axis labels and log-scaled y axis set in one call.
ax.set(xlabel='Prices', ylabel='Count', yscale="log")

plt.show()

In [None]:
# Histogram of the minimum_nights column of the cleaned frame, counts on a log scale.
fig, ax = plt.subplots(figsize=(15, 4))
ax.hist(relevant_df["minimum_nights"], bins=20, rwidth=0.8)

# Axis labels and log-scaled y axis set in one call.
ax.set(xlabel='Minimum nights', ylabel='Count', yscale="log")

plt.show()

In [None]:
# NOTE(review): this cell draws the same minimum_nights histogram as the cell
# directly above it - confirm whether a different column was intended here.
fig, ax = plt.subplots(figsize=(15, 4))
minimum_nights = relevant_df["minimum_nights"]
ax.hist(minimum_nights, bins=20, rwidth=0.8)

# Label both axes and put the counts on a log scale.
ax.set_xlabel('Minimum nights')
ax.set_ylabel('Count')
ax.set_yscale("log")

plt.show()

In [None]:
from matplotlib import pyplot as plt

# Histogram of df.price with counts on a log scale.
# The figure is created before plotting: calling plt.figure() after plt.hist()
# opens a second, empty figure, and the labels and log scale end up on that one.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df.price, bins=20, rwidth=0.8)
ax.set_xlabel("Prices")
ax.set_ylabel("Count")
ax.set_yscale('log')

plt.show()

In [None]:
# First and third quartiles of the price column.
Q1, Q3 = df.price.quantile([0.25, 0.75])

# Interquartile range
IQR = Q3 - Q1

# Outlier fences: 1.5 * IQR below Q1 and above Q3.
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Keep only the rows whose price lies within the fences (both ends inclusive).
filtered_df = df[df.price.between(lower_bound, upper_bound)]

In [None]:
# Show the lower and upper price fences computed from the IQR.
lower_bound, upper_bound

In [None]:
# Row counts after and before the IQR price filter.
filtered_df.shape[0], df.shape[0]

In [None]:
# Summary statistics of the rows that passed the IQR price filter.
filtered_df.describe()

In [None]:
from matplotlib import pyplot as plt

# Price distribution of the IQR-filtered rows (linear count axis).
price_fig, price_ax = plt.subplots()
price_ax.hist(filtered_df["price"], bins=20, rwidth=0.8)
price_ax.set(xlabel="Prices", ylabel="Count")

plt.show()

In [None]:
# Upper limits used to trim extreme listings.
# NOTE(review): the rationale for these specific thresholds is not recorded in the notebook.
MAX_PRICE = 1250
MAX_MINIMUM_NIGHTS = 62
MAX_HOST_LISTINGS = 150
MAX_REVIEWS_PER_MONTH = 5

# reviews_per_month contains missing values (the cleaning step fills them with 0
# in relevant_df). NaN <= 5 evaluates to False, so comparing the raw column would
# silently drop every such listing; missing values are treated as 0 here as well.
df = df[
    (df.price <= MAX_PRICE)
    & (df.minimum_nights <= MAX_MINIMUM_NIGHTS)
    & (df.calculated_host_listings_count <= MAX_HOST_LISTINGS)
    & (df.reviews_per_month.fillna(0) <= MAX_REVIEWS_PER_MONTH)
]
len(df)

In [None]:
# Listings whose host has more than 100 listings.
df.loc[df["calculated_host_listings_count"] > 100]

In [None]:
# Summary statistics of df after the threshold filters above.
df.describe()

In [None]:
# Summary statistics of the number_of_reviews column.
df["number_of_reviews"].describe()