In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

In [None]:
# Mapping from the column names used in the CSV to clearer/shorter ones
column_renames = {
    "price": "price_per_night",
    "number_of_reviews": "nr_of_reviews",
    "last_review": "last_review_date",
    "calculated_host_listings_count": "nr_of_listings_by_host",
    "availability_365": "days_per_year_available",
    "number_of_reviews_ltm": "nr_of_reviews_last_twelve_months",
}

# Load the listings and apply the renames in one step
df = pd.read_csv("./Airbnb-US-2023.csv").rename(columns=column_renames)
df.head()

## Visualizing the locations
### Use the output of the function below with Google My Maps to plot a number of locations onto a map

In [None]:
def export_locations_to_google_my_maps(dataframe, number_of_locations = 10000, output_dir = "google-my-maps-csvs", random_state = None):
    """Export a sample of listing locations, grouped by room type, as CSV files for Google My Maps.

    Only the latitude, longitude, name and room_type columns of ``dataframe`` are used.
    Each of the four known room types is down-sampled in proportion to its share of
    all rows, so that the total number of exported rows is close to
    ``number_of_locations``. Every group is then written to one or more CSV files of
    at most 2000 rows each, named ``apartment{i}.csv``, ``private{i}.csv``,
    ``shared{i}.csv`` and ``hotel{i}.csv``.

    Parameters:
        dataframe: frame with "latitude", "longitude", "name" and "room_type" columns.
        number_of_locations: approximate total number of rows to export. When it is
            larger than the number of rows available, every row is exported.
        output_dir: directory the CSV files are written to; created if missing.
        random_state: forwarded to ``DataFrame.sample`` to make the sample reproducible.

    Returns:
        None. The result is the set of CSV files written to ``output_dir``.
    """
    max_rows_per_file = 2000
    file_prefix_by_room_type = {
        "Entire home/apt": "apartment",
        "Private room": "private",
        "Shared room": "shared",
        "Hotel room": "hotel",
    }

    locations = dataframe[["latitude", "longitude", "name", "room_type"]]
    total_locations = len(locations)
    # Clamp to [0, total]: asking for more rows than exist must not produce a negative
    # number of rows to delete (which would make sample() request too many rows)
    locations_to_eliminate = min(total_locations, max(0, total_locations - number_of_locations))

    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for room_type, file_prefix in file_prefix_by_room_type.items():
        group = locations[locations["room_type"] == room_type].drop("room_type", axis=1)

        # Each room type gives up rows in proportion to its share of all rows
        group_ratio = len(group) / total_locations if total_locations else 0
        rows_to_delete = int(group_ratio * locations_to_eliminate)
        sampled_group = group.sample(len(group) - rows_to_delete, random_state=random_state)

        # Split row positions (not the frame itself) into near-equal chunks of <= 2000 rows
        number_of_chunks = len(sampled_group) // max_rows_per_file + 1
        chunk_positions = np.array_split(np.arange(len(sampled_group)), number_of_chunks)
        for i, positions in enumerate(chunk_positions):
            sampled_group.iloc[positions].to_csv(output_path / f"{file_prefix}{i}.csv")

## Cleaning the data
### Removing duplicates and irrelevant columns

In [None]:
# We drop neighbourhood_group because we already have the neighbourhood column which provides more information
columns_to_discard = ["host_id", "host_name", "latitude", "longitude", "neighbourhood_group"]

print(f"Length before removing duplicates: {len(df)}")
# The id is removed before de-duplicating, so rows that differ only in their id
# are treated as duplicates; the remaining unused columns are removed afterwards
relevant_df = (
    df.drop(columns=["id"])
    .drop_duplicates()
    .drop(columns=columns_to_discard)
)
print(f"Length after removing duplicates: {len(relevant_df)}")

### Dealing with missing values and outliers

In [None]:
# Per column: does it contain at least one missing value?
relevant_df.isna().any()

In [None]:
# Handle missing values: listings without a name are dropped, missing
# reviews_per_month becomes 0 and last_review_date becomes a Unix timestamp.
print(f"Length before handling missing values: {len(relevant_df)}")
non_null_df = relevant_df.dropna(subset=["name"]).copy()
non_null_df["reviews_per_month"] = non_null_df["reviews_per_month"].fillna(0)

# Convert review dates to whole seconds since the Unix epoch (timezone-naive
# values are interpreted as UTC). Missing dates parse to NaT and are stored as
# the sentinel 0, which keeps the column a plain int64 instead of an object
# column mixing ints and other values.
review_dates = pd.to_datetime(non_null_df["last_review_date"], utc=True)
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
seconds_since_epoch = (review_dates - unix_epoch) // pd.Timedelta(seconds=1)
non_null_df["last_review_date"] = seconds_since_epoch.fillna(0).astype("int64")
print(f"Length after handling missing values: {len(non_null_df)}")

In [None]:
# Create a histogram of every column that could have outliers to see which ones have outliers
# Alongside there will be plotted 2 vertical lines representing the bounds for eliminating outliers
columns_used_for_checking_outliers = non_null_df.drop(["name", "neighbourhood", "room_type", "city"], axis=1).columns

columns_of_subplots = 2
# Round up so that an odd number of columns still gets one subplot per column
rows_of_subplots = (len(columns_used_for_checking_outliers) + columns_of_subplots - 1) // columns_of_subplots

# squeeze=False always returns a 2D array of axes, even for a single row
fig, axes = plt.subplots(rows_of_subplots, columns_of_subplots, figsize=(14, 10), squeeze=False)
fig.subplots_adjust(hspace=0.9, wspace=0.2)
axes = axes.flatten()

for ax, column_name in zip(axes, columns_used_for_checking_outliers):
    ax.hist(non_null_df[column_name], bins=20, rwidth=0.8)

    mean = non_null_df[column_name].mean()
    standard_deviation = non_null_df[column_name].std()

    # Values further than 3 standard deviations from the mean are treated as outliers
    lower_bound = mean - (3 * standard_deviation)
    upper_bound = mean + (3 * standard_deviation)

    ax.axvline(x=lower_bound, color='b')
    ax.axvline(x=upper_bound, color='b')

    # most subplots need to be scaled logarithmically because there is one range of values that is much more frequent
    # than any other, thus the subplot won't give too much information unless it is scaled
    if column_name != "days_per_year_available":
        ax.set_yscale("log")

    ax.set_xlabel(column_name)
    ax.set_ylabel("frequency")
    ax.set_title(f"Distribution of {column_name}")

# Hide the spare subplot that is left over when the number of columns is odd
for unused_ax in axes[len(columns_used_for_checking_outliers):]:
    unused_ax.set_visible(False)

plt.show()

In [None]:
# last_review_date and days_per_year_available will be excluded from outlier removal because they don't have any outliers
columns_excluded_from_outlier_removal = ["name", "neighbourhood", "room_type", "city", "last_review_date", "days_per_year_available"]
columns_used_for_removing_outliers = non_null_df.drop(columns_excluded_from_outlier_removal, axis=1).columns

print(f"Length before removing outliers: {len(non_null_df)}\n")

no_outliers_df = non_null_df.copy()

# Columns are filtered one after another, so the bounds of each column are
# computed on the rows that survived the previous columns
for column_name in columns_used_for_removing_outliers:
    column_values = no_outliers_df[column_name]

    mean = column_values.mean()
    standard_deviation = column_values.std()

    # Keep only values within 3 standard deviations of the mean
    lower_bound = mean - (3 * standard_deviation)
    upper_bound = mean + (3 * standard_deviation)

    is_outlier = (column_values < lower_bound) | (column_values > upper_bound)
    percentage_removed = round((is_outlier.sum() / len(no_outliers_df)) * 100, 2)

    print(f"For column {column_name}, removing a percentage of {percentage_removed}% values.")
    no_outliers_df = no_outliers_df[column_values.between(lower_bound, upper_bound)]

print(f"\nLength after removing outliers: {len(no_outliers_df)}")

## Feature engineering

### Extracting meaningful words from name column

In [None]:
name_meaningful_words_df = no_outliers_df.copy()
name_meaningful_words_df["name"] = name_meaningful_words_df["name"].str.lower()
# Extract number of bedrooms in a separate column: the first run of digits that is
# followed (optionally via whitespace, "-" or "+") by "br", "bd" or "bed".
# A raw string keeps \d and \s as regex escapes; expand=False returns a Series.
bedroom_counts = name_meaningful_words_df["name"].str.extract(r'(\d+)\s*-?\+?(?:br|bd|bed)', expand=False)
# Store the count as an integer (0 when the name mentions no bedroom count) ...
name_meaningful_words_df["nr_of_bedrooms"] = pd.to_numeric(bedroom_counts, errors="coerce").fillna(0).astype("int64")
# ... and keep a flag that tells a real count apart from the 0 placeholder
name_meaningful_words_df["has_nr_of_bedrooms"] = bedroom_counts.notna()

def most_frequent_words_in_name_column(dataframe, min_frequency=1000):
    """Count the frequent, purely alphabetic words in the ``name`` column of ``dataframe``.

    Names are lower-cased and split on runs of non-word characters. Words that
    occur fewer than ``min_frequency`` times, that contain non-alphabetic
    characters, or that are listed in the stop list below are discarded.

    Parameters:
        dataframe: frame with a string column called "name".
        min_frequency: minimum number of occurrences for a word to be kept.

    Returns:
        A Series indexed by word holding the occurrence count of each kept word.
    """
    # filter anything that is not helpful, or any words that are related to location
    filtered_strings = {"in", "private", "to", "bedroom", "the", "room", "home", "with", "apartment", "near", "and", "house", "of", 
    "a", "from", "bed", "apt", "bath", "close", "view", "suite", "on", "walk", "location", "for", "east", "by", "at", "hollywood", 
    "city", "one", "austin", "unit", "br", "nr", "west", "s", "d", "dt", "no", "bd", "b", "la", "full", "brooklyn", "san", "brand", 
    "hill", "steps", "minutes", "bathroom", "strip", "vegas", "manhattan", "prime", "las", "hills", "south", "mins", "hot", "guest", 
    "two", "free", "min", "hotel", "nyc", "place", "w", "floor", "stay", "nashville", "space", "shared", "away", "north", "entire", 
    "beds", "sleeps", "bay", "williamsburg", "side", "living", "kitchen", "rental", "square", "district", "located", "street", "area", "your", 
    "upper", "neighborhood", "pet", "st", "bdrm", "venice", "los", "style", "santa", "bedrooms", "blocks", "heights", "diego", 
    "block", "next", "miles", "capitol", "long", "centre", "dtla", "top", "broadway", "seattle", "mission", "all"}

    # Use the frame that was passed in (not a notebook global). The stop list is
    # lower-case, so the names are lower-cased before counting. explode() yields
    # one row per word without building a wide intermediate frame.
    words = dataframe["name"].str.lower().str.split('\\W+').explode()
    word_frequencies = words.value_counts()

    is_frequent = word_frequencies >= min_frequency
    is_alphabetic = word_frequencies.index.str.isalpha()
    word_frequencies = word_frequencies[is_frequent & is_alphabetic]
    word_frequencies = word_frequencies[~word_frequencies.index.isin(filtered_strings)]

    return word_frequencies

# Show the candidate words from which the groups of meaningful words are picked by hand
most_frequent_words_in_name_column(dataframe=name_meaningful_words_df)

In [None]:
# Frequent words have to be manually extracted because similar words were grouped together to reduce number of columns
# And some were removed due to not being useful
# Each entry is [name of the boolean column to create, words that set the column to True]
meaningful_words_with_column_name = [
    ["peaceful", ["quiet", "oasis", "retreat", "getaway", "peaceful"]],
    # NOTE(review): "luxurios" is misspelled, but it is the column name written to the
    # export; it is left unchanged in case other code refers to it
    ["luxurios", ["luxury", "luxurious", "resort", "deluxe"]],
    ["lovely", ["lovely", "charming", "cute", "nice"]],
    ["spacious", ["spacious", "large", "huge", "big"]],
    ["comfortable", ["cozy", "comfy", "comfortable"]],
    # "downtown" was previously misspelled "downton" and therefore never matched
    ["central", ["downtown", "heart", "central", "midtown"]],
    ["amazing", ["great", "amazing", "best", "perfect", "paradise"]],
    ["beautiful", ["beautiful", "gorgeous", "stunning"]],
    ["bright", ["sunny", "bright", "cheerful"]],
    ["renovated", ["renovated", "remodeled", "newly"]],
    ["backyard", ["garden", "backyard", "yard"]],
    ["beach", ["beach", "ocean"]],
    ["pool", ["pool", "tub"]],
    ["patio", ["deck", "patio"]],
    ["stylish", ["stylish", "chic"]],
    ["convenient", ["convenient", "everything"]],
    ["townhouse", ["townhouse", "townhome"]],
    ["urban", ["urban", "town",]],
    ["modern", ["modern", "new",]],
    ["village", ["village"]],
    ["cottage", ["cottage"]],
    ["studio", ["studio"]],
    ["condo", ["condo"]],
    ["bungalow", ["bungalow"]],
    ["villa", ["villa"]],
    ["penthouse", ["penthouse"]],
    ["duplex", ["duplex"]],
    ["loft", ["loft"]],
    ["king", ["king"]],
    ["queen", ["queen"]],
    ["master", ["master"]],
    ["gym", ["gym"]],
    ["entrance", ["entrance"]],
    ["balcony", ["balcony"]],
    ["rooftop", ["rooftop"]],
    ["mid", ["mid"]],
    ["victorian", ["victorian"]],
    ["historic", ["historic"]],
    ["gem", ["gem"]],
    ["clean", ["clean"]],
    ["furnished", ["furnished"]],
    ["family", ["family"]],
    ["friendly", ["friendly"]],
    ["views", ["views"]],
    ["valley", ["valley"]],
    ["park", ["park"]],
    ["lake", ["lake"]],
    ["waterfront", ["waterfront"]],
    ["vacation", ["vacation"]],
    ["spa", ["spa"]],
    ["heated", ["heated"]],
    ["airport", ["airport"]],
    ["parking", ["parking"]],
    ["wifi", ["wifi"]]
]

# Tokenise every name once (instead of once per keyword group) into a set of whole words
name_word_sets = name_meaningful_words_df["name"].str.split('\\W+').apply(set)

# A column is True when the name contains at least one of the group's words as a whole word
for column_name, values_to_search in meaningful_words_with_column_name:
    name_meaningful_words_df[column_name] = name_word_sets.apply(lambda words: not words.isdisjoint(values_to_search))

name_meaningful_words_df.to_csv("it_did_go_well.csv")

In [None]:
# Export every listing whose name contains a digit
# (presumably to inspect how bedroom counts are written - TODO confirm).
# The raw string keeps \d as a regex escape; na=False treats missing names as "no digit".
something_export = no_outliers_df[no_outliers_df["name"].str.contains(r"\d", na=False)]
something_export.to_csv("something_something.csv")

In [None]:
# Stand-alone export of the bedroom-count extraction on the listing names as they are in no_outliers_df
something = no_outliers_df.copy()
# First run of digits followed (optionally via whitespace, "-" or "+") by "br", "bd" or "bed";
# the raw string keeps \d and \s as regex escapes
extracted_bedrooms = something["name"].str.extract(r'(\d+)\s*-?\+?(?:br|bd|bed)', expand=False)
# Integer count, 0 when no count was found; the flag tells a real count from the placeholder
something["nr_of_bedrooms"] = pd.to_numeric(extracted_bedrooms, errors="coerce").fillna(0).astype("int64")
something["has_nr_of_bedrooms"] = extracted_bedrooms.notna()
something.to_csv("nr_of_bedrooms.csv")
something

### One-hot encoding

In [None]:
# One-hot encode the room_type and city columns on top of the features engineered
# from the listing names. neighbourhood is not encoded; it is dropped together with
# the other text columns (presumably because of its many distinct values - TODO confirm).
room_type_dummies = pd.get_dummies(name_meaningful_words_df["room_type"], drop_first=True)
city_dummies = pd.get_dummies(name_meaningful_words_df["city"], drop_first=True)
final_df = pd.concat([name_meaningful_words_df, room_type_dummies, city_dummies], axis="columns")
# The free-text name must go as well: it cannot be scaled or used by the regressors
final_df = final_df.drop(["name", "neighbourhood", "room_type", "city"], axis=1)
# Make sure the extracted bedroom count is numeric rather than a column of digit strings
final_df["nr_of_bedrooms"] = pd.to_numeric(final_df["nr_of_bedrooms"])
final_df.head()

In [None]:
# Keep at most the first 10000 rows.
# NOTE(review): this takes the leading rows, not a random sample - confirm that is intended.
final_df = final_df.iloc[:10000]

In [None]:
# Number of rows that will be used for modelling
final_df.shape[0]

In [None]:
# Baseline: 5-fold cross-validated linear regression on min-max scaled features.
# Pipeline, MinMaxScaler, GridSearchCV and linear_model come from the import cell
# at the top of the notebook.
X = final_df.drop("price_per_night", axis=1)
y = final_df.price_per_night

# model name -> pipeline steps and the hyper-parameter grid to search
models = {
  'linear_regression': {
    'steps': [
        ('scaler', MinMaxScaler()),
        ('regressor', linear_model.LinearRegression())
    ],
    'params': {}
  }
}

scores = []

for model_name, options in models.items():
    print(f"checking model {model_name}")
    pipeline = Pipeline(options["steps"])
    grid_search = GridSearchCV(pipeline, options["params"], cv=5, return_train_score=False, verbose = 4)

    grid_search.fit(X, y)

    scores.append({
        'model': model_name,
        'best_score': grid_search.best_score_,
        'best_params': grid_search.best_params_
    })

# Stored under its own name: assigning to `df` would overwrite the raw listings
# loaded at the top of the notebook and break re-running the earlier cells
baseline_scores_df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
baseline_scores_df

## Testing ML models

In [None]:
# Compare linear regression with support vector regression using a 5-fold
# cross-validated grid search. Pipeline, MinMaxScaler, GridSearchCV, linear_model
# and SVR come from the import cell at the top of the notebook.
X = final_df.drop("price_per_night", axis=1)
y = final_df.price_per_night

# model name -> pipeline steps and the hyper-parameter grid to search
# (grid keys are prefixed with the name of the pipeline step they configure)
models = {
  'linear_regression': {
    'steps': [
        ('scaler', MinMaxScaler()),
        ('regressor', linear_model.LinearRegression())
    ],
    'params': {}
  },
  'support_vector_regression': {
    'steps': [
        ('scaler', MinMaxScaler()),
        ('regressor', SVR())
    ],
    'params': {
        'regressor__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'regressor__C': [0.1, 10],
        'regressor__epsilon': [0.1, 0.001],
        'regressor__gamma': ['scale', 'auto']
    }
  }
}

scores = []

for model_name, options in models.items():
    print(f"checking model {model_name}")
    pipeline = Pipeline(options["steps"])
    grid_search = GridSearchCV(pipeline, options["params"], cv=5, return_train_score=False, verbose = 4)

    grid_search.fit(X, y)

    scores.append({
        'model': model_name,
        'best_score': grid_search.best_score_,
        'best_params': grid_search.best_params_
    })

# Stored under its own name: assigning to `df` would overwrite the raw listings
# loaded at the top of the notebook and break re-running the earlier cells
model_scores_df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
model_scores_df