#### Prepared for Gabor's Data Analysis

### Data Analysis for Business, Economics, and Policy
by Gabor Bekes and  Gabor Kezdi
 
Cambridge University Press 2021

**[gabors-data-analysis.com ](https://gabors-data-analysis.com/)**

 License: Free to share, modify and use for educational purposes. 
 Not to be used for commercial purposes.
 
cleaning London airbnb file

v.1.3. 2021-05-17 paths changed


in: data from web

out: airbnb_london_cleaned.csv

In [25]:
import pandas as pd
import os
import warnings

warnings.filterwarnings("ignore")

In [26]:
print(os.getcwd()) 

/Users/ghadena/Desktop/Business analytics/DA3/DA3/assignment_1


In [27]:
data_dir = "/Users/ghadena/Desktop/Business analytics/DA3/DA3/assignment_1/"

# location of folders
data_in = data_dir + "data/raw/"
data_out = data_dir + "data/clean/"

In [28]:
# zero step
data = pd.read_csv(data_in + "listings.csv.gz", delimiter=",", dtype="unicode")
drops = [
    "host_thumbnail_url",
    "host_picture_url",
    "listing_url",
    "picture_url",
    "host_url",
    "last_scraped",
    "description",
    "neighborhood_overview",
    "host_about",
    "host_response_time",
    "name",
    "host_location",
]
data.drop(columns=drops, inplace=True)
data.to_csv(data_in + "airbnb_madrid_listing.csv", index=False)

In [64]:
# opening dataset
df = pd.read_csv(data_in + "airbnb_madrid_listing.csv", delimiter=",")

In [65]:
# drop broken lines - where id is not a character of numbers
df.id = pd.to_numeric(df.id, errors="coerce")
df = df[df.id.notna()]

In [66]:
# display the class and type of each columns
df.dtypes

id                                                int64
scrape_id                                         int64
source                                           object
host_id                                           int64
host_name                                        object
                                                 ...   
calculated_host_listings_count                    int64
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
Length: 63, dtype: object

In [67]:
#####################
# formatting columns
for perc in ["host_response_rate", "host_acceptance_rate"]:
    df[perc] = pd.to_numeric(df[perc], errors="coerce")

In [68]:
#remove dollar sign from price
if 'price' in df.columns:
    df['price'] = df['price'].dropna().astype(str).str.replace(r"\$", "", regex=True)
    df['price'] = pd.to_numeric(df['price'], errors="coerce")

# Check results
print(df['price'].head(10))
print(df['price'].isnull().sum())  # Count of remaining null values
print(df['price'].dtype)  # Should be float64

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
8    150.0
9      NaN
Name: price, dtype: float64
6018
float64


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26980 entries, 0 to 26979
Data columns (total 63 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            26980 non-null  int64  
 1   scrape_id                                     26980 non-null  int64  
 2   source                                        26980 non-null  object 
 3   host_id                                       26980 non-null  int64  
 4   host_name                                     26977 non-null  object 
 5   host_since                                    26977 non-null  object 
 6   host_response_rate                            0 non-null      float64
 7   host_acceptance_rate                          0 non-null      float64
 8   host_is_superhost                             26001 non-null  object 
 9   host_neighbourhood                            9401 non-null  

In [70]:
# format binary variables
for binary in [
    "host_is_superhost",
    "host_has_profile_pic",
    "host_identity_verified",
    "instant_bookable",
]:
    df[binary] = df[binary].map({"t": True, "f": False})

In [71]:
# amenities
df["amenities"] = df["amenities"].str.strip("{}").str.replace('"', "").str.split(",")

In [72]:
# generate dummies by amenities

# # Generate dummies from amenities column
# dummies = pd.get_dummies(df["amenities"].apply(pd.Series).stack()).groupby(level=0).sum()

# # Concatenate the new dummies with the original DataFrame
# df = pd.concat([df, dummies], axis=1)

# df.drop(columns=[
#     "", 
#     "amenities", 
#     "translation missing: en.hosting_amenity_49", 
#     "translation missing: en.hosting_amenity_50"
# ], inplace=True, errors="ignore")

# # Check result
# print(df.head())
# print(sorted(df["amenities"].explode().unique()))
# # dummies = pd.get_dummies(df.amenities.apply(pd.Series).stack()).sum(level=0)
# # df = pd.concat([df, dummies], axis=1)


# drops = [
#     "",
#     "amenities",
#     "translation missing: en.hosting_amenity_49",
#     "translation missing: en.hosting_amenity_50",
# ]
# df.drop(columns=drops, inplace=True)

In [73]:

# Standardize amenities before creating dummies
df["amenities"] = (
    df["amenities"]
    .astype(str)  # Ensure it's a string
    .str.lower()  # Convert to lowercase for consistency
    .str.replace(r"[\[\]\{\}]", "", regex=True)  # Remove brackets
    .str.replace(r"\s+", " ", regex=True)  # Remove extra spaces
    .str.strip()  # Trim spaces at the beginning/end
)

# Convert amenities into separate columns
dummies = pd.get_dummies(df["amenities"].apply(lambda x: pd.Series(x.split(","))).stack()).groupby(level=0).sum()

# Concatenate with original DataFrame
df = pd.concat([df, dummies], axis=1)

# Drop the original 'amenities' column
df.drop(columns=["amenities"], inplace=True, errors="ignore")

In [77]:
# Check results
print(df.columns)

Index(['id', 'scrape_id', 'source', 'host_id', 'host_name', 'host_since',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_neighbourhood',
       ...
       ''toaster'', ''tv with amazon prime video'', ''tv with dvd player'',
       ''tv with hbo max'', ''tv'', ''varias body soap'',
       ''washer \\u2013\\u00a0in unit'', ''washer'', ''wifi'',
       ''wine glasses''],
      dtype='object', length=3218)


In [78]:
# MINOR STUFF
# data changed marginally, to make it compatible with textbook, we'll drop 27 rows.
not_in_book = pd.read_csv(data_in + "not_in_book.csv", delimiter=",", dtype="unicode")
df = df[~df.id.isin(not_in_book.id.astype(int))].reset_index(drop=True)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ghadena/Desktop/Business analytics/DA3/DA3/assignment_1/data/raw/not_in_book.csv'

In [79]:
# write csv
df.to_csv(data_out + "airbnb_madrid_cleaned.csv", index=False)