In [29]:
import pandas as pd

In [30]:
# Read the CSV files
cal = pd.read_csv('data/raw/calendar2024.csv')
lis = pd.read_csv('data/raw/listings2024.csv') 
rev = pd.read_csv('data/raw/reviews2024.csv')
print("Listings shape:", lis.shape)
print("Calendar shape:", cal.shape) 
print("Reviews shape:", rev.shape)

Listings shape: (20909, 75)
Calendar shape: (7631731, 7)
Reviews shape: (366636, 6)


## TODO
- [ ] Many variables are stored as generic objects, instead of their actual datatype
- [ ] datetime columns are stored as objects
- [ ] lis.neighbourhood_cleansed has spelling mistakes
- [ ] first name of reviewer/host seems irrelevant, when unique-ID is available
- [ ] NaN, missing, and null values are inconsistent
- [ ] Boolean is expressed as t/f
- [ ] Price is stored as a string in an arbitrary currency (is it $ or local?)
- [ ] lis csv. host_verifications, and amenities are categorical, but are stored as objects. Depending on how many amenities there are, it may/may not be better to store as a categorical – but host_verification is a short enough list. Same goes for source.

**lis csv** 
- property_type, and room_type are mapped as obj., is categorial
- bathroom and bathrooms_text are the same, but bathrooms_text has some values that are not in bathroom.
- bedrooms, and beds	are stored as float, but is int.

- [ ] ETL: Extract, Transform, Load

---
> Attributes of datasets identified at initial look
- Primary keys: cal.listing_id, list.id, rev.listing_id


# Clean

In [None]:
null_cols_lis = lis.columns[lis.isna().all()].tolist()
lis = lis.drop(columns=null_cols_lis)

lis.drop(columns=['scrape_id', 'host_name', 'picture_url', 'host_url', 'host_thumbnail_url', 'host_picture_url'], inplace=True)
cal.drop(columns=['adjusted_price'], inplace=True)
rev.drop(columns=['reviewer_name'], inplace=True)


def convert_to_boolean(df, columns, true_value='t'):
    """Convert specified columns from string indicators to boolean"""
    for col in columns:
        df[col] = df[col] == true_value
    return df

boolean_cols = ['instant_bookable', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'has_availability']
lis = convert_to_boolean(lis, boolean_cols)
cal['available'] = cal['available'] == 't'


def convert_to_datetime(df, columns):
    """Convert specified columns to datetime"""
    for col in columns:
        df[col] = pd.to_datetime(df[col])
    return df

datetime_cols_lis = ['calendar_last_scraped', 'first_review', 'last_review', 'last_scraped', 'host_since']
lis = convert_to_datetime(lis, datetime_cols_lis)
cal['date'] = pd.to_datetime(cal['date'])
rev['date'] = pd.to_datetime(rev['date'])


def convert_to_type(df, columns, dtype):
    """Convert specified columns to given dtype"""
    for col in columns:
        df[col] = df[col].astype(dtype)
    return df

string_columns = ['bathrooms_text', 'neighbourhood', 'neighbourhood_cleansed', 'property_type', 'room_type', 'host_location', 'host_about', 'host_neighbourhood', 'listing_url', 'host_response_time', 'source', 'name','description','neighborhood_overview']
lis = convert_to_type(lis, string_columns, "string")
rev['comments'] = rev['comments'].astype("string")


percentage_cols = ['host_response_rate', 'host_acceptance_rate']
for col in percentage_cols:
    lis = lis.rename(columns={col: f"{col}_pct"})
    lis[f"{col}_pct"] = lis[f"{col}_pct"].str.rstrip('%').astype('float') / 100

# Currency inconsistency adjustment
lis['price'] = lis['price'].str.replace(r'[\$,]', '', regex=True)
lis = lis.rename(columns={'price': 'price_DKK'})
lis['price_DKK'] = pd.to_numeric(lis['price_DKK'], errors='coerce')

cal['price'] = cal['price'].str.replace(r'[\$,]', '', regex=True)
cal = cal.rename(columns={'price': 'price_USD'})
cal['price_USD'] = pd.to_numeric(cal['price_USD'], errors='coerce')

########## Handling list columns ##########
# Count amenities and add new column
lis['amenities_count'] = lis.amenities.str.strip('[]').str.split(',').str.len()
# Encoding host_verifications
# First clean up the string representation of lists
lis['host_verifications'] = lis['host_verifications'].str.strip('[]').str.replace("'", "").str.split(', ')
# Create one-hot encoded columns
verification_dummies = lis['host_verifications'].str.join('|').str.get_dummies()
# Add prefix to avoid column name conflicts
verification_dummies = verification_dummies.add_prefix('verification_')
# Join with original dataframe if needed
lis = pd.concat([lis, verification_dummies], axis=1)

print("Listings shape:", lis.shape)
print("Calendar shape:", cal.shape) 
print("Reviews shape:", rev.shape)

Listings shape: (20909, 66)
Calendar shape: (7631731, 6)
Reviews shape: (366636, 5)


In [38]:
lis.dtypes.value_counts()

int64             20
float64           20
string[python]    14
datetime64[ns]     5
bool               5
object             2
Name: count, dtype: int64

In [41]:
lis.host_verifications

0                      ['email', 'phone']
1                      ['email', 'phone']
2                      ['email', 'phone']
3                      ['email', 'phone']
4        ['email', 'phone', 'work_email']
                       ...               
20904                           ['phone']
20905                  ['email', 'phone']
20906                  ['email', 'phone']
20907                  ['email', 'phone']
20908                  ['email', 'phone']
Name: host_verifications, Length: 20909, dtype: object

In [44]:
lis[lis.columns[-10:]]#.info()

Unnamed: 0,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,amenities_count,verification_email,verification_phone,verification_photographer,verification_work_email
0,1,1,0,0,0.11,56,1,1,0,0
1,1,1,0,0,0.24,46,1,1,0,0
2,1,1,0,0,0.14,16,1,1,0,0
3,2,1,1,0,0.49,43,1,1,0,0
4,1,1,0,0,0.02,35,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
20904,1,1,0,0,,6,0,1,0,0
20905,1,1,0,0,,5,1,1,0,0
20906,1,1,0,0,,6,1,1,0,0
20907,1,1,0,0,,29,1,1,0,0


In [26]:
lis[lis.columns[-20:]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20909 entries, 0 to 20908
Data columns (total 20 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   availability_365                              20909 non-null  int64         
 1   calendar_last_scraped                         20909 non-null  datetime64[ns]
 2   number_of_reviews                             20909 non-null  int64         
 3   number_of_reviews_ltm                         20909 non-null  int64         
 4   number_of_reviews_l30d                        20909 non-null  int64         
 5   first_review                                  17689 non-null  datetime64[ns]
 6   last_review                                   17689 non-null  datetime64[ns]
 7   review_scores_rating                          17689 non-null  float64       
 8   review_scores_accuracy                        17665 non-null  floa

In [25]:
lis.iloc[:10, 40:]

Unnamed: 0,minimum_nights_avg_ntm,maximum_nights_avg_ntm,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,3.0,10.0,True,0,0,0,0,2024-06-30,19,0,...,4.87,4.82,4.8,4.53,False,1,1,0,0,0.11
1,5.0,90.0,True,0,0,0,0,2024-06-30,36,2,...,4.94,4.94,4.89,4.53,False,1,1,0,0,0.24
2,4.0,10.0,True,0,4,32,32,2024-06-30,21,1,...,4.86,4.86,4.81,4.67,False,1,1,0,0,0.14
3,3.0,9.0,True,18,22,22,22,2024-06-30,82,3,...,4.88,4.91,4.89,4.71,False,2,1,1,0,0.49
4,7.0,60.0,True,10,17,45,45,2024-06-30,3,0,...,5.0,5.0,4.67,4.0,False,1,1,0,0,0.02
5,2.0,1125.0,False,0,0,0,0,2024-06-30,4,0,...,4.67,4.0,4.25,3.5,False,4,0,2,2,0.03
6,100.0,1125.0,True,0,0,0,0,2024-06-30,7,0,...,5.0,5.0,4.5,4.5,False,1,1,0,0,0.04
7,6.0,220.0,True,0,0,0,266,2024-06-30,17,0,...,4.35,4.65,4.71,4.24,False,1,1,0,0,0.12
8,3.0,14.0,True,1,1,1,1,2024-06-30,58,0,...,4.98,4.93,4.61,4.63,False,1,1,0,0,0.39
9,5.0,1125.0,True,0,7,20,20,2024-06-30,24,9,...,4.72,5.0,4.89,4.83,False,1,1,0,0,0.14


In [56]:
lis[lis.columns[10:20]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20909 entries, 0 to 20908
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   host_since            20908 non-null  datetime64[ns]
 1   host_location         17480 non-null  object        
 2   host_about            8866 non-null   object        
 3   host_response_time    14441 non-null  object        
 4   host_response_rate    14441 non-null  object        
 5   host_acceptance_rate  17161 non-null  object        
 6   host_is_superhost     20909 non-null  bool          
 7   host_thumbnail_url    20908 non-null  object        
 8   host_picture_url      20908 non-null  object        
 9   host_neighbourhood    5417 non-null   object        
dtypes: bool(1), datetime64[ns](1), object(8)
memory usage: 1.5+ MB


In [14]:
lis[lis.columns[10:20]]

Unnamed: 0,host_about,host_response_time,host_response_rate_pct,host_acceptance_rate_pct,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic
0,"Hi and welcome. My name is Ebbe, I am a medica...",,,,False,Vesterbro,1.0,1.0,"['email', 'phone']",True
1,Hej - I read the Master in Pedagogy. My husba...,,,1.0,False,Nørrebro,1.0,3.0,"['email', 'phone']",True
2,"Bla, bla, bla, bla",within a few hours,1.0,0.0,False,Nørrebro,1.0,1.0,"['email', 'phone']",True
3,As profession - Set and Costumedesigner for Av...,within an hour,1.0,1.0,True,Vesterbro,3.0,4.0,"['email', 'phone']",True
4,,within a day,0.9,0.0,False,Vesterbro,1.0,1.0,"['email', 'phone', 'work_email']",True
...,...,...,...,...,...,...,...,...,...,...
20904,,,,,False,,1.0,1.0,['phone'],True
20905,We're a 'young' (32 and 34) couple from Denmar...,,,,False,,1.0,1.0,"['email', 'phone']",False
20906,-,,,,False,,1.0,5.0,"['email', 'phone']",True
20907,"Hej, jeg er Razan Haugaard, en 51-årig selvstæ...",,,,False,,1.0,4.0,"['email', 'phone']",True


In [58]:
lis["host_response_rate"].str.rstrip("%").astype("float") / 100
lis["host_acceptance_rate"].str.rstrip("%").astype("float") / 100


0        NaN
1        NaN
2        1.0
3        1.0
4        0.9
        ... 
20904    NaN
20905    NaN
20906    NaN
20907    NaN
20908    NaN
Name: host_response_rate, Length: 20909, dtype: float64

In [None]:
print(cal.price.unique())#.nunique())

In [None]:
print(lis.beds.unique())

In [23]:
print(lis.amenities.str.strip('[]').str.split(',').str.len())

0        56
1        46
2        16
3        43
4        35
         ..
20904     6
20905     5
20906     6
20907    29
20908     2
Name: amenities, Length: 20909, dtype: int64


In [75]:
lis = lis.where(pd.notnull(lis), None)

## Checking currency inconsistency in cal and lis csvs

In [None]:
cal.loc[cal['listing_id'] == 262961]

Unnamed: 0,listing_id,date,available,price_USD,minimum_nights,maximum_nights
7203586,262961,2024-06-30,False,250.0,5.0,90.0
7203587,262961,2024-07-01,False,250.0,5.0,90.0
7203588,262961,2024-07-02,False,250.0,5.0,90.0
7203589,262961,2024-07-03,False,250.0,5.0,90.0
7203590,262961,2024-07-04,False,250.0,5.0,90.0
...,...,...,...,...,...,...
7203946,262961,2025-06-25,False,250.0,5.0,90.0
7203947,262961,2025-06-26,False,250.0,5.0,90.0
7203948,262961,2025-06-27,False,250.0,5.0,90.0
7203949,262961,2025-06-28,False,250.0,5.0,90.0


In [None]:
lis.loc[lis['id'] == 7631726, ['id', 'price_DKK']]

Unnamed: 0,id,price_DKK


In [None]:
lis[['id','listing_url', 'price_DKK']][:2]
# calendar_df[calendar_df['listing_id'] == 31094]

Unnamed: 0,id,listing_url,price_DKK
0,31094,https://www.airbnb.com/rooms/31094,
1,262961,https://www.airbnb.com/rooms/262961,1865.0


In [None]:
# Convert to datetime
rev['comments'] = rev['comments'].astype("string")

# Check both type and dtype since astype(str) was called but comments may still not be string dtype
print("Python type:", type(rev['comments']))
print("Pandas dtype:", rev['comments'].dtype)
# rev.info()
# Sanity check
# print("Calendar date range:", cal['datetime'].min(), "to", cal['datetime'].max())

Python type: <class 'pandas.core.series.Series'>
Pandas dtype: string


In [None]:
# Get columns with less than 3 unique values
low_unique_cols = [col for col in lis.columns if lis[col].nunique() < 3]
print("Columns with less than 3 unique values:")
for col in low_unique_cols:
    print(f"{col}: {lis[col].nunique()}")

In [None]:
# remap to boolean + sanity check
# print(lis.groupby('instant_bookable').size())
# print(lis[['instant_bookable']].groupby('instant_bookable').head(2))

# lis['instant_bookable'] = lis['instant_bookable'] == 't'

# print(lis.groupby('instant_bookable').size())
# print(lis[['instant_bookable']].groupby('instant_bookable').head(2))

# print(rev.reviewer_name)#.nunique())
rev.describe()