In [2]:
import pandas as pd

In [3]:
# Load the calendar, listings and reviews CSV exports (in that order)
cal, lis, rev = map(
    pd.read_csv,
    ('data/calendar2024.csv', 'data/listings2024.csv', 'data/reviews2024.csv'),
)

## TODO
- [ ] Many variables are stored as generic objects, instead of their actual datatype
- [ ] datetime columns are stored as objects
- [ ] lis.neighbourhood_cleansed has spelling mistakes
- [ ] first name of reviewer/host seems irrelevant when a unique ID is available
- [ ] NaN, missing, and null values are inconsistent
- [ ] Boolean is expressed as t/f
- [ ] Price is stored as a string in an arbitrary currency (is it $ or local?)

**lis csv** 
- property_type and room_type are stored as object dtype, but are categorical.
- bathroom and bathrooms_text are the same, but bathrooms_text has some values that are not in bathroom.
- bedrooms and beds are stored as float, but hold integer values.

- [ ] ETL: Extract, Transform, Load

---
> Attributes of datasets identified at initial look
- Keys: lis.id is the primary key of the listings table; cal.listing_id and rev.listing_id are foreign keys referencing it


### Clean

In [4]:
# Drop columns that are not needed. Frames are reassigned rather than mutated
# in place, and errors='ignore' lets this cell be re-run on a live kernel
# without raising KeyError for columns that are already gone.
cal = cal.drop(columns=['adjusted_price'], errors='ignore')

# Listing columns in which every value is missing
null_cols_lis = lis.columns[lis.isna().all()].tolist()
lis = lis.drop(columns=null_cols_lis)

# scrape_id and the host's first name are dropped from listings
lis = lis.drop(columns=['scrape_id', 'host_name'], errors='ignore')

# The reviewer's first name is dropped from reviews
rev = rev.drop(columns=['reviewer_name'], errors='ignore')

In [5]:
# cal['price'] = cal['price'].str.replace(r'[\$,]', '', regex=True)

# cal = cal.rename(columns={'price': 'price($)'})
# cal['price($)'] = pd.to_numeric(cal['price($)'], errors='coerce')


def tf_to_bool(flags: pd.Series) -> pd.Series:
    """Convert a column of 't'/'f' flags to pandas' nullable boolean dtype.

    Parameters
    ----------
    flags : pd.Series
        Column holding 't' / 'f' markers, possibly with missing values.

    Returns
    -------
    pd.Series
        'boolean'-dtype Series: 't' -> True, 'f' -> False, anything else
        (including NaN) -> <NA>. A Series that already has a boolean dtype is
        returned unchanged, so re-running the cell does not corrupt the data.
    """
    if pd.api.types.is_bool_dtype(flags):
        # Already converted; comparing booleans with 't' would turn everything False.
        return flags
    # A plain `flags == 't'` would silently report missing values as False.
    return flags.map({'t': True, 'f': False}).astype('boolean')


cal['available'] = tf_to_bool(cal['available'])
for flag_col in [
    'instant_bookable',
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
]:
    lis[flag_col] = tf_to_bool(lis[flag_col])


# Parse date columns into datetime64
cal['date'] = pd.to_datetime(cal['date'])
rev['date'] = pd.to_datetime(rev['date'])
lis['last_scraped'] = pd.to_datetime(lis['last_scraped'])
lis['host_since'] = pd.to_datetime(lis['host_since'])

# Persist the string dtype (the bare astype call discarded its result),
# then display the column as the cell output.
rev['comments'] = rev['comments'].astype("string")
rev['comments']

0         We had a great stay. Conveniently located, qui...
1         It was a very good stay. The appartment was re...
2         Really enjoyed my time at Ebbe's place.  It is...
3         The apartment was very well located, 10-15 min...
4         This is a great flat, very clean with everythi...
                                ...                        
366631                             Great apt in Copenhagen!
366632    I recently had the pleasure of staying at Emil...
366633                                                    👍
366634    Laura has been a great host, very nice and wel...
366635    Ellen Sophie's apartment is above all expectat...
Name: comments, Length: 366636, dtype: string

In [8]:
# Preview the first five rows of the listings frame
lis.head(5)
# calendar_df[calendar_df['listing_id'] == 31094]

Unnamed: 0,id,listing_url,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,host_url,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,31094,https://www.airbnb.com/rooms/31094,2024-06-30,previous scrape,"Beautiful, spacious, central, renovated Penthouse","Welcome to our home, we hope you will enjoy Wo...","What else is nearby?<br />To be honest, We thi...",https://a0.muscache.com/pictures/miso/Hosting-...,129976,https://www.airbnb.com/users/show/129976,...,4.87,4.82,4.8,4.53,False,1,1,0,0,0.11
1,262961,https://www.airbnb.com/rooms/262961,2024-06-30,city scrape,192m2 FLAT+ 8m2 BALCONY IN CENTER NØRREBRO HOOD,If you are looking for a large apartment in ce...,TIME OUT - The world’s coolest neighbourhoods ...,https://a0.muscache.com/pictures/261aa506-7b13...,1379904,https://www.airbnb.com/users/show/1379904,...,4.94,4.94,4.89,4.53,False,1,1,0,0,0.24
2,263036,https://www.airbnb.com/rooms/263036,2024-06-30,city scrape,Bright flat in central location,,,https://a0.muscache.com/pictures/17770169/f1fb...,1232471,https://www.airbnb.com/users/show/1232471,...,4.86,4.86,4.81,4.67,False,1,1,0,0,0.14
3,32379,https://www.airbnb.com/rooms/32379,2024-06-30,city scrape,"155 m2 artist flat on Vesterbro, with 2 bathrooms",You enter a narrow entrance and feel the good ...,"Værnedamsvej area is super hip area, we call i...",https://a0.muscache.com/pictures/miso/Hosting-...,140105,https://www.airbnb.com/users/show/140105,...,4.88,4.91,4.89,4.71,False,2,1,1,0,0.49
4,263708,https://www.airbnb.com/rooms/263708,2024-06-30,city scrape,Urban garden on Vesterbro rooftop,,,https://a0.muscache.com/pictures/8ce36f40-ac59...,1383888,https://www.airbnb.com/users/show/1383888,...,5.0,5.0,4.67,4.0,False,1,1,0,0,0.02


In [10]:
# Report (rows, columns) for each of the three frames
for label, frame in (
    ("Listings shape:", lis),
    ("Calendar shape:", cal),
    ("Reviews shape:", rev),
):
    print(label, frame.shape)

Listings shape: (20909, 70)
Calendar shape: (7631731, 6)
Reviews shape: (366636, 5)


In [7]:
# Cast the review comments to pandas' string dtype and store the result
rev['comments'] = rev['comments'].astype("string")

# Show the Python container type and the pandas dtype to confirm the cast was stored
for label, value in (
    ("Python type:", type(rev['comments'])),
    ("Pandas dtype:", rev['comments'].dtype),
):
    print(label, value)
# rev.info()
# Sanity check
# print("Calendar date range:", cal['datetime'].min(), "to", cal['datetime'].max())

Python type: <class 'pandas.core.series.Series'>
Pandas dtype: string


In [None]:
# Distinct values of the calendar price column (use .nunique() for just the count)
print(cal['price'].unique())

In [None]:
# Distinct values of the listings beds column
print(lis['beds'].unique())

In [None]:
# Plain-text dump of the instant_bookable column
print(lis['instant_bookable'])

In [None]:
# Dtype / non-null summary for the listings columns at positions 10 through 19
lis.iloc[:, 10:20].info()

In [75]:
# Keep every non-missing cell and substitute None where a value is missing.
# NOTE(review): whether numeric columns actually end up holding None (rather
# than NaN) appears to depend on the pandas version — confirm before relying on it.
lis = lis.where(lis.notna(), None)

In [None]:
# Dtype / non-null summary for every listings column except the last 20
lis.iloc[:, :-20].info()

In [None]:
# First two listings, leaving out the last 20 columns
lis.head(2).iloc[:, :-20]

In [None]:
# Count distinct non-null values once per column, then keep the columns
# that have fewer than three of them.
unique_counts = {col: lis[col].nunique() for col in lis.columns}
low_unique_cols = [col for col, n_unique in unique_counts.items() if n_unique < 3]

print("Columns with less than 3 unique values:")
for col in low_unique_cols:
    print(f"{col}: {unique_counts[col]}")

In [None]:
# Scratch code for the t/f -> boolean remap of instant_bookable, kept
# commented out: the live conversion is done in the cleaning cell earlier
# in the notebook, so re-enabling these lines would repeat it.
# remap to boolean + sanity check
# print(lis.groupby('instant_bookable').size())
# print(lis[['instant_bookable']].groupby('instant_bookable').head(2))

# lis['instant_bookable'] = lis['instant_bookable'] == 't'

# print(lis.groupby('instant_bookable').size())
# print(lis[['instant_bookable']].groupby('instant_bookable').head(2))

# reviewer_name is dropped from rev during cleaning, so the line below would
# fail if re-enabled after that cell has run.
# print(rev.reviewer_name)#.nunique())
# Display summary statistics for the reviews frame as the cell output
rev.describe()