In [2]:
import pandas as pd

In [47]:
# Load the three raw 2024 tables (calendar, listings, reviews) from disk.
csv_paths = (
    'data/calendar2024.csv',
    'data/listings2024.csv',
    'data/reviews2024.csv',
)
cal, lis, rev = (pd.read_csv(path) for path in csv_paths)


## TODO
- [ ] Many columns are stored with the generic `object` dtype instead of their actual data type (e.g. dates, `t`/`f` flags, prices)
- [ ] ETL: Extract, Transform, Load

---
> Attributes of datasets identified at initial look
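- Join keys: `lis.id` ↔ `cal.listing_id`, `rev.listing_id` (the latter two repeat across rows — one row per date / per review — so only `lis.id` can act as a primary key)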


### Clean

In [None]:
# Remove the 'adjusted_price' column from the calendar table.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution is a no-op rather than a KeyError.
cal = cal.drop(columns=['adjusted_price'], errors='ignore')

In [24]:
# Turn the raw price strings (e.g. "$250.00") into numbers and rename the
# column so the unit stays visible in its name.
# The guard makes the cell safe to re-run: once 'price' has been renamed to
# 'price($)' there is nothing left to convert, so the cell becomes a no-op
# instead of raising KeyError.
if 'price' in cal.columns:
    cal = (
        cal
        .assign(price=lambda df: pd.to_numeric(
            # Strip the dollar sign and thousands separators before parsing.
            df['price'].str.replace(r'[\$,]', '', regex=True),
            errors='coerce',  # values that still fail to parse become NaN
        ))
        .rename(columns={'price': 'price($)'})
    )

In [54]:
# Per-column flag: True where every value in the listings column is missing.
lis.isnull().all(axis=0)

id                                              False
listing_url                                     False
scrape_id                                       False
last_scraped                                    False
source                                          False
                                                ...  
calculated_host_listings_count                  False
calculated_host_listings_count_entire_homes     False
calculated_host_listings_count_private_rooms    False
calculated_host_listings_count_shared_rooms     False
reviews_per_month                               False
Length: 75, dtype: bool

In [52]:
# Names of the listings columns that contain nothing but nulls.
# (Swap `lis` for `cal` or `rev` to inspect the other two tables.)
all_null_mask = lis.isna().all()
lis.columns[all_null_mask]

Index(['neighbourhood_group_cleansed', 'calendar_updated', 'license'], dtype='object')

In [44]:
def _all_null_columns(frame):
    """Return the names of the columns in `frame` whose values are all null."""
    is_empty = frame.isna().all()
    return is_empty[is_empty].index.tolist()


# Work out, per table, which columns carry no data at all.
null_cols_cal = _all_null_columns(cal)
null_cols_lis = _all_null_columns(lis)
null_cols_rev = _all_null_columns(rev)

# Record what is about to be removed so the notebook documents the decision.
print("Dropping completely null columns:")
print(f"Calendar: {null_cols_cal}")
print(f"Listings: {null_cols_lis}")
print(f"Reviews: {null_cols_rev}")

# Remove the empty columns from each table.
cal, lis, rev = (
    cal.drop(columns=null_cols_cal),
    lis.drop(columns=null_cols_lis),
    rev.drop(columns=null_cols_rev),
)

Dropping completely null columns:
Calendar: []
Listings: ['neighbourhood_group_cleansed', 'calendar_updated', 'license']
Reviews: []


In [None]:
# Discard, in each of the three tables, every column that is entirely null.
cal, lis, rev = (
    frame.dropna(axis='columns', how='all')
    for frame in (cal, lis, rev)
)


In [36]:
# Report the listings columns that have very few distinct (non-null) values.
# The cut-off is named once so the filter and the printed header cannot drift
# apart (previously the code filtered on 5 while the text said 10).
UNIQUE_THRESHOLD = 5

# Count distinct values for every column in one pass instead of per column.
unique_counts = lis.nunique()
low_unique_cols = unique_counts[unique_counts < UNIQUE_THRESHOLD].index.tolist()

print(f"Columns with less than {UNIQUE_THRESHOLD} unique values:")
for col in low_unique_cols:
    print(f"{col}: {unique_counts[col]}")

Columns with less than 10 unique values:
scrape_id: 1
last_scraped: 2
source: 2
host_response_time: 4
host_is_superhost: 2
host_has_profile_pic: 2
host_identity_verified: 2
neighbourhood_group_cleansed: 0
room_type: 4
calendar_updated: 0
has_availability: 2
calendar_last_scraped: 2
license: 0
instant_bookable: 2
calculated_host_listings_count_shared_rooms: 4


In [5]:
# Preview the first five calendar rows.
# To inspect a single listing instead: cal[cal['listing_id'] == 31094]
cal.head(5)

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,586421,2024-06-30,f,$250.00,,4.0,31.0
1,586421,2024-07-01,f,$250.00,,4.0,31.0
2,586421,2024-07-02,t,$250.00,,4.0,31.0
3,586421,2024-07-03,f,$250.00,,4.0,31.0
4,586421,2024-07-04,f,$250.00,,4.0,31.0


In [43]:
# Inspect the 'neighbourhood_group_cleansed' column of the listings table.
# The all-null cleanup earlier in the notebook removes this column, so a
# top-to-bottom run would otherwise fail here; report that case instead.
group_col = 'neighbourhood_group_cleansed'
if group_col in lis.columns:
    print(lis[group_col])
else:
    print(f"{group_col!r} is no longer present in lis (dropped as all-null)")

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
20904   NaN
20905   NaN
20906   NaN
20907   NaN
20908   NaN
Name: neighbourhood_group_cleansed, Length: 20909, dtype: float64


In [8]:
# Preview the first five review rows.
# The reviews table is loaded as `rev`; `reviews_df` is never defined in this
# notebook and would raise NameError on a fresh kernel.
rev.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,31094,79346,2010-08-16,171607,Ben,"We had a great stay. Conveniently located, qui..."
1,31094,166275,2011-01-05,306860,Makita,It was a very good stay. The appartment was re...
2,31094,1452299,2012-06-10,1321058,Pierre,Really enjoyed my time at Ebbe's place. It is...
3,31094,6766430,2013-08-24,2182771,Sussie,"The apartment was very well located, 10-15 min..."
4,31094,6827217,2013-08-26,8025926,Wil,"This is a great flat, very clean with everythi..."


In [12]:
# Summarise the listings table: per-column non-null counts and dtypes.
lis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20909 entries, 0 to 20908
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            20909 non-null  int64  
 1   listing_url                                   20909 non-null  object 
 2   scrape_id                                     20909 non-null  int64  
 3   last_scraped                                  20909 non-null  object 
 4   source                                        20909 non-null  object 
 5   name                                          20909 non-null  object 
 6   description                                   20235 non-null  object 
 7   neighborhood_overview                         8984 non-null   object 
 8   picture_url                                   20909 non-null  object 
 9   host_id                                       20909 non-null 

In [22]:
# Show the reviewer_name column of the reviews table.
# Uses `rev`, the name the reviews CSV is actually loaded under; the previous
# `reviews_df` is undefined in this notebook.
print(rev['reviewer_name'])

0               Ben
1            Makita
2            Pierre
3            Sussie
4               Wil
            ...    
366631      Killian
366632    Sreeremya
366633         Knut
366634      Cyprien
366635      Camille
Name: reviewer_name, Length: 366636, dtype: object


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# listings columns.
lis.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,latitude,longitude,accommodates,bathrooms,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
count,20909.0,20909.0,20909.0,20908.0,20908.0,0.0,20909.0,20909.0,20909.0,13660.0,...,17665.0,17665.0,17664.0,17664.0,0.0,20909.0,20909.0,20909.0,20909.0,17689.0
mean,5.137813e+17,20240630000000.0,144269000.0,5.942606,12.671992,,55.680567,12.55855,3.336315,1.10399,...,4.880881,4.917725,4.840928,4.724537,,4.659572,4.458415,0.195849,0.004209,0.776318
std,4.737075e+17,0.0,168721400.0,32.713862,84.689161,,0.019127,0.031304,1.644905,0.326999,...,0.219335,0.191054,0.233053,0.308463,,26.246421,26.226017,0.865031,0.098688,1.124821
min,31094.0,20240630000000.0,513.0,1.0,1.0,,55.61566,12.454,1.0,0.0,...,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.01
25%,31404500.0,20240630000000.0,17396510.0,1.0,1.0,,55.66628,12.54073,2.0,1.0,...,4.83,4.9,4.76,4.62,,1.0,1.0,0.0,0.0,0.19
50%,6.452861e+17,20240630000000.0,64311410.0,1.0,1.0,,55.68193,12.55517,3.0,1.0,...,4.97,5.0,4.92,4.78,,1.0,1.0,0.0,0.0,0.42
75%,9.441256e+17,20240630000000.0,209703000.0,1.0,2.0,,55.696041,12.580508,4.0,1.0,...,5.0,5.0,5.0,5.0,,1.0,1.0,0.0,0.0,0.93
max,1.189116e+18,20240630000000.0,586235800.0,667.0,1972.0,,55.73247,12.63972,16.0,8.0,...,5.0,5.0,5.0,5.0,,239.0,239.0,16.0,3.0,42.5
