In [38]:
import pandas as pd
import numpy as np

# Load the raw listings export into a DataFrame.
file_path = "Airbnb/listings.csv"
df = pd.read_csv(file_path)

In [39]:
# Preview the first rows of the freshly loaded listings table.
df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,264776,https://www.airbnb.com/rooms/264776,20250610032232,2025-06-11,city scrape,Huge Four Bedroom Apartment,An extremely large and sunny four bedroom grou...,Catford is a well established London suburb. T...,https://a0.muscache.com/pictures/hosting/Hosti...,1389063,...,4.74,4.62,4.72,,t,11,11,0,0,0.51
1,264777,https://www.airbnb.com/rooms/264777,20250610032232,2025-06-11,city scrape,One Bedroom Apartment,Recently refurbished sunny one bedroom first f...,,https://a0.muscache.com/pictures/hosting/Hosti...,1389063,...,4.25,4.54,4.42,,t,11,11,0,0,0.22
2,264778,https://www.airbnb.com/rooms/264778,20250610032232,2025-06-11,city scrape,Two Bedroom Newly Refurbished Apartment,A large and sunny two bedroom second floor apa...,Catford is a well established London suburb. T...,https://a0.muscache.com/pictures/50662093/af12...,1389063,...,4.52,4.36,4.38,,t,11,11,0,0,0.43
3,264779,https://www.airbnb.com/rooms/264779,20250610032232,2025-06-11,city scrape,Refurbished Two Bedroom Apartment,A large and sunny two bedroom second floor apa...,Catford is a well established London suburb. T...,https://a0.muscache.com/pictures/50660860/e440...,1389063,...,4.61,4.5,4.47,,t,11,11,0,0,0.3
4,264780,https://www.airbnb.com/rooms/264780,20250610032232,2025-06-11,city scrape,Spacious refurbished 2 bedroom apt with balcony,Completely refurbished 2 bedroom apt to sleep ...,,https://a0.muscache.com/pictures/airflow/Hosti...,1389063,...,4.74,4.37,4.59,,t,11,11,0,0,0.35


In [40]:
# Columns retained for the rest of the notebook, in output order.
cols_to_keep = [
    # listing identity
    "id",
    "listing_url",
    "description",
    # host
    "host_id",
    "host_name",
    "host_is_superhost",
    "host_neighbourhood",
    "host_listings_count",
    "host_total_listings_count",
    # location
    "neighbourhood_cleansed",
    "latitude",
    "longitude",
    # property
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "bathrooms_text",
    "bedrooms",
    "beds",
    # reviews / occupancy
    "number_of_reviews_ltm",
    "estimated_occupancy_l365d",
    "first_review",
    "last_review",
    # per-host listing counts
    "calculated_host_listings_count",
    "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms",
    "calculated_host_listings_count_shared_rooms",
    "reviews_per_month",
]

# Raises KeyError if any listed column is absent from the CSV.
df = df[cols_to_keep]

In [41]:
# Inspect the raw last_review values before any date parsing is applied.
df["last_review"].head(20)

0     2025-05-28
1     2024-12-11
2     2025-05-01
3     2025-04-10
4     2024-12-29
5     2023-08-12
6     2025-05-20
7     2025-05-04
8     2025-04-19
9     2025-05-15
10    2025-05-15
11    2025-04-15
12    2017-05-27
13    2017-03-01
14    2023-07-25
15    2024-12-16
16           NaN
17           NaN
18    2023-01-03
19    2024-09-24
Name: last_review, dtype: object

In [42]:
# Drop rows whose host_id is missing.
# .copy() detaches the result from the previous frame so that the column
# assignments in later cells operate on an independent DataFrame and do not
# trigger pandas' SettingWithCopyWarning when rows were actually removed.
df = df.dropna(subset=["host_id"]).copy()

# Columns that are expected to hold numeric values and will be coerced to
# a numeric dtype.
numeric_columns = [
    "latitude", "longitude", "accommodates", "bathrooms", "bedrooms", "beds",
    "host_listings_count", "host_total_listings_count",
    "number_of_reviews_ltm", "estimated_occupancy_l365d",
    "calculated_host_listings_count", "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms", "calculated_host_listings_count_shared_rooms",
    "reviews_per_month"
]

In [43]:
# Coerce each expected numeric column that is present in the frame.
# errors="coerce" turns values that cannot be parsed into NaN.
for col in numeric_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Keep only rows with coordinates inside the valid latitude/longitude ranges.
# `between` evaluates to False for NaN, so rows with a missing coordinate are
# removed by this filter as well.
valid_coords = df["latitude"].between(-90, 90) & df["longitude"].between(-180, 180)

# .copy() gives an independent frame, so the column assignments in later cells
# do not raise SettingWithCopyWarning when the filter removed rows.
df = df[valid_coords].copy()

In [44]:
# convert datetime + active list (last 12 months)

# Parse the review dates; values that cannot be parsed become NaT.
df["first_review"] = pd.to_datetime(df["first_review"], errors="coerce")
df["last_review"] = pd.to_datetime(df["last_review"], errors="coerce")

# Anchor the 12-month window to the newest review in the data instead of the
# wall clock, so the result does not change with the day the notebook is run.
reference_date = df["last_review"].max()
cutoff_date = reference_date - pd.DateOffset(months=12)

# Rows with no last_review (NaT) compare as False and are excluded.
active_list = df[df["last_review"] >= cutoff_date].copy()
active_list["last_review"].head()

0   2025-05-28
1   2024-12-11
2   2025-05-01
3   2025-04-10
4   2024-12-29
Name: last_review, dtype: datetime64[ns]

In [63]:
# Sanity check: list rows whose latitude or longitude is missing.
# The coordinate range filter applied earlier uses `between`, which is False
# for NaN, so this selection is expected to come back empty.
df_invalid_numeric = df[df["latitude"].isna() | df["longitude"].isna()]
df_invalid_numeric.head()

Unnamed: 0,id,listing_url,description,host_id,host_name,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,neighbourhood_cleansed,...,beds,number_of_reviews_ltm,estimated_occupancy_l365d,first_review,last_review,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month


In [67]:
# Report rows that have at least one missing value, without modifying `df`.
# Filling gaps with "" in place would also hit the datetime columns, which can
# turn them into object dtype and break later date comparisons, so the check
# is done on boolean masks only.
has_nan = df.isna().any(axis=1)   # NaN / None / NaT in any column
has_blank = df.eq("").any(axis=1)  # literal empty strings in any column

# any NaN/""
df_na = df[has_nan | has_blank].copy()

print("Na/Null/：", len(df_na))
df_na.head()


Na/Null/： 76272


Unnamed: 0,id,listing_url,description,host_id,host_name,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,neighbourhood_cleansed,...,beds,number_of_reviews_ltm,estimated_occupancy_l365d,first_review,last_review,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
11,270600,https://www.airbnb.com/rooms/270600,This immaculate property is ideal for easy acc...,1416605,Emma,t,,3.0,3.0,Richmond upon Thames,...,1.0,7,64,2013-07-17,2025-04-15,3,2,1,0,0.61
12,271122,https://www.airbnb.com/rooms/271122,"This room is by far my favourite, it has a coz...",1419096,Claudia,f,LB of Hillingdon,4.0,4.0,Hillingdon,...,,0,0,2013-06-28,2017-05-27,4,1,3,0,0.05
13,425143,https://www.airbnb.com/rooms/425143,,2011372,Cagdas,f,Nine Elms,1.0,1.0,Wandsworth,...,,0,0,2012-07-03,2017-03-01,1,1,0,0,0.22
16,427229,https://www.airbnb.com/rooms/427229,,2087745,Tony,f,RB of Greenwich,2.0,2.0,Greenwich,...,,0,0,NaT,NaT,2,1,1,0,
17,427327,https://www.airbnb.com/rooms/427327,,2124088,Corrine,f,Shoreditch,3.0,7.0,Hackney,...,1.0,0,0,NaT,NaT,1,1,0,0,


In [46]:
# Preview the first rows of the working table after column selection and cleaning.
df.head()

Unnamed: 0,id,listing_url,description,host_id,host_name,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,neighbourhood_cleansed,...,beds,number_of_reviews_ltm,estimated_occupancy_l365d,first_review,last_review,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,264776,https://www.airbnb.com/rooms/264776,An extremely large and sunny four bedroom grou...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,8.0,12,110,2014-07-03,2025-05-28,11,11,0,0,0.51
1,264777,https://www.airbnb.com/rooms/264777,Recently refurbished sunny one bedroom first f...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,2.0,4,37,2016-07-21,2024-12-11,11,11,0,0,0.22
2,264778,https://www.airbnb.com/rooms/264778,A large and sunny two bedroom second floor apa...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,3.0,6,55,2014-04-21,2025-05-01,11,11,0,0,0.43
3,264779,https://www.airbnb.com/rooms/264779,A large and sunny two bedroom second floor apa...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,5.0,7,64,2015-07-23,2025-04-10,11,11,0,0,0.3
4,264780,https://www.airbnb.com/rooms/264780,Completely refurbished 2 bedroom apt to sleep ...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,4.0,4,37,2012-09-18,2024-12-29,11,11,0,0,0.35


In [47]:
# List the distinct room_type values present in the table.
unique_room_types = df["room_type"].unique()
print("room_type：", unique_room_types, sep="\n")

room_type：
['Entire home/apt' 'Private room' 'Hotel room' 'Shared room']


In [48]:
# Write the cleaned table to CSV (UTF-8 with BOM, index column omitted).
output_path = "Airbnb/cleaned_listing_all.csv"
df.to_csv(output_path, encoding="utf-8-sig", index=False)

print("success：", output_path)


success： Airbnb/cleaned_listing_all.csv


In [64]:
# Subset of listings whose room_type is "Entire home/apt".
entire_home = df[df["room_type"] == "Entire home/apt"].copy()

print("number of Entire home：", len(entire_home))

# Written as utf-8-sig like the notebook's other CSV exports, so all output
# files share one encoding.
entire_home.to_csv("Airbnb/entire_home_listings.csv", index=False, encoding="utf-8-sig")

number of Entire home： 62456


In [49]:

# Export a copy of the table with the description column removed.
output_path = "Airbnb/cleaned_drop_des_list.csv"
df_drop_des = df.drop(columns=["description"])
df_drop_des.to_csv(output_path, encoding="utf-8-sig", index=False)

print("success：", output_path)


success： Airbnb/cleaned_drop_des_list.csv


In [52]:
# dataset active list (12 months)

# Re-parse defensively so the comparison below still works if the column is
# no longer datetime64 (for example after placeholder strings were filled in).
# Already-parsed datetimes pass through unchanged; unparseable values become NaT.
last_review = pd.to_datetime(df["last_review"], errors="coerce")

# Anchor the 12-month window to the newest review in the data instead of the
# wall clock, so the result does not change with the day the notebook is run.
reference_date = last_review.max()
cutoff_date = reference_date - pd.DateOffset(months=12)

# Rows with no last_review (NaT) compare as False and are excluded.
active_list = df[last_review >= cutoff_date].copy()

print("number of active_list：", len(active_list))
active_list.head()

number of active_list： 40519


Unnamed: 0,id,listing_url,description,host_id,host_name,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,neighbourhood_cleansed,...,beds,number_of_reviews_ltm,estimated_occupancy_l365d,first_review,last_review,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,264776,https://www.airbnb.com/rooms/264776,An extremely large and sunny four bedroom grou...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,8.0,12,110,2014-07-03,2025-05-28,11,11,0,0,0.51
1,264777,https://www.airbnb.com/rooms/264777,Recently refurbished sunny one bedroom first f...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,2.0,4,37,2016-07-21,2024-12-11,11,11,0,0,0.22
2,264778,https://www.airbnb.com/rooms/264778,A large and sunny two bedroom second floor apa...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,3.0,6,55,2014-04-21,2025-05-01,11,11,0,0,0.43
3,264779,https://www.airbnb.com/rooms/264779,A large and sunny two bedroom second floor apa...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,5.0,7,64,2015-07-23,2025-04-10,11,11,0,0,0.3
4,264780,https://www.airbnb.com/rooms/264780,Completely refurbished 2 bedroom apt to sleep ...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,4.0,4,37,2012-09-18,2024-12-29,11,11,0,0,0.35


In [53]:
# Inspect last_review again to compare against the earlier raw preview.
df["last_review"].head(20)

0    2025-05-28
1    2024-12-11
2    2025-05-01
3    2025-04-10
4    2024-12-29
5    2023-08-12
6    2025-05-20
7    2025-05-04
8    2025-04-19
9    2025-05-15
10   2025-05-15
11   2025-04-15
12   2017-05-27
13   2017-03-01
14   2023-07-25
15   2024-12-16
16          NaT
17          NaT
18   2023-01-03
19   2024-09-24
Name: last_review, dtype: datetime64[ns]

In [55]:
# Write the active-listing subset to CSV (UTF-8 with BOM, index column omitted).
output_path = "Airbnb/active_listing_all.csv"
active_list.to_csv(output_path, encoding="utf-8-sig", index=False)

print("success：", output_path)


success： Airbnb/active_listing_all.csv


In [57]:
# Preview the first 20 rows of the active-listing subset.
active_list.head(20)

Unnamed: 0,id,listing_url,description,host_id,host_name,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,neighbourhood_cleansed,...,beds,number_of_reviews_ltm,estimated_occupancy_l365d,first_review,last_review,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,264776,https://www.airbnb.com/rooms/264776,An extremely large and sunny four bedroom grou...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,8.0,12,110,2014-07-03,2025-05-28,11,11,0,0,0.51
1,264777,https://www.airbnb.com/rooms/264777,Recently refurbished sunny one bedroom first f...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,2.0,4,37,2016-07-21,2024-12-11,11,11,0,0,0.22
2,264778,https://www.airbnb.com/rooms/264778,A large and sunny two bedroom second floor apa...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,3.0,6,55,2014-04-21,2025-05-01,11,11,0,0,0.43
3,264779,https://www.airbnb.com/rooms/264779,A large and sunny two bedroom second floor apa...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,5.0,7,64,2015-07-23,2025-04-10,11,11,0,0,0.3
4,264780,https://www.airbnb.com/rooms/264780,Completely refurbished 2 bedroom apt to sleep ...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,4.0,4,37,2012-09-18,2024-12-29,11,11,0,0,0.35
6,264782,https://www.airbnb.com/rooms/264782,A large and sunny one bedroom ground floor apa...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,3.0,4,37,2018-06-30,2025-05-20,11,11,0,0,0.2
7,264783,https://www.airbnb.com/rooms/264783,A large and sunny four bedroom ground floor ap...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,6.0,4,37,2013-10-07,2025-05-04,11,11,0,0,0.31
8,264789,https://www.airbnb.com/rooms/264789,A large and sunny four bedroom first floor apa...,1389063,Sue,f,Bellingham,11.0,12.0,Lewisham,...,4.0,12,110,2015-02-15,2025-04-19,11,11,0,0,0.52
9,266037,https://www.airbnb.com/rooms/266037,We've welcomed over 500 guests and received ex...,1395281,James,t,St. Pancras,1.0,3.0,Camden,...,1.0,18,252,2011-11-20,2025-05-15,1,0,1,0,3.22
10,268398,https://www.airbnb.com/rooms/268398,"This is a clean and private room, with your ow...",979363,Donovan,t,South Bank,6.0,7.0,Lambeth,...,2.0,4,37,2011-12-09,2025-05-15,6,1,5,0,3.42
