# Airbnb Property Listing - Data Preparation

## Library Imports

In [2]:
import pandas as pd
import numpy as np
import os
from zipfile import ZipFile
from ast import literal_eval

## Download the data

In [4]:
# Fetch the dataset archive only when it is not already present in the working
# directory, so re-running the notebook does not repeat the download (the
# recorded run below reports a 340M file).
# NOTE(review): `!wget` is an IPython shell escape, so this cell needs the
# `wget` binary on PATH and only runs inside IPython/Jupyter. The existence
# check does not detect a partially downloaded archive — confirm that is
# acceptable.
if not os.path.exists('./airbnb-property-listings.zip'):
    !wget "https://aicore-project-files.s3.eu-west-1.amazonaws.com/airbnb-property-listings.zip"

--2022-10-28 14:54:36--  https://aicore-project-files.s3.eu-west-1.amazonaws.com/airbnb-property-listings.zip
Resolving aicore-project-files.s3.eu-west-1.amazonaws.com (aicore-project-files.s3.eu-west-1.amazonaws.com)... 52.92.18.106, 52.218.98.56, 52.218.116.90, ...
Connecting to aicore-project-files.s3.eu-west-1.amazonaws.com (aicore-project-files.s3.eu-west-1.amazonaws.com)|52.92.18.106|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 356510420 (340M) [application/zip]
Saving to: ‘airbnb-property-listings.zip’


2022-10-28 14:55:00 (14.7 MB/s) - ‘airbnb-property-listings.zip’ saved [356510420/356510420]



## Data Exploration

In [3]:
# Read the tabular listings CSV straight out of the archive without extracting it.
# The CSV is parsed while both the archive and the member handle are still open,
# and both are closed deterministically as soon as the frame has been loaded.
with ZipFile('./airbnb-property-listings.zip') as myzip:
    with myzip.open("AirbnbDataSci/tabular_data/AirBnbData.csv") as data:
        data_df = pd.read_csv(data)

# Preview the first rows of the raw frame.
data_df.head()

Unnamed: 0,ID,Category,Title,Description,Amenities,Location,guests,beds,bathrooms,Price_Night,Cleanliness_rate,Accuracy_rate,Communication_rate,Location_rate,Check-in_rate,Value_rate,amenities_count,url,bedrooms
0,f9dcbd09-32ac-41d9-a0b1-fdb2793378cf,Treehouses,Red Kite Tree Tent - Ynys Affalon,"['About this space', ""Escape to one of these t...","['What this place offers', 'Bathroom', 'Shampo...",Llandrindod Wells United Kingdom,2.0,1.0,1.0,105.0,4.6,4.7,4.3,5.0,4.3,4.3,13.0,https://www.airbnb.co.uk/rooms/26620994?adults...,
1,1b4736a7-e73e-45bc-a9b5-d3e7fcf652fd,Treehouses,Az Alom Cabin - Treehouse Tree to Nature Cabin,"['About this space', ""Come and spend a romanti...","['What this place offers', 'Bedroom and laundr...",Guyonvelle Grand Est France,3.0,3.0,0.0,92.0,4.3,4.7,4.6,4.9,4.7,4.5,8.0,https://www.airbnb.co.uk/rooms/27055498?adults...,1.0
2,d577bc30-2222-4bef-a35e-a9825642aec4,Treehouses,Cabane Entre Les Pins\n🌲🏕️🌲,"['About this space', 'Rustic cabin between the...","['What this place offers', 'Scenic views', 'Ga...",Duclair Normandie France,4.0,2.0,1.5,52.0,4.2,4.6,4.8,4.8,4.8,4.7,51.0,https://www.airbnb.co.uk/rooms/51427108?adults...,1.0
3,ca9cbfd4-7798-4e8d-8c17-d5a64fba0abc,Treehouses,Tree Top Cabin with log burner & private hot tub,"['About this space', 'The Tree top cabin is si...","['What this place offers', 'Bathroom', 'Hot wa...",Barmouth Wales United Kingdom,2.0,,1.0,132.0,4.8,4.9,4.9,4.9,5.0,4.6,23.0,https://www.airbnb.co.uk/rooms/49543851?adults...,
4,8b2d0f78-16d8-4559-8692-62ebce2a1302,Treehouses,Hanging cabin,"['About this space', 'Feel refreshed at this u...","['What this place offers', 'Heating and coolin...",Wargnies-le-Petit Hauts-de-France France,2.0,1.0,,111.0,,,,,,,5.0,https://www.airbnb.co.uk/rooms/50166553?adults...,1.0


In [4]:
# (rows, columns) of the raw listings frame.
data_df.shape

(988, 19)

In [77]:
def tweak_data(df):
    """Return a cleaned copy of the raw listings frame.

    The steps are applied in this order:

    1. Lower-case every column name.
    2. Cast ``category`` to the pandas ``category`` dtype.
    3. Convert each ``description`` from the string repr of a list into plain
       text: empty-string entries are removed, the list is parsed, its first
       element is discarded (the 'About this space' heading in the rows
       previewed in this notebook) and the rest is joined with single spaces.
       Missing descriptions are passed through untouched.
    4. Drop rows that have no ``cleanliness_rate`` or no ``description``.
    5. Fill missing ``guests``, ``beds``, ``bathrooms`` and ``bedrooms`` with 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. After lower-casing it must provide the columns
        ``category``, ``description``, ``cleanliness_rate``, ``guests``,
        ``beds``, ``bathrooms`` and ``bedrooms``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` is not modified. Surviving rows keep their
        original index labels (the index is not reset).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when a non-missing description is
        not a valid Python literal.
    """
    def _description_to_text(raw):
        # pd.isna covers NaN, None and pd.NA alike, so missing values are never
        # handed to literal_eval; they are removed by the dropna step below.
        if pd.isna(raw):
            return raw
        return ' '.join(literal_eval(raw)[1:])

    return (
        df.rename(columns=str.lower)
        .assign(
            category=lambda df_: df_.category.astype('category'),
            # regex=False: the pattern is a literal substring, and the default
            # for `regex` differs between pandas versions.
            description=lambda df_: (
                df_.description
                .str.replace("'', ", '', regex=False)
                .apply(_description_to_text)
            ),
        )
        .dropna(axis=0, subset=['cleanliness_rate', 'description'])
        .assign(
            guests=lambda df_: df_.guests.fillna(1),
            beds=lambda df_: df_.beds.fillna(1),
            bathrooms=lambda df_: df_.bathrooms.fillna(1),
            bedrooms=lambda df_: df_.bedrooms.fillna(1),
        )
    )
# Build the cleaned frame from the raw one (data_df itself is left as loaded)
# and preview its first rows.
tweaked_df = tweak_data(data_df)
tweaked_df.head()

Unnamed: 0,id,category,title,description,amenities,location,guests,beds,bathrooms,price_night,cleanliness_rate,accuracy_rate,communication_rate,location_rate,check-in_rate,value_rate,amenities_count,url,bedrooms
0,f9dcbd09-32ac-41d9-a0b1-fdb2793378cf,Treehouses,Red Kite Tree Tent - Ynys Affalon,Escape to one of these two fabulous Tree Tents...,"['What this place offers', 'Bathroom', 'Shampo...",Llandrindod Wells United Kingdom,2.0,1.0,1.0,105.0,4.6,4.7,4.3,5.0,4.3,4.3,13.0,https://www.airbnb.co.uk/rooms/26620994?adults...,1.0
1,1b4736a7-e73e-45bc-a9b5-d3e7fcf652fd,Treehouses,Az Alom Cabin - Treehouse Tree to Nature Cabin,Come and spend a romantic stay with a couple o...,"['What this place offers', 'Bedroom and laundr...",Guyonvelle Grand Est France,3.0,3.0,0.0,92.0,4.3,4.7,4.6,4.9,4.7,4.5,8.0,https://www.airbnb.co.uk/rooms/27055498?adults...,1.0
2,d577bc30-2222-4bef-a35e-a9825642aec4,Treehouses,Cabane Entre Les Pins\n🌲🏕️🌲,"Rustic cabin between the pines, 3 meters high ...","['What this place offers', 'Scenic views', 'Ga...",Duclair Normandie France,4.0,2.0,1.5,52.0,4.2,4.6,4.8,4.8,4.8,4.7,51.0,https://www.airbnb.co.uk/rooms/51427108?adults...,1.0
3,ca9cbfd4-7798-4e8d-8c17-d5a64fba0abc,Treehouses,Tree Top Cabin with log burner & private hot tub,The Tree top cabin is situated in our peaceful...,"['What this place offers', 'Bathroom', 'Hot wa...",Barmouth Wales United Kingdom,2.0,1.0,1.0,132.0,4.8,4.9,4.9,4.9,5.0,4.6,23.0,https://www.airbnb.co.uk/rooms/49543851?adults...,1.0
5,cfe479b9-c8f8-44af-9bc6-46ede9f14bb5,Treehouses,Treehouse near Paris Disney,"Charming cabin nestled in the leaves, real unu...","['What this place offers', 'Bathroom', 'Hair d...",Le Plessis-Feu-Aussoux Île-de-France France,4.0,3.0,1.0,143.0,5.0,4.9,5.0,4.7,5.0,4.7,32.0,https://www.airbnb.co.uk/rooms/935398?adults=1...,2.0


In [78]:
# Number of cleaned listings in each category.
tweaked_df.category.value_counts()

Chalets          192
Treehouses       183
Amazing pools    166
Offbeat          165
Beachfront       124
Name: category, dtype: int64

In [79]:
# Some titles are repeated, which is a bit strange.
# Count every title once, then keep only those that occur on more than one row.
tweaked_df.title.value_counts().loc[lambda counts: counts > 1]

The Pool House                                                 5
The best in tiny living!\nTreehouseTopia                       2
The Beach House                                                2
Treehouse                                                      2
Secluded Oak Barn Retreat with Hot Tub & Pool!                 2
Countryside retreat, swimming pool, stunning views             2
Cliff Dweller:  Spend a night Suspended from the Ridgeline!    2
Name: title, dtype: int64

In [80]:
# Inspect the first cleaned description in full.
# Positional access (.iloc) is used because the cleaned frame keeps the raw
# index labels after rows were dropped, so a label lookup such as [0] only
# works while the row labelled 0 happens to survive cleaning.
tweaked_df.description.iloc[0]

"Escape to one of these two fabulous Tree Tents. Suspended high above the canopy, it’s time to appreciate life from a new perspective. Featured on George Clarke’s Amazing Spaces, these Tree Tents are a feat of aviation technology. Tree Tent comes complete with fire pit, outdoor kitchen and shower with hot water. You’ll discover a comfortable bed and cosy wood burning stove. Part of the Red Kite Estate, along with our barn and its sister tree tent, the first ever built in the UK, Dragon's Egg. The space The space\nThe true joy of this place is how wonderfully simple it is (aviation technology aside). Days are filled with fireside discussions, wildlife watching and stunningly beautiful walks. With the nearest mobile signal a ten minute walk away, it’s a great place to ditch the digital and truly escape. Head over the bridge to your own private deck that happily houses a clever outdoor-kitchen and shower (complete with hot water). It’s the perfect spot to fry up breakfast whilst basking i

In [81]:
# Inspect the first amenities value in full: it is still the string repr of a list.
# Positional access (.iloc) is used because the cleaned frame keeps the raw
# index labels after rows were dropped, so a label lookup such as [0] only
# works while the row labelled 0 happens to survive cleaning.
tweaked_df.amenities.iloc[0]

"['What this place offers', 'Bathroom', 'Shampoo', 'Bedroom and laundry', 'Essentials', 'Towels, bed sheets, soap and toilet paper', 'Heating and cooling', 'Indoor fireplace', 'Heating', 'Home safety', 'Smoke alarm', 'Carbon monoxide alarm', 'Fire extinguisher', 'First aid kit', 'Kitchen and dining', 'Kitchen', 'Space where guests can cook their own meals', 'Cooking basics', 'Pots and pans, oil, salt and pepper', 'Location features', 'Private entrance', 'Separate street or building entrance', 'Parking and facilities', 'Free parking on premises', 'Services', 'Long-term stays allowed', 'Allow stays of 28 days or more', 'Not included', 'Unavailable: Security cameras on property\\nSecurity cameras on property', 'Unavailable: Wifi\\nWifi', 'Unavailable: TV\\nTV', 'Unavailable: Washing machine\\nWashing machine', 'Unavailable: Air conditioning\\nAir conditioning', 'Unavailable: Hair dryer\\nHair dryer']"

In [82]:
# Columns that still contain missing values after cleaning, with their counts.
# An empty Series means nothing is missing. The per-column count is computed once.
tweaked_df.isna().sum().loc[lambda n_missing: n_missing > 0]

Series([], dtype: int64)

In [83]:
# Summary statistics for the numeric columns of the cleaned frame.
tweaked_df.describe()

Unnamed: 0,guests,beds,bathrooms,price_night,cleanliness_rate,accuracy_rate,communication_rate,location_rate,check-in_rate,value_rate,amenities_count,bedrooms
count,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0,830.0
mean,3.914458,2.422892,1.375301,154.172289,4.853735,4.906024,4.933614,4.904819,4.944578,4.769036,35.501205,1.649398
std,2.377246,1.841279,0.824712,129.082913,0.182012,0.129999,0.121769,0.120717,0.103908,0.174952,14.221304,1.060859
min,1.0,1.0,0.0,8.0,3.8,4.0,3.9,4.0,3.9,3.7,3.0,1.0
25%,2.0,1.0,1.0,83.25,4.8,4.9,4.9,4.9,4.9,4.7,25.0,1.0
50%,4.0,2.0,1.0,120.0,4.9,4.9,5.0,4.9,5.0,4.8,35.0,1.0
75%,5.0,3.0,1.5,176.75,5.0,5.0,5.0,5.0,5.0,4.9,44.0,2.0
max,16.0,17.0,10.0,1132.0,5.0,5.0,5.0,5.0,5.0,5.0,84.0,10.0


In [88]:
# Number of raw rows with no cleanliness rating.
data_df.Cleanliness_rate.isna().sum()

98

In [87]:
# Collect the label of every raw row that lacks at least one rating. When the
# size of this set matches the number of rows missing Cleanliness_rate alone,
# no row is missing another rating while still having a cleanliness score.
ratings_columns = [
    'Cleanliness_rate',
    'Accuracy_rate',
    'Communication_rate',
    'Location_rate',
    'Check-in_rate',
    'Value_rate',
]
# Row-wise mask: True where any of the rating columns is NaN.
missing_any_rating = data_df[ratings_columns].isna().any(axis=1)
isna_idxs = set(data_df.loc[missing_any_rating].index.to_list())

len(isna_idxs)

98