In [30]:
import pandas as pd
import re
import ast
from collections import Counter


In [20]:
_path = r"C:\Users\hodos\Documents\Uni\Uni-Year-3\Semester2\Data\cleaned_listings_amsterdam.csv"

In [18]:
def load_csv(path):
    """Load CSV file from the given path."""
    return pd.read_csv(path)

In [21]:
data = load_csv(_path)
df = pd.DataFrame(data)


<class 'str'>


In [31]:
# -------------------
# 1. Clean text
# -------------------
def clean_amenity(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()
    return text



df["amenities"] = df["amenities"].apply(ast.literal_eval)
df["cleaned_amenities"] = df["amenities"].apply(lambda lst: [clean_amenity(a) for a in lst])



In [32]:
df["cleaned_amenities"][0]

['coffee maker nespresso',
 'shampoo',
 'paid street parking off premises',
 'essentials',
 'waterfront',
 'shower gel',
 'mini fridge',
 'city skyline view',
 'fast wifi 245 mbps',
 'private entrance',
 'tv with standard cable',
 'luggage dropoff allowed',
 'outdoor furniture',
 'long term stays allowed',
 'self checkin',
 'private patio or balcony',
 'outdoor dining area',
 'dedicated workspace',
 'smoke alarm',
 'hp neutral eco friendly body soap',
 'refrigerator',
 'wine glasses',
 'breakfast',
 'harbor view',
 'canal view',
 'private backyard not fully fenced',
 'boat slip',
 'hot water',
 'garden view',
 'clothing storage closet',
 'books and reading material',
 'bed linens',
 'hangers',
 'smart lock',
 'portable fans',
 'central heating',
 'carbon monoxide alarm',
 'lake access',
 'hair dryer',
 'extra pillows and blankets',
 'private living room',
 'safe',
 'fire extinguisher',
 'laundromat nearby',
 'heating split type ductless system',
 'bikes',
 'air conditioning',
 'coffee'

In [33]:
# -------------------
# 2. Find most frequent words
# -------------------
word_counter = Counter()

for lst in df["cleaned_amenities"]:
    for amenity in lst:
        words = amenity.split()
        word_counter.update(words)

# Show the top N frequent words (skip common boring words)
skip_words = {"with", "and", "the", "in", "on", "of", "for", "to", "inch", "by"}
top_words = [(w, c) for w, c in word_counter.items() if w not in skip_words]
top_words = sorted(top_words, key=lambda x: x[1], reverse=True)

print("Top words found in amenities:")
for w, c in top_words[:20]:
    print(f"{w}: {c}")

Top words found in amenities:
alarm: 15211
hot: 13585
water: 13398
coffee: 11952
dryer: 11455
wifi: 10432
private: 9304
smoke: 9220
heating: 8997
kitchen: 8703
maker: 8175
dining: 8022
clothing: 7887
essentials: 7794
washer: 7717
dishes: 7573
silverware: 7572
refrigerator: 7469
bed: 7295
linens: 7295


In [34]:
# -------------------
# 3. Auto-generate mapping skeleton
# -------------------
# Here we assume each word itself is a candidate keyword for standardization
auto_map = {word: f"has_{word.replace(' ', '_')}" for word, _ in top_words[:20]}

print("\nAuto-generated mapping:")
for k, v in auto_map.items():
    print(f"'{k}': '{v}'")


Auto-generated mapping:
'alarm': 'has_alarm'
'hot': 'has_hot'
'water': 'has_water'
'coffee': 'has_coffee'
'dryer': 'has_dryer'
'wifi': 'has_wifi'
'private': 'has_private'
'smoke': 'has_smoke'
'heating': 'has_heating'
'kitchen': 'has_kitchen'
'maker': 'has_maker'
'dining': 'has_dining'
'clothing': 'has_clothing'
'essentials': 'has_essentials'
'washer': 'has_washer'
'dishes': 'has_dishes'
'silverware': 'has_silverware'
'refrigerator': 'has_refrigerator'
'bed': 'has_bed'
'linens': 'has_linens'


In [35]:
# -------------------
# 4. Apply mapping to standardize amenities
# -------------------
def map_auto(amenity_list, keyword_map):
    standardized = set()
    for amenity in amenity_list:
        for keyword, standard_name in keyword_map.items():
            if keyword in amenity:
                standardized.add(standard_name)
    return list(standardized)


In [39]:
df["standardized_amenities"] = df["cleaned_amenities"].apply(lambda lst: map_auto(lst, auto_map))

# -------------------
# 5. Create binary columns
# -------------------
all_standardized = sorted({amen for lst in df["standardized_amenities"] for amen in lst})

for amenity in all_standardized:
    df[amenity] = df["standardized_amenities"].apply(lambda lst: int(amenity in lst))

print("\nFinal DataFrame:")
print(df.head)


Final DataFrame:
<bound method NDFrame.head of                         id                                       listing_url  \
0                    27886                https://www.airbnb.com/rooms/27886   
1                    28871                https://www.airbnb.com/rooms/28871   
2                    29051                https://www.airbnb.com/rooms/29051   
3                    44391                https://www.airbnb.com/rooms/44391   
4                    47061                https://www.airbnb.com/rooms/47061   
...                    ...                                               ...   
10163  1437492870456891135  https://www.airbnb.com/rooms/1437492870456891135   
10164  1437585511300942515  https://www.airbnb.com/rooms/1437585511300942515   
10165  1437601466510408215  https://www.airbnb.com/rooms/1437601466510408215   
10166  1437660079651924451  https://www.airbnb.com/rooms/1437660079651924451   
10167  1438602403155395239  https://www.airbnb.com/rooms/143860240315539