In [1]:
!python -V

Python 3.9.5


# Predicting AirBnb Search Location Based on Traits
A very basic example of how to prepare and use the AirBnb dataset from [gumdropsteve/datasets](https://github.com/gumdropsteve/datasets).

In [None]:
import os

import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_parquet('https://github.com/gumdropsteve/datasets/raw/master/airbnb/las_vegas.parquet')

df['search_location'] = ' '.join([_.capitalize() for _ in 'las_vegas'.split('_')])

df.columns

Index(['url', 'title', 'type', 'location', 'guests', 'bedrooms', 'beds',
       'is_studio', 'baths', 'half_baths', 'shared_baths', 'price',
       'avg_rating', 'n_reviews', 'gym_bool', 'wifi_bool',
       'self_check_in_bool', 'air_conditioning_bool', 'pets_allowed_bool',
       'indoor_fireplace_bool', 'hot_tub_bool', 'free_parking_bool',
       'pool_bool', 'kitchen_bool', 'breakfast_bool', 'elevator_bool',
       'washer_bool', 'dryer_bool', 'heating_bool', 'waterfront_bool',
       'dishwasher_bool', 'beachfront_bool', 'ski_in_ski_out_bool', 'ds',
       'search_filter', 'terrace_bool', 'sonos_sound_system_bool',
       'bbq_grill_bool', 'hair_dryer_bool', 'chefs_kitchen_bool',
       'wet_bar_bool', 'sun_loungers_bool', 'home_theater_bool',
       'housekeeping_bool', 'gated_property_bool', 'gas_fireplace_bool',
       'plunge_pool_bool', 'infinity_pool_bool', 'sun_deck_bool',
       'game_room_bool', 'surround_sound_system_bool', 'resort_access_bool',
       'search_location'],

In [3]:
len(df), len(df.drop_duplicates()), len(df.drop_duplicates(subset='url'))

(83204, 81759, 2152)

In [4]:
location_files = [_ for _ in os.listdir() if '.parquet' in _ and 'las_vegas' not in _]

for file in location_files:
    
    print(' '.join([_.capitalize() for _ in file.replace('.parquet', '').split('_')]))
    
    new_location = pd.read_parquet(file)
    new_location['search_location'] = ' '.join([_.capitalize() for _ in file.replace('.parquet', '').split('_')])

    df = pd.concat([df, new_location])

# len(df)

Murcia
Florida Keys
Canary Wharf
Miami
Saudi Arabia
North Pole
Can Tho
Tbilisi
Paradise
Houston
Riyadh
Bella Vista
Seattle
Florida
Dammam
Henderson
Alaska
Louisiana
New Orleans
United Kingdom
Budapest
Georgia Country
Seoul
Tokyo
Texas
Coeur Dalene
Melbourne
St Thomas
Ho Chi Minh
Japan
Pennsylvania
California
Sydney
Baton Rouge
Virgin Islands
San Francisco
Little Rock
Honolulu
Anchorage
Australia
Jeddah
Arkansas
Boise
Vietnam
Barcelona
Bentonville
Pho Quoc Island
Hungary
United States
Dallas
Austin
Palma
San Diego
Shreveport
City Of London
Camden Town
London
Oakland
Idaho
Nevada
Reno
Philadelphia


In [5]:
len(df)

3002446

In [6]:
df = df.drop_duplicates(subset='url')

In [7]:
len(df), len(df.columns)

(92747, 53)

In [8]:
df = pd.concat([df, 
                pd.get_dummies(df['type']), 
                pd.get_dummies(df['location']),
                pd.get_dummies(df['search_filter'])
               ], axis=1)

df = df.drop(['type', 'location', 'search_filter'], axis=1)

df = df.drop(['url', 'title', 'ds'], axis=1)

In [9]:
len(df), len(df.columns)

(92747, 13146)

In [10]:
df = df.fillna(False)

In [12]:
x = df.drop(['search_location'], axis=1)
y = df['search_location']

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2)

In [14]:
knn = KNeighborsClassifier(n_neighbors=5)

In [15]:
knn.fit(x_train, y_train)

KNeighborsClassifier()

In [16]:
knn.score(x_test, y_test)

0.31859838274932617