In [1]:
# Mount Google Drive at /content/drive so the paths used below are reachable
# (relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd 
import numpy as np
from numpy import inf
from geopy.distance import vincenty
import json
!pip install numpyencoder
from numpyencoder import NumpyEncoder

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting numpyencoder
  Downloading numpyencoder-0.3.0-py3-none-any.whl (3.0 kB)
Installing collected packages: numpyencoder
Successfully installed numpyencoder-0.3.0


In [11]:
# Base directory on the mounted Drive: listings.csv is read from here and host_info.csv is written here.
# Note that the value already ends with '/'.
data_path = '/content/drive/MyDrive/project/data/' 

## Preprocess data

In [24]:
def remove_outliers(df,cols, q1, q3):
    '''
    Drop the rows that hold an outlier in at least one of the given columns.

    A value counts as an outlier when it lies below Q1 - 1.5 * IQR or above
    Q3 + 1.5 * IQR, where Q1 / Q3 are the q1 / q3 quantiles of its column and
    IQR = Q3 - Q1.

    Inputs:
    - df: dataframe of the data
    - cols: list of the columns that are checked for outliers
    - q1: lower quantile (e.g. 0.25) used to build the lower fence
    - q3: upper quantile (e.g. 0.75) used to build the upper fence

    Output:
    The rows of df whose values in cols all lie inside the fences
    '''
    lower_quantile = df[cols].quantile(q1)
    upper_quantile = df[cols].quantile(q3)
    spread = upper_quantile - lower_quantile

    lower_fence = lower_quantile - 1.5 * spread
    upper_fence = upper_quantile + 1.5 * spread

    # Column-wise comparison; a row is discarded as soon as one column is out of range
    below = df[cols] < lower_fence
    above = df[cols] > upper_fence
    has_outlier = (below | above).any(axis=1)

    return df[~has_outlier]

In [25]:
def get_neigh_groupings(df):
    '''
    A function that creates the neighbourhood groupings based on the neighbourhood prices

    Neighbourhoods are ranked by their mean price (cheapest first) and cut into
    7 consecutive tiers: ranks 0-5 -> '1', 6-11 -> '2', 12-17 -> '3', 18-23 -> '4',
    24-29 -> '5', 30-36 -> '6', 37-43 -> '7'. Neighbourhoods ranked beyond the
    44th are not part of the returned dictionary.

    Input:
    df: The dataframe with the data. Needs to have a column named 'neighbourhood_cleansed' and one named 'price'

    Output:
    A dictionary with keys the neighbourhoods and value the group (as a string '1'..'7') they belong to
    '''
    # Select 'price' before aggregating so that text columns never reach the mean
    # (averaging every column fails on recent pandas versions and is wasted work anyway)
    mean_price = df.groupby('neighbourhood_cleansed')['price'].mean().sort_values()

    # Positional boundaries of the tiers: five tiers of 6 neighbourhoods, then two tiers of 7
    tier_bounds = [0, 6, 12, 18, 24, 30, 37, 44]

    neigh_group = {}
    for tier, (start, stop) in enumerate(zip(tier_bounds[:-1], tier_bounds[1:]), start=1):
        neigh_group.update(dict.fromkeys(mean_price.index[start:stop], str(tier)))
    return neigh_group


In [34]:
def _count_listed_items(value):
    '''
    Count the entries of a list that is stored as text, e.g. "['email', 'phone']" -> 2.
    Missing values and empty lists ("[]") count as 0.
    '''
    if isinstance(value, (list, tuple, set)):
        return len(value)
    if pd.isnull(value):
        return 0
    tokens = str(value).strip().strip('[]{}').split(',')
    # Ignore empty tokens so that "[]" is counted as 0 and not as 1
    return sum(1 for token in tokens if token.strip(' \'"'))


def missing_values_n_encoding(df, neigh_group):
    '''
    Function that fills missing values, encodes features and creates new features

    Inputs:
    - df: the dataframe to process. Columns read: 'host_is_superhost', 'host_about', 'license',
      'host_verifications', 'host_response_rate', 'host_response_time', 'instant_bookable',
      'has_availability', 'bathrooms_text', 'latitude', 'longitude', 'amenities',
      'neighbourhood_cleansed'
    - neigh_group: dictionary that maps a neighbourhood to its group

    Outputs:
    A new dataframe (the input is left untouched) where
    - rows without a 't'/'f' value in 'host_is_superhost' are removed
    - 't'/'f' flags are encoded as 1/0, 'host_about' and 'license' as 1 (present) / 0 (missing)
    - 'host_verifications' and 'amenities_number' hold the number of listed items
    - 'host_response_rate' is bucketed into '0-49%', '50-89%', '90-99%', '100%' or 'unknown'
    - 'shared_bath', 'bathrooms', 'distance_parthenon' and 'neighbourhood_cleansed_group' are added
    - 'bathrooms_text' and 'amenities' are dropped
    '''
    # Work on a copy: the caller's frame stays as it was
    df = df.copy()

    # Host feature handling
    df['host_is_superhost'] = df['host_is_superhost'].map({'t':1, 'f':0})
    df[['host_about', 'license']] = df[['host_about', 'license']].notna().astype(int)
    # Count the listed verifications (len() on the raw text would count characters)
    df['host_verifications'] = df['host_verifications'].apply(_count_listed_items)
    # Explicit copy so the assignments below do not write into a slice of the frame above
    df = df[df['host_is_superhost'].notna()].copy()

    # '95%' -> 95.0; anything that is not a number becomes NaN and ends up as 'unknown'
    response_rate = pd.to_numeric(df['host_response_rate'].astype(str).str.rstrip('%'), errors='coerce')
    # Left-closed bins so that the buckets match their labels: [0, 50), [50, 90), [90, 100), [100, inf)
    rate_bucket = pd.cut(response_rate,
                         bins=[0, 50, 90, 100, float('inf')],
                         labels=['0-49%', '50-89%', '90-99%', '100%'],
                         right=False)
    df['host_response_rate'] = rate_bucket.cat.add_categories('unknown').fillna('unknown').astype(object)
    df['host_response_time'] = df['host_response_time'].fillna("unknown")


    # Listing features handling
    df['instant_bookable'] = df['instant_bookable'].map({'t':1, 'f':0})
    df['has_availability'] = df['has_availability'].map({'t':1, 'f':0})
    # Lower-case first so that 'Shared half-bath' is recognised as shared as well
    df['shared_bath'] = df['bathrooms_text'].apply(lambda s: 1 if 'shared' in str(s).lower().split(' ') else 0)
    # The text starts with the number of bathrooms ('1.5 baths'), except for the '...half-bath' variants
    df['bathrooms'] = df['bathrooms_text'].apply(lambda s: float(0.5) if 'half-bath' in str(s).lower() else float(str(s).split(' ')[0]))
    df['bathrooms'] = df['bathrooms'].fillna(0)


    # New features creation
    # Reference point (latitude, longitude) for the 'distance_parthenon' feature
    center = (37.9715, 23.7257)
    df['distance_parthenon'] = [vincenty((lat, lon), center).km
                                for lat, lon in zip(df['latitude'], df['longitude'])]
    df['amenities_number'] = df['amenities'].apply(_count_listed_items)
    df['neighbourhood_cleansed_group'] = df['neighbourhood_cleansed'].map(neigh_group)

    df = df.drop(columns=['bathrooms_text', 'amenities'])

    return df

In [35]:
def remove_outliers_per_room_type(df):
    '''
    Remove the outliers of 'price', 'maximum_nights' and 'minimum_nights', treating every room type separately

    The quartiles (0.25 / 0.75) are computed per room type, for 'Shared room', 'Private room',
    'Hotel room' and 'Entire home/apt'. Rows with any other value in 'room_type' are not
    part of the result.

    Input:
    - Data frame to be processed, needs the columns 'room_type', 'price', 'maximum_nights', 'minimum_nights'

    Output:
    - The cleaned subsets stacked in the order shared, private, hotel, entire home
    '''
    checked_cols = ['price', 'maximum_nights', 'minimum_nights']
    room_types = ['Shared room', 'Private room', 'Hotel room', 'Entire home/apt']

    cleaned_subsets = [
        remove_outliers(df[df['room_type'] == room_type], cols=checked_cols, q1=0.25, q3=0.75)
        for room_type in room_types
    ]

    return pd.concat(cleaned_subsets, axis=0)

In [29]:
def handle_amenities(df):
    '''
    Function that handles the 'amenities' feature. Checks if the amenities text of a listing
    contains specific strings and, if yes, puts a 1 in the column of that amenity type (0 otherwise)

    All patterns are matched case-insensitively; a missing amenities value matches nothing.

    Input:
    - The dataframe to be processed, needs to have a column named 'amenities'. It is not modified.

    Output:
    A new dataframe without the 'amenities' column and with 27 new 0/1 columns (stored as floats),
    one for each type of amenity identified. Other columns of the input are passed through unchanged.
    '''
    # new column -> regular expression searched for in the amenities text
    amenity_patterns = {
        'kitchen': 'kitchen',
        'air_conditioning': 'Air conditioning|Central air conditioning',
        'high_end_electronics': 'Amazon Echo|Apple TV|Game console|Netflix|Projector and screen|Smart TV',
        'bbq': 'BBQ grill|Fire pit|Propane barbeque',
        'balcony': 'Balcony|Patio',
        'nature_and_views': 'Beach view|Beachfront|Lake access|Mountain view|Ski-in/Ski-out|Waterfront',
        'bed_linen': 'Bed linens',
        'breakfast': 'Breakfast',
        'tv': 'TV',
        'coffee_machine': 'Coffee maker|Espresso machine',
        'cooking_basics': 'Cooking basics',
        'elevator': 'Elevator',
        'gym': 'Exercise equipment|Gym|gym',
        'child_friendly': 'Family/kid friendly|Children|children',
        'parking': 'parking',
        'outdoor_space': 'Garden|Outdoor|Sun loungers|Terrace',
        'host_greeting': 'Host greets you',
        'hot_tub_sauna_or_pool': 'Hot tub|Jetted tub|hot tub|Sauna|Pool|pool',
        'internet': 'Internet|Pocket wifi|Wifi',
        'long_term_stays': 'Long term stays allowed',
        # '(s)' has to be escaped to match the literal text 'Cat(s)' / 'Dog(s)';
        # the plain plural ('Cats' / 'Dogs') keeps matching as well
        'pets_allowed': r'pet|Cat(?:s|\(s\))|Dog(?:s|\(s\))',
        'private_entrance': 'Private entrance',
        'secure': 'Safe|Security system',
        'self_check_in': 'Self check-in',
        'smoking_allowed': 'Smoking allowed',
        'accessible': 'Step-free access|Wheelchair|Accessible',
        'event_suitable': 'Suitable for events',
    }

    amenities = df['amenities']
    # drop() returns a new frame, so the caller's data is never written to
    flags = df.drop(columns=['amenities'])
    for column, pattern in amenity_patterns.items():
        # na=False: a listing without amenities text gets 0 instead of breaking the mask
        found = amenities.str.contains(pattern, case=False, na=False)
        flags[column] = found.astype(float)

    return flags

In [36]:
import json 
def preprocess(df):
    '''
    Function that combines all the preprocessing steps. It makes the price float, removes outliers based on room types, 
    handles amenities, fills the missing values, encodes some features and creates the new features

    Input:
    - The dataframe to be preprocessed (it is not modified)

    Output:
    - The preprocessed dataframe
    '''
    # Copy first so the caller's frame keeps its original price strings and the cell can be re-run
    df = df.copy()
    # Strip '$' and thousands separators. Raw string: '\$' is a regex escape, not a Python escape sequence
    df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)
    df = remove_outliers_per_room_type(df)
    # Hand over an independent one-column frame; the flag columns are joined back on the index
    amenities = handle_amenities(df[['amenities']].copy())
    df = pd.concat([df,amenities], axis=1)
    neigh_mappings = get_neigh_groupings(df)
    df = missing_values_n_encoding(df,neigh_mappings)
    return df

In [None]:
# Load the raw listings and discard the columns that are not used further on
data = pd.read_csv(data_path + '/listings.csv').drop(columns=[
    'listing_url', 'scrape_id', 'last_scraped', 'name', 'description', 'picture_url', 'host_url', 'host_name',
    'host_location', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'neighbourhood',
    'host_total_listings_count',
    'calendar_updated', 'bathrooms', 'first_review', 'last_review', 'calendar_last_scraped',
    'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms', 'host_acceptance_rate', 'neighbourhood_group_cleansed',
    'availability_60', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
    'host_listings_count', 'host_since', 'host_id', 'id', 'availability_30', 'availability_365',
    'reviews_per_month',
])
# Run the preprocessing pipeline, then drop three more columns from its result
data_proc = preprocess(data).drop(columns=['neighborhood_overview', 'bedrooms', 'beds'])

In [38]:
# Summary statistics of the price column after preprocessing
data_proc['price'].describe()

count    8000.000000
mean       57.690875
std        30.175787
min         9.000000
25%        36.000000
50%        50.000000
75%        71.000000
max       275.000000
Name: price, dtype: float64

## Mean Price per Neighbourhood - BarChart




In [45]:
# Mean price per neighbourhood, cheapest first.
# 'price' is selected before aggregating so that the text columns of data_proc never reach the mean
# (averaging every column fails on recent pandas versions).
groups_neigh = data_proc.groupby('neighbourhood_cleansed')['price'].mean()
groups_neigh = groups_neigh.sort_values()

In [46]:
# Cut the price-sorted neighbourhoods into 7 consecutive tiers (five of 6 entries, then two of 7)
(groups_neigh_1, groups_neigh_2, groups_neigh_3, groups_neigh_4,
 groups_neigh_5, groups_neigh_6, groups_neigh_7) = (
    groups_neigh.iloc[start:stop]
    for start, stop in [(0, 6), (6, 12), (12, 18), (18, 24), (24, 30), (30, 37), (37, 44)]
)

In [47]:
# One dictionary per tier: every neighbourhood of the tier -> tier label '1'..'7'
d1, d2, d3, d4, d5, d6, d7 = (
    dict.fromkeys(tier.index, str(label))
    for label, tier in enumerate(
        [groups_neigh_1, groups_neigh_2, groups_neigh_3, groups_neigh_4,
         groups_neigh_5, groups_neigh_6, groups_neigh_7],
        start=1,
    )
)
# Merged lookup: neighbourhood -> tier label
d = {
    neighbourhood: label
    for tier_lookup in (d1, d2, d3, d4, d5, d6, d7)
    for neighbourhood, label in tier_lookup.items()
}

In [48]:
# Label every listing with the tier ('1'..'7') of its neighbourhood;
# neighbourhoods that are not a key of d get NaN
data_proc['neigh_group'] = data_proc['neighbourhood_cleansed'].map(d)

In [49]:
# Mean price per neighbourhood tier, cheapest first.
# 'price' is selected before aggregating so that the text columns of data_proc never reach the mean.
groups_neigh_group = data_proc.groupby('neigh_group')['price'].mean()
groups_neigh_group = groups_neigh_group.sort_values()

In [50]:
# Column-wise data for the chart: rounded mean price and the neighbourhood it belongs to
d_barChart_all_neigh = dict(
    price=groups_neigh.values.round(0),
    neighbourhood=groups_neigh.index,
)
# Transpose the columns into one {'price': ..., 'neighbourhood': ...} record per neighbourhood
barChart_all_neigh = [
    dict(zip(d_barChart_all_neigh, record))
    for record in zip(*d_barChart_all_neigh.values())
]

In [51]:
# Column-wise data for the tree map: neighbourhood and its mean price rounded to 2 decimals
d_treeMap_all_neigh = dict(
    neighbourhood=groups_neigh.index,
    price=groups_neigh.values.round(2),
)
# Transpose the columns into one {'neighbourhood': ..., 'price': ...} record per neighbourhood
treeMap_all_neigh = [
    dict(zip(d_treeMap_all_neigh, record))
    for record in zip(*d_treeMap_all_neigh.values())
]

In [52]:
# Column-wise data for the chart: rounded mean price and a readable label per neighbourhood tier
d_barChart_neigh_groups = dict(
    price=groups_neigh_group.values.round(0),
    group='Neighbourhood group ' + groups_neigh_group.index,
)
# Transpose the columns into one {'price': ..., 'group': ...} record per tier
barChart_neigh_groups = [
    dict(zip(d_barChart_neigh_groups, record))
    for record in zip(*d_barChart_neigh_groups.values())
]

In [53]:
# Column-wise data for the tree map: a readable label per neighbourhood tier and its rounded mean price
d_treeMap_neigh_groups = dict(
    name='Group ' + groups_neigh_group.index,
    price=groups_neigh_group.values.round(0),
)
# Transpose the columns into one {'name': ..., 'price': ...} record per tier
treeMap_neigh_groups = [
    dict(zip(d_treeMap_neigh_groups, record))
    for record in zip(*d_treeMap_neigh_groups.values())
]

## Mean price of Airbnbs accommodating different number of guests - BarChart


In [54]:
# Mean price per value of 'accommodates', cheapest first.
# 'price' is selected before aggregating so that the text columns of data_proc never reach the mean.
groups_accom = data_proc.groupby('accommodates')['price'].mean()
groups_accom = groups_accom.sort_values()

In [55]:
# Column-wise data for the chart: value of 'accommodates' and the rounded mean price of those listings
d_barChart_accom = dict(
    accommodates=groups_accom.index,
    price=groups_accom.values.round(0),
)
# Transpose the columns into one {'accommodates': ..., 'price': ...} record per value
barChart_acom = [
    dict(zip(d_barChart_accom, record))
    for record in zip(*d_barChart_accom.values())
]

## Prices - lineChart, AreaChart


In [56]:
# Number of listings per distinct price, ordered by ascending price
groups_prices = data_proc['price'].value_counts().sort_index()

In [58]:
# Column-wise data for the chart: each distinct price and the number of listings that have it
d_barChart_prices = dict(
    price=groups_prices.index,
    num=groups_prices.values,
)
# Transpose the columns into one {'price': ..., 'num': ...} record per distinct price
barChart_prices = [
    dict(zip(d_barChart_prices, record))
    for record in zip(*d_barChart_prices.values())
]

## Room type, Price - Pie

In [60]:
# Mean price per room type (2 decimals), cheapest first.
# 'price' is selected before aggregating so that the text columns of data_proc never reach the mean.
groups_room_type = data_proc.groupby('room_type')['price'].mean().round(2)
groups_room_type = groups_room_type.sort_values()

In [62]:
# Column-wise data for the pie chart: room type and its mean price
d_pieChart_room_type = dict(
    type=groups_room_type.index,
    price=groups_room_type.values,
)

# Transpose the columns into one {'type': ..., 'price': ...} record per room type
pieChart_room_type = [
    dict(zip(d_pieChart_room_type, record))
    for record in zip(*d_pieChart_room_type.values())
]

In [63]:
# Number of listings per room type (value_counts orders the result by descending count)
room_type_count = data_proc['room_type'].value_counts()

In [64]:
# Slice order of the exported pie chart. The counts are looked up in room_type_count instead of
# being typed in by hand, so the export follows the data; a room type without listings gets 0.
# NOTE(review): the previously hard-coded counts (52 / 7105 / 728 / 115) sum to the 8000 rows of
# data_proc, so this should reproduce them for the current dataset - confirm after a full run.
pie_room_type_order = ['Shared room', 'Entire home/apt', 'Private room', 'Hotel room']
d_pieChart_room_type_count = {
   'type': pie_room_type_order,
   # tolist() turns the numpy integers into plain Python ints
   'count': room_type_count.reindex(pie_room_type_order, fill_value=0).values.tolist()
}
# Transpose the columns into one {'type': ..., 'count': ...} record per room type
pieChart_room_type_count = [
    dict(zip(d_pieChart_room_type_count, record))
    for record in zip(*d_pieChart_room_type_count.values())
]

## Export stats

In [65]:
# Everything that is exported, keyed by the chart the records belong to
jsonDict = dict(
    barChart_all_neigh=barChart_all_neigh,
    barChart_groups=barChart_neigh_groups,
    treeMap_all_neigh=treeMap_all_neigh,
    treeMap_groups=treeMap_neigh_groups,
    barChart_acom=barChart_acom,
    barChart_prices=barChart_prices,
    pieChart_room_type=pieChart_room_type,
    pieChart_room_type_count=pieChart_room_type_count,
)

In [66]:
# Write the chart data as UTF-8 JSON; NumpyEncoder is passed so numpy values can be serialised
with open(file='/content/drive/MyDrive/project/json_test.json', mode='w', encoding='utf-8') as file:
    json.dump(obj=jsonDict, fp=file, cls=NumpyEncoder, ensure_ascii=False)

## Export Host info

In [67]:
# Re-read the raw listings: columns selected for the host export (e.g. host_id) were dropped from `data`
df = pd.read_csv(data_path + '/listings.csv')

In [68]:
# Keep only the host-related columns of the raw listings
df_hosts = df.loc[:, [
    'host_id',
    'host_about',
    'host_response_time',
    'host_response_rate',
    'host_is_superhost',
    'host_verifications',
    'calculated_host_listings_count',
    'host_identity_verified',
    'host_has_profile_pic',
]]

In [71]:
# Keep one row per distinct combination of the selected host columns
df_hosts = df_hosts.drop_duplicates()

In [72]:
# Save the host table into the data directory; with the to_csv defaults the DataFrame index
# is written as the first CSV column
df_hosts.to_csv(data_path + '/host_info.csv')  