In [1]:
import pandas as pd
import numpy as np
import collections

from mapboxgl.utils import df_to_geojson
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, MeanShift, estimate_bandwidth

In [2]:
def kmeans_cluster(cluster_df, cluster_num, write=True):
    """Cluster listings by location with k-means and summarise each cluster.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Must contain 'latitude', 'longitude', 'id', 'price' and 'room_type'
        columns (plus 'name' and 'number_of_reviews' when ``write`` is true).
        A 'cluster_num' column holding each row's label is added IN PLACE.
    cluster_num : int
        Number of clusters requested from ``KMeans``.
    write : bool, default True
        When true, write the cluster summary to
        'kmeans_cluster{cluster_num}.geojson' and the labelled listings to
        'kmeans_dataframe{cluster_num}.geojson' via ``df_to_geojson``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with the centre's 'latitude'/'longitude',
        'index_num' (the cluster label), 'num_listings', and the list-valued
        columns 'listings' (ids), 'prices' and 'roomtypes'.
    """
    latlon = cluster_df[['latitude', 'longitude']].to_numpy()

    # Fixed seed keeps the labelling reproducible between runs.
    kmeans = KMeans(n_clusters=cluster_num, random_state=0).fit(latlon)

    # Label the caller's frame in place; the geojson export below reads it.
    cluster_df['cluster_num'] = kmeans.labels_
    count_per_cluster = collections.Counter(kmeans.labels_)

    clusterDF = pd.DataFrame(kmeans.cluster_centers_,
                             columns=['latitude', 'longitude'])

    # Group by the bare column name, not a one-element list: with a list,
    # pandas >= 2.0 yields tuple keys such as (0,), which would not match the
    # integer labels used for the lookups below.
    cluster_info = {
        label: {"room_type": members['room_type'].tolist(),
                "price": members['price'].tolist(),
                "id": members['id'].tolist()}
        for label, members in cluster_df.groupby('cluster_num')
    }

    clusterDF['index_num'] = clusterDF.index
    clusterDF['num_listings'] = clusterDF['index_num'].apply(lambda x: count_per_cluster[x])
    clusterDF['listings'] = clusterDF['index_num'].apply(lambda x: cluster_info[x]['id'])
    clusterDF['prices'] = clusterDF['index_num'].apply(lambda x: cluster_info[x]['price'])
    clusterDF['roomtypes'] = clusterDF['index_num'].apply(lambda x: cluster_info[x]['room_type'])

    if write:
        cluster_filename = 'kmeans_cluster{}.geojson'.format(cluster_num)
        df_to_geojson(clusterDF, filename=cluster_filename,
                      properties=['index_num', "num_listings"],
                      lat='latitude', lon='longitude', precision=6)
        df_filename = 'kmeans_dataframe{}.geojson'.format(cluster_num)
        df_to_geojson(cluster_df, filename=df_filename,
                      properties=['cluster_num', "room_type", "price", "name", "number_of_reviews"],
                      lat='latitude', lon='longitude', precision=6)

        print("FILENAME: {}".format(df_filename))
    return clusterDF

In [3]:
def agglomerative_cluster(cluster_df, cluster_num, write=True):
    """Cluster listings by location agglomeratively and summarise each cluster.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Must contain 'latitude', 'longitude', 'id', 'price' and 'room_type'
        columns. A 'cluster_num' column holding each row's label is added
        IN PLACE.
    cluster_num : int
        Number of clusters requested from ``AgglomerativeClustering``.
    write : bool, default True
        When true, write the cluster summary to
        'agglo_cluster{cluster_num}.geojson' via ``df_to_geojson``.

    Returns
    -------
    pandas.DataFrame
        One row per cluster with 'latitude'/'longitude' (mean of the member
        points), 'index_num' (the cluster label), 'num_listings', and the
        list-valued columns 'listings' (ids), 'prices' and 'roomtypes'.
    """
    latlon = cluster_df[['latitude', 'longitude']].to_numpy()

    # Euclidean distance is the estimator's default. The former
    # `affinity="euclidean"` keyword is not passed because recent scikit-learn
    # releases no longer accept it (it was renamed to `metric`).
    agglo_cluster = AgglomerativeClustering(n_clusters=cluster_num).fit(latlon)
    labels = agglo_cluster.labels_

    number_of_clusters = len(np.unique(labels))
    print('number of clusters', number_of_clusters)

    # The estimator exposes no centres, so use the mean position of each
    # cluster's members.
    cluster_centers = np.array([latlon[labels == i].mean(axis=0)
                                for i in range(number_of_clusters)])

    # Label the caller's frame in place.
    cluster_df['cluster_num'] = labels
    count_per_cluster = collections.Counter(labels)

    clusterDF = pd.DataFrame(cluster_centers, columns=['latitude', 'longitude'])

    # Group by the bare column name, not a one-element list: with a list,
    # pandas >= 2.0 yields tuple keys such as (0,), which would not match the
    # integer labels used for the lookups below.
    cluster_info = {
        label: {"room_type": members['room_type'].tolist(),
                "price": members['price'].tolist(),
                "id": members['id'].tolist()}
        for label, members in cluster_df.groupby('cluster_num')
    }

    clusterDF['index_num'] = clusterDF.index
    clusterDF['num_listings'] = clusterDF['index_num'].apply(lambda x: count_per_cluster[x])
    clusterDF['listings'] = clusterDF['index_num'].apply(lambda x: cluster_info[x]['id'])
    clusterDF['prices'] = clusterDF['index_num'].apply(lambda x: cluster_info[x]['price'])
    clusterDF['roomtypes'] = clusterDF['index_num'].apply(lambda x: cluster_info[x]['room_type'])

    if write:
        filename = 'agglo_cluster{}.geojson'.format(cluster_num)

        df_to_geojson(clusterDF, filename=filename,
                      properties=['index_num', "num_listings"],
                      lat='latitude', lon='longitude', precision=6)

        print("FILENAME: {}".format(filename))
    return clusterDF

In [39]:
#DBSCAN
def dbscan_cluster(cluster_df, eps_dist, min_samples=1, write=True):
    """Label listings with DBSCAN clusters computed from their coordinates.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Must contain 'latitude' and 'longitude' columns (plus 'room_type',
        'price', 'name' and 'number_of_reviews' when ``write`` is true).
        A 'cluster_num' column holding each row's label is added IN PLACE.
    eps_dist : float
        Neighbourhood radius passed to ``DBSCAN`` as ``eps``; it is applied to
        the raw latitude/longitude values, so it is in those units.
    min_samples : int, default 1
        Passed to ``DBSCAN`` as ``min_samples``.
    write : bool, default True
        When true, write the labelled listings to
        'dbscan_dataframe{n}.geojson' via ``df_to_geojson``, where ``n`` is
        the number of clusters found.

    Returns
    -------
    pandas.DataFrame
        ``cluster_df`` itself, now carrying the 'cluster_num' column
        (-1 marks noise points).
    """
    latlon = cluster_df[['latitude', 'longitude']].to_numpy()

    db_cluster = DBSCAN(eps=eps_dist, min_samples=min_samples).fit(latlon)
    labels = db_cluster.labels_

    # Number of clusters in labels, ignoring the noise label (-1) if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('number of clusters', n_clusters_)

    cluster_df['cluster_num'] = labels

    if write:
        # DBSCAN has no requested cluster count, so the file is named after
        # the number of clusters actually found.
        df_filename = 'dbscan_dataframe{}.geojson'.format(n_clusters_)
        df_to_geojson(cluster_df, filename=df_filename,
                      properties=['cluster_num', "room_type", "price", "name", "number_of_reviews"],
                      lat='latitude', lon='longitude', precision=6)
        print("FILENAME: {}".format(df_filename))
    return cluster_df

In [103]:
# Mean-shift clustering of `latlon`, using a bandwidth estimated from the
# same points.
# NOTE(review): `latlon` is not defined at notebook level in any visible cell
# (it only appears as a local inside the *_cluster functions), so this cell
# relies on leftover kernel state and fails on a fresh kernel.
# TODO: build `latlon` explicitly before this cell.
bandwidth = estimate_bandwidth(latlon)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(latlon)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

In [124]:
def gaussianmix_cluster(cluster_df, cluster_num, write=True, random_state=0):
    """Cluster listings by location with a Gaussian mixture and summarise them.

    Parameters
    ----------
    cluster_df : pandas.DataFrame
        Must contain 'latitude', 'longitude', 'id', 'price' and 'room_type'
        columns. A 'cluster_num' column holding each row's predicted
        component is added IN PLACE.
    cluster_num : int
        Number of mixture components passed to ``GaussianMixture``.
    write : bool, default True
        When true, write the cluster summary to
        'gaussianmix_cluster{cluster_num}.geojson' via ``df_to_geojson``.
    random_state : int or None, default 0
        Seed forwarded to ``GaussianMixture`` so repeated runs give the same
        labelling; pass ``None`` for an unseeded fit.

    Returns
    -------
    pandas.DataFrame
        One row per component that received at least one listing, with the
        component mean as 'latitude'/'longitude', 'index_num' (the component
        label), 'num_listings', and the list-valued columns 'listings' (ids),
        'prices' and 'roomtypes'.
    """
    latlon = cluster_df[['latitude', 'longitude']].to_numpy()

    gmm = GaussianMixture(n_components=cluster_num,
                          random_state=random_state).fit(latlon)
    labels = gmm.predict(latlon)

    # A component can end up with no assigned points, so the labels present
    # are not necessarily 0..k-1. Index the means by the labels actually
    # used to keep each centre aligned with its own cluster.
    populated_labels = np.unique(labels)
    number_of_clusters = len(populated_labels)
    print('number of clusters', number_of_clusters)

    # Label the caller's frame in place.
    cluster_df['cluster_num'] = labels
    count_per_cluster = collections.Counter(labels)

    clusterDF = pd.DataFrame(gmm.means_[populated_labels],
                             columns=['latitude', 'longitude'])

    # Group by the bare column name, not a one-element list: with a list,
    # pandas >= 2.0 yields tuple keys such as (0,), which would not match the
    # integer labels used for the lookups below.
    cluster_info = {
        label: {"room_type": members['room_type'].tolist(),
                "price": members['price'].tolist(),
                "id": members['id'].tolist()}
        for label, members in cluster_df.groupby('cluster_num')
    }

    clusterDF['index_num'] = populated_labels
    clusterDF['num_listings'] = clusterDF['index_num'].apply(lambda x: count_per_cluster[x])
    clusterDF['listings'] = clusterDF['index_num'].apply(lambda x: cluster_info[x]['id'])
    clusterDF['prices'] = clusterDF['index_num'].apply(lambda x: cluster_info[x]['price'])
    clusterDF['roomtypes'] = clusterDF['index_num'].apply(lambda x: cluster_info[x]['room_type'])

    if write:
        filename = 'gaussianmix_cluster{}.geojson'.format(cluster_num)

        df_to_geojson(clusterDF, filename=filename,
                      properties=['index_num', "num_listings"],
                      lat='latitude', lon='longitude', precision=6)

        print("FILENAME: {}".format(filename))
    return clusterDF

In [40]:
# Load the listings CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact file exists. Consider a configurable data directory.
airbnb_df = pd.read_csv('/Users/fionachow/Documents/Personal/DataScience/athens/listings 2.csv', encoding='utf-8')

In [41]:
# Preview the first rows of the listings table.
airbnb_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,10595,https://www.airbnb.com/rooms/10595,20180516141048,2018-05-16,"96m2, 3BR, 2BA, Metro, WI-FI etc...",Athens Furnished Apartment No6 is 3-bedroom ap...,Athens Furnished Apartment No6 is an excellent...,Athens Furnished Apartment No6 is 3-bedroom ap...,none,Ampelokipi district is nice multinational and ...,...,f,,,t,f,strict_14_with_grace_period,f,f,8,0.18
1,10988,https://www.airbnb.com/rooms/10988,20180516141048,2018-05-16,"75m2, 2-br, metro, wi-fi, cable TV",Athens Furnished Apartment No4 is 2-bedroom ap...,Athens Furnished Apartment No4 is an excellent...,Athens Furnished Apartment No4 is 2-bedroom ap...,none,Ampelokipi district is nice multinational and ...,...,f,,,t,f,strict_14_with_grace_period,f,f,8,0.46
2,10990,https://www.airbnb.com/rooms/10990,20180516141048,2018-05-16,"50m2, Metro, WI-FI, cableTV, more",Athens Furnished Apartment No3 is 1-bedroom ap...,Athens Furnished Apartment No3 is an excellent...,Athens Furnished Apartment No3 is 1-bedroom ap...,none,Ampelokipi district is nice multinational and ...,...,f,,,t,f,strict_14_with_grace_period,f,f,8,0.35
3,10993,https://www.airbnb.com/rooms/10993,20180516141048,2018-05-16,"Studio, metro, cable tv, wi-fi, etc",The Studio is an -excellent located -close t...,"AQA No1 is an excellent located, close to metr...",The Studio is an -excellent located -close t...,none,Ampelokipi district is nice multinational and ...,...,f,,,t,f,strict_14_with_grace_period,f,f,8,0.57
4,10995,https://www.airbnb.com/rooms/10995,20180516141048,2018-05-16,"47m2, close to metro,cable TV,wi-fi",AQA No2 is 1-bedroom apartment (47m2) -excell...,"AQA No2 is an excellent located, close to metr...",AQA No2 is 1-bedroom apartment (47m2) -excell...,none,Ampelokipi district is nice multinational and ...,...,f,,,t,f,strict_14_with_grace_period,f,f,8,0.16


In [42]:
# List the column names available in the listings table.
airbnb_df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

In [12]:
# Load the attractions and exclude rows whose Type is 'Bar'.
attractions_df = pd.read_csv("athens_attractions.csv")
# .copy() makes the filtered frame independent of the original, so adding the
# 'travelday' column below does not trigger SettingWithCopyWarning.
attractions_df = attractions_df[attractions_df['Type']!='Bar'].copy()
attraction_latlon = attractions_df[['Lat', 'Long']].to_dict('split')['data']
# Split the attractions into one k-means cluster per travel day
# (fixed seed so the assignment is reproducible).
travel_days = 3
travel_kmeans = KMeans(n_clusters=travel_days, random_state=0).fit(attraction_latlon)
travel_cluster_centers = travel_kmeans.cluster_centers_
attractions_df['travelday'] = travel_kmeans.labels_

In [13]:
# Display the attractions together with their assigned 'travelday' label.
attractions_df

Unnamed: 0,Attraction,Lat,Long,Type,travelday
0,Acropolis/ Parthenon,37.971482,23.726844,Attraction,1
1,Odeon of Herodes Atticus,37.970767,23.724554,Attraction,1
2,Plaka,37.97357,23.728003,Venture,1
3,Panathenaic Stadium,37.968224,23.74117,Attraction,0
4,Exarchia neighbourhood,37.986334,23.736394,Venture,2
5,TEMPLE OF OLYMPIAN ZEUS,37.969241,23.733127,Attraction,0
6,Theater of Dionysus,37.970349,23.72778,Attraction,1
7,Arch of Hadrian,37.970172,23.731985,Attraction,0
8,Psiri�neighbourhood,37.978683,23.724656,Venture,1
9,Monastiraki Market,37.976258,23.725555,Attraction,1


In [11]:
# Inspect the k-means centres of the travel-day clusters.
# NOTE(review): the saved output lists 2 centres although the defining cell
# sets travel_days = 3, and this cell's execution count (In [11]) precedes
# that cell's (In [12]). The output looks stale; re-run after the fit.
travel_cluster_centers

array([[37.97349275, 23.735669  ],
       [37.97184575, 23.7257825 ]])

In [36]:
# Density-based grouping of the attraction coordinates.
db_cluster = DBSCAN(eps=0.005, min_samples=1)
db_cluster.fit(attraction_latlon)
labels = db_cluster.labels_
# Count the distinct labels, leaving out the noise label (-1) if it occurs.
n_clusters_ = len(set(labels) - {-1})
n_clusters_

3

In [37]:
# NOTE(review): incomplete expression. The attribute name after the dot is
# missing, so this cell is a SyntaxError as saved. TODO: restore the intended
# attribute (the saved output for this cell is 16) or delete the cell.
db_cluster.

16