# Carga y tratamiento de datos

In [None]:
import os
import os.path

import cudf
import cupy as cp
import cuml

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

%run ../utils/f_northing.py
%run ../utils/f_northing_numpy.py

# Cities whose listing CSVs are read from ../data/<city>/.
# Alternative selections, kept commented out for quick experiments:
#cities_to_use = ['sevilla']
#cities_to_use = ['shanghai']
#cities_to_use = ['amsterdam', 'antwerp', 'asheville', 'athens', 'austin', 'sevilla', 'shanghai'] #World 1
cities_to_use = [
    'amsterdam',
    'antwerp',
    'asheville',
    'athens',
    'austin',
    'bangkok',
    'sevilla',
    'shanghai',
]  # World 2

# Only these CSV columns are read; everything else in the files is ignored.
columns_to_use = [
    'id', 'host_id',
    'host_response_rate', 'host_acceptance_rate',
    'latitude', 'longitude',
    'accommodates', 'price',
    'number_of_reviews', 'reviews_per_month',
    'neighbourhood_cleansed',
]

In [None]:
%%time
# Build one GPU DataFrame (`listings`) from every CSV found under
# ../data/<city>/ for the cities in `cities_to_use`, reading only
# `columns_to_use`, then normalise dtypes and derive grid coordinates.
listings = cudf.DataFrame()

# Columns that are cleaned further down through the `.str` accessor or
# `factorize()`. They are forced to string dtype per file so frames from
# different cities can be concatenated even when a file's column was not
# parsed as text (presumably when the column is entirely empty — confirm).
text_columns = ('host_response_rate', 'host_acceptance_rate', 'neighbourhood_cleansed')

for city in cities_to_use:
    directory = os.path.join('..', 'data', city)
    if not os.path.isdir(directory):
        continue
    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order (and anything derived from it) reproducible between runs.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.csv'):
            continue
        temp_df = cudf.read_csv(os.path.join(directory, file_name), usecols=columns_to_use)
        if 'price' not in temp_df.columns:
            # Not a listings file: skip it.
            continue
        for column in text_columns:
            if temp_df[column].dtype != 'object':
                temp_df[column] = temp_df[column].astype('object')
        if listings.size == 0:
            listings = temp_df
            continue
        # Report (but do not abort on) dtype disagreements with what has
        # been accumulated so far.
        for column in listings.columns:
            if listings[column].dtype != temp_df[column].dtype:
                print('Found error: '+column+' type '+listings[column].dtype.name+' doesnt match '+temp_df[column].dtype.name)
        # DataFrame.append is deprecated/removed; concat is the supported API.
        listings = cudf.concat([listings, temp_df])

if listings.size == 0:
    # Fail early with a clear message instead of a KeyError on the first
    # column access below.
    raise FileNotFoundError('No listing rows were loaded for cities: ' + ', '.join(cities_to_use))

# reset_index() without drop=True keeps the previous row labels as an
# `index` column in the result.
listings = listings.drop_duplicates().reset_index()

listings['accommodates'] = listings['accommodates'].astype('int32')
listings['number_of_reviews'] = listings['number_of_reviews'].astype('int32')
# Missing values are replaced by the sentinel -1 in the three columns below.
listings['reviews_per_month'] = listings['reviews_per_month'].astype('float32').fillna(-1.0)
# Replace neighbourhood names by integer codes; the names are kept in
# `neighborhood_names` so the codes can be mapped back later.
listings['neighbourhood_cleansed'], neighborhood_names = listings['neighbourhood_cleansed'].factorize()

listings['host_response_rate'] = listings['host_response_rate'].str.replace('%', '').fillna('-1').astype('int8')
listings['host_acceptance_rate'] = listings['host_acceptance_rate'].str.replace('%', '').fillna('-1').astype('int8')
# NOTE(review): relies on cuDF accepting a list of patterns here — confirm
# that '$' is treated literally (not as a regex anchor) in the installed version.
listings['price'] = listings['price'].str.replace(['$', ','], '').astype('float32')

# Convert latitude/longitude to northing/easting grid coordinates on the GPU.
# `latlong2osgbgrid_cupy` is presumably defined by the %run'd ../utils scripts.
cupy_lat = cp.asarray(listings['latitude'])
cupy_long = cp.asarray(listings['longitude'])
n_cupy_array, e_cupy_array = latlong2osgbgrid_cupy(cupy_lat, cupy_long)
listings['northing'] = cudf.Series(n_cupy_array).astype('float32')
listings['easting'] = cudf.Series(e_cupy_array).astype('float32')

listings.head()

# Aplicación de un algoritmo DBSCAN para visualizar clusters de listados

In [None]:
%%time
# Cluster only the listings priced at 200 or more, using their grid
# coordinates (northing/easting) as the feature space.
is_expensive = listings['price'] >= 200.0
price_df = listings[is_expensive].reset_index()

dbscan = cuml.DBSCAN(eps=150)
coordinates = price_df[['northing', 'easting']]
price_df['cluster'] = dbscan.fit_predict(coordinates)

# Number of distinct cluster labels assigned (shown as the cell output).
price_df['cluster'].nunique()

# Visualización de resultados mediante cuXfilter

In [None]:
# Map each integer neighbourhood code back to its name.
neighborhood_map = dict(enumerate(neighborhood_names.values_host))
# NOTE(review): `cxf` is not imported in the visible cells — presumably
# `import cuxfilter as cxf`; confirm it is in scope before this cell runs.
cxf_data = cxf.DataFrame.from_dataframe(price_df)

chart_width = 600
# x is easting and y is northing, so height / width must equal the northing
# extent over the easting extent for the map to keep its proportions.
northing_extent = listings['northing'].max() - listings['northing'].min()
easting_extent = listings['easting'].max() - listings['easting'].min()
chart_height = int(northing_extent / easting_extent * chart_width)
scatter_chart = cxf.charts.datashader.scatter(x='easting', y='northing',
                                              width=chart_width,
                                              height=chart_height)

# Multi-select widget to filter the scatter plot by cluster label.
widget = cxf.charts.panel_widgets.multi_select('cluster')

In [None]:
# Assemble the dashboard from the scatter chart and the cluster selector,
# then serve it on localhost port 8789.
dashboard_charts = [scatter_chart, widget]
dashboard = cxf_data.dashboard(
    dashboard_charts,
    theme=cxf.themes.dark,
    data_size_widget=True,
)
dashboard.show('http://localhost', port=8789)

In [None]:
# Stop the dashboard that was started with dashboard.show().
dashboard.stop()