# Sección 1: K-means mediante GPU

## Carga y tratamiento de datos

In [None]:
import os
import os.path

import cudf
import cupy as cp
import cuml

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

%run ../utils/f_northing.py
%run ../utils/f_northing_numpy.py
%run ../utils/f_static_data.py
%run ../utils/f_utils.py

# Cities whose ../data/<city>/ folders are loaded by the cells below.
# Alternative selections (uncomment exactly one to switch):
#   cities_to_use = ['shanghai']
#   cities_to_use = cities_to_use_1()
#   cities_to_use = cities_to_use_2()
cities_to_use = ["sevilla"]

# Subset of CSV columns requested from every listings file (passed as
# `usecols` to the CSV readers).
columns_to_use = [
    "host_id",
    "host_response_rate",
    "host_acceptance_rate",
    "latitude",
    "longitude",
    "accommodates",
    "price",
    "number_of_reviews",
    "reviews_per_month",
    "neighbourhood_cleansed",
]

In [None]:
%%time
# Build a single GPU DataFrame from every CSV file found under
# ../data/<city>/ for the selected cities.
listings = cudf.DataFrame()

for city in cities_to_use:
    directory = '../data/' + city + '/'
    if not os.path.exists(directory):
        # Cities without a data directory are skipped.
        continue
    for file in os.listdir(directory):
        if not file.endswith('.csv'):
            continue
        temp_df = cudf.read_csv(directory + file, usecols=columns_to_use)
        # Helper loaded by the %run lines at the top of the notebook; its
        # return value is not used, so it presumably normalises the dtype of
        # the listed columns in place -- TODO confirm.
        standard_object_type(temp_df, ['host_acceptance_rate', 'neighbourhood_cleansed'])
        if listings.size == 0:
            # First non-empty file: use it directly as the accumulator.
            listings = temp_df
            continue
        # Report (without aborting) any column whose dtype differs from the
        # data accumulated so far.
        for column in listings.columns:
            if listings[column].dtype != temp_df[column].dtype:
                print('Found error: '+column+' type '+listings[column].dtype.name+' doesnt match '+temp_df[column].dtype.name)
        # DataFrame.append was deprecated and later removed from cuDF;
        # cudf.concat is the supported equivalent and, like append, keeps the
        # original row indices.
        listings = cudf.concat([listings, temp_df])

# Remove rows that appear in more than one file and renumber the index.
listings = listings.drop_duplicates().reset_index(drop=True)

# Cleaning helpers from the notebook's utility scripts. Their return values
# are ignored here, so they are presumably in-place -- verify in ../utils.
type_conversion(listings, ['host_id', 'accommodates', 'number_of_reviews', 'reviews_per_month'])
column_factorize(listings, ['neighbourhood_cleansed'])

clean_format_strings(listings, ['host_response_rate', 'host_acceptance_rate'])
clean_format_price(listings, ['price'])

# Convert latitude/longitude into northing/easting on the GPU (presumably
# OSGB grid coordinates, judging by the helper name) and store them as
# float32 columns.
cupy_lat = cp.asarray(listings['latitude'])
cupy_long = cp.asarray(listings['longitude'])
n_cupy_array, e_cupy_array = latlong2osgbgrid_cupy(cupy_lat, cupy_long)
listings['northing'] = cudf.Series(n_cupy_array).astype('float32')
listings['easting'] = cudf.Series(e_cupy_array).astype('float32')

# Last expression of the cell: preview the first rows.
listings.head()

## Aplicación de un algoritmo k-means para visualizar clusters de listados

In [None]:
%%time
# Fit a 5-cluster k-means model on the GPU using the easting/northing columns.
# fit() returns the estimator itself, so construction and fitting are chained.
km = cuml.KMeans(n_clusters=5).fit(listings[['easting', 'northing']])

# Tag every row with the cluster label assigned by the fitted model.
listings['kmeans'] = km.labels_

# Last expression of the cell: display the fitted centroids.
km.cluster_centers_

# Sección 2: K-means mediante CPU

## Carga y tratamiento de datos

In [None]:
%%time
# Build a single pandas DataFrame from every CSV file found under
# ../data/<city>/ for the selected cities (CPU counterpart of the GPU load).
listings_cpu = pd.DataFrame()

for city in cities_to_use:
    directory = '../data/' + city + '/'
    if not os.path.exists(directory):
        # Cities without a data directory are skipped.
        continue
    for file in os.listdir(directory):
        if not file.endswith('.csv'):
            continue
        temp_df_cpu = pd.read_csv(directory + file, usecols=columns_to_use)
        # Helper loaded by the %run lines at the top of the notebook; its
        # return value is not used, so it presumably normalises the dtype of
        # the listed columns in place -- TODO confirm.
        standard_object_type(temp_df_cpu, ['host_acceptance_rate', 'neighbourhood_cleansed'])
        if listings_cpu.size == 0:
            # First non-empty file: use it directly as the accumulator.
            listings_cpu = temp_df_cpu
            continue
        # Report (without aborting) any column whose dtype differs from the
        # data accumulated so far.
        for column in listings_cpu.columns:
            if listings_cpu[column].dtype != temp_df_cpu[column].dtype:
                print('Found error: '+column+' type '+listings_cpu[column].dtype.name+' doesnt match '+temp_df_cpu[column].dtype.name)
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent and, like append, keeps the original row
        # indices.
        listings_cpu = pd.concat([listings_cpu, temp_df_cpu])

# Remove rows that appear in more than one file and renumber the index.
listings_cpu = listings_cpu.drop_duplicates().reset_index(drop=True)

# Cleaning helpers from the notebook's utility scripts. Their return values
# are ignored here, so they are presumably in-place -- verify in ../utils.
type_conversion(listings_cpu, ['host_id', 'accommodates', 'number_of_reviews', 'reviews_per_month'])
column_factorize(listings_cpu, ['neighbourhood_cleansed'])

clean_format_strings(listings_cpu, ['host_response_rate', 'host_acceptance_rate'])
clean_format_price_cpu(listings_cpu, ['price'])

# Convert latitude/longitude into northing/easting with NumPy (presumably
# OSGB grid coordinates, judging by the helper name) and store them as
# float32 columns.
numpy_lat = listings_cpu['latitude'].to_numpy()
numpy_long = listings_cpu['longitude'].to_numpy()
n_numpy_array, e_numpy_array = latlong2osgbgrid_numpy(numpy_lat, numpy_long)
listings_cpu['northing'] = pd.Series(n_numpy_array).astype('float32')
listings_cpu['easting'] = pd.Series(e_numpy_array).astype('float32')

# Last expression of the cell: preview the first rows.
listings_cpu.head()

## Aplicación de algoritmo k-means

In [None]:
%%time
# Fit a 5-cluster scikit-learn k-means model on the easting/northing columns.
# fit() returns the estimator itself, so construction and fitting are chained.
km_cpu = KMeans(n_clusters=5).fit(listings_cpu[['easting', 'northing']])

# Tag every row with the cluster label assigned by the fitted model.
listings_cpu['kmeans'] = km_cpu.labels_

# Last expression of the cell: display the fitted centroids.
km_cpu.cluster_centers_

In [None]:
%reset -f

# Sección 3: Visualización de resultados mediante cuXfilter