# Sección 1: Regresión por GPU

## Carga y tratamiento de datos

In [None]:
import os
import os.path

import cudf
import cupy as cp
import cuml

import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Execute the shared helper scripts inside the notebook namespace. Names used
# later that are not defined in this notebook (cities_to_use, columns_to_use,
# columns_to_fit, priceRange, latlong2osgbgrid_cupy, latlong2osgbgrid_numpy)
# are presumably provided by these scripts — TODO confirm.
%run ../utils/f_northing.py
%run ../utils/f_northing_numpy.py
%run ../utils/f_price_range.py
%run ../utils/f_static_data.py

# NOTE(review): each assignment below rebinds the helper's own name to the
# value it returns, so the helper is no longer callable under that name until
# the %run lines above are executed again.
#cities_to_use = ['sevilla']
#cities_to_use = ['shanghai']
cities_to_use = cities_to_use()
columns_to_use = columns_to_use()
#columns_to_fit = ['northing', 'easting']
columns_to_fit = columns_to_fit()

# Request cuDF objects as the output type of cuML calls for the whole notebook.
cuml.set_global_output_type('cudf')

In [None]:
%%time
listings = cudf.DataFrame()

for city in cities_to_use:
    directory = '../data/' + city + '/'
    if os.path.exists(directory):
        for file in os.listdir(directory):
            if file.endswith('.csv'):
                temp_df = cudf.read_csv(directory + file, usecols = columns_to_use)
                if(temp_df['host_acceptance_rate'].dtype != 'object'):
                    temp_df['host_acceptance_rate'] = temp_df['host_acceptance_rate'].astype('object')
                if(temp_df['neighbourhood_cleansed'].dtype != 'object'):
                    temp_df['neighbourhood_cleansed'] = temp_df['neighbourhood_cleansed'].astype('object')
                if(temp_df['neighbourhood'].dtype != 'object'):
                    temp_df['neighbourhood'] = temp_df['neighbourhood'].astype('object')
                if(temp_df['host_total_listings_count'].dtype != 'int32'):
                    temp_df['host_total_listings_count'] = temp_df['host_total_listings_count'].astype('int32').fillna(-1)
                if(temp_df['bathrooms'].dtype != 'int32'):
                    temp_df['bathrooms'] = temp_df['bathrooms'].astype('int32').fillna(-1)
                if(temp_df['bedrooms'].dtype != 'int32'):
                    temp_df['bedrooms'] = temp_df['bedrooms'].astype('int32').fillna(-1)
                if(temp_df['beds'].dtype != 'int32'):
                    temp_df['beds'] = temp_df['beds'].astype('int32').fillna(-1)
                if listings.size == 0:
                    listings = temp_df
                else:
                    for column in listings.columns:
                        if listings[column].dtype != temp_df[column].dtype:
                            print('Found error: '+column+' type '+listings[column].dtype.name+' doesnt match '+temp_df[column].dtype.name)
                    listings = listings.append(temp_df)
                    
listings = listings.drop_duplicates().reset_index(drop=True)

listings['accommodates'] = listings['accommodates'].astype('int32').fillna(-1)
listings['number_of_reviews'] = listings['number_of_reviews'].astype('int32').fillna(-1)
listings['reviews_per_month'] = listings['reviews_per_month'].astype('float32').fillna(-1.0)

listings['neighbourhood'], neighborhood_names = listings['neighbourhood'].factorize()
listings['neighbourhood_cleansed'], neighborhood_names = listings['neighbourhood_cleansed'].factorize()
listings['host_response_time'], host_response_time = listings['host_response_time'].factorize()
listings['host_is_superhost'], host_is_superhost = listings['host_is_superhost'].factorize()
listings['host_has_profile_pic'], host_has_profile_pic = listings['host_has_profile_pic'].factorize()
listings['host_identity_verified'], host_identity_verified = listings['host_identity_verified'].factorize()
listings['property_type'], property_type = listings['property_type'].factorize()
listings['room_type'], room_type = listings['room_type'].factorize()

listings['host_response_rate'] = listings['host_response_rate'].str.replace('%', '').fillna('-1').astype('int8')
listings['host_acceptance_rate'] = listings['host_acceptance_rate'].str.replace('%', '').fillna('-1').astype('int8')
listings['price'] = listings['price'].str.replace(['$', ','], '').astype('float32').applymap(priceRange, 'float32')

cupy_lat = cp.asarray(listings['latitude'])
cupy_long = cp.asarray(listings['longitude'])
n_cupy_array, e_cupy_array = latlong2osgbgrid_cupy(cupy_lat, cupy_long)
listings['northing'] = cudf.Series(n_cupy_array).astype('float32')
listings['easting'] = cudf.Series(e_cupy_array).astype('float32')

listings.head()

## Aplicación de modelo de regresión para predicción de valores

In [None]:
%%time
regression = cuml.LinearRegression()
#regression = cuml.LogisticRegression()
#regression = cuml.Ridge()
x_train, x_test, y_train, y_test  = cuml.train_test_split(listings[columns_to_fit], listings['price'], train_size=0.9)
x_test_index = x_test.reset_index(drop=True)
y_test_index = y_test.reset_index(drop=True)
regression.fit(x_train, y_train)

Visualización de coeficientes y valor del intercepto tras el entrenamiento

In [None]:
# Build a two-column table pairing each fitted feature name with the
# coefficient the GPU model learned for it.
coef_map = cudf.DataFrame()
coef_map['key'] = columns_to_fit
# assumes regression.coef_ has one entry per column in columns_to_fit, in the
# same order — TODO confirm
coef_map['value'] = regression.coef_
print("Coefficients:")
# Last expression of the cell (only comments follow), so the table is displayed.
coef_map

# Intercept output is currently disabled.
#print("Intercept:")
#print(regression.intercept_)

Visualización de predicciones

In [None]:
%%time
predictions = regression.predict(x_test_index)
y_results = cudf.DataFrame()
y_results['prediction'] = predictions
y_results['real'] = y_test_index
y_results[0:10]

# Sección 2: Regresión por CPU

## Carga y tratamiento de datos

In [None]:
%%time
listings_cpu = pd.DataFrame()

for city in cities_to_use:
    directory = '../data/' + city + '/'
    if os.path.exists(directory):
        for file in os.listdir(directory):
            if file.endswith('.csv'):
                temp_df_cpu = pd.read_csv(directory + file, usecols = columns_to_use)
                if 'price' in temp_df.columns:
                    if(temp_df_cpu['host_acceptance_rate'].dtype != 'object'):
                        temp_df_cpu['host_acceptance_rate'] = temp_df_cpu['host_acceptance_rate'].astype('object')
                    if(temp_df_cpu['neighbourhood_cleansed'].dtype != 'object'):
                        temp_df_cpu['neighbourhood_cleansed'] = temp_df_cpu['neighbourhood_cleansed'].astype('object')
                    if listings_cpu.size == 0:
                        listings_cpu = temp_df_cpu
                    else:
                        for column in listings_cpu.columns:
                            if listings_cpu[column].dtype != temp_df_cpu[column].dtype:
                                print('Found error: '+column+' type '+listings_cpu[column].dtype.name+' doesnt match '+temp_df_cpu[column].dtype.name)
                        listings_cpu = listings_cpu.append(temp_df_cpu)
                    
listings_cpu = listings_cpu.drop_duplicates().reset_index(drop=True)

listings_cpu['accommodates'] = listings_cpu['accommodates'].astype('int32')
listings_cpu['number_of_reviews'] = listings_cpu['number_of_reviews'].astype('int32')
listings_cpu['reviews_per_month'] = listings_cpu['reviews_per_month'].astype('float32').fillna(-1.0)
listings_cpu['neighbourhood_cleansed'], neighborhood_names = listings_cpu['neighbourhood_cleansed'].factorize()

listings_cpu['host_response_rate'] = listings_cpu['host_response_rate'].fillna('-1').str.replace('%', '').astype('int32')
listings_cpu['host_acceptance_rate'] = listings_cpu['host_acceptance_rate'].fillna('-1').str.replace('%', '').astype('int32')
listings_cpu['price'] = listings_cpu['price'].fillna('-1').str.replace('$', '').str.replace(',', '').astype('float32')

numpy_lat = listings_cpu['latitude'].to_numpy()
numpy_long = listings_cpu['longitude'].to_numpy()
n_numpy_array, e_numpy_array = latlong2osgbgrid_numpy(numpy_lat, numpy_long)
listings_cpu['northing'] = pd.Series(n_numpy_array).astype('float32')
listings_cpu['easting'] = pd.Series(e_numpy_array).astype('float32')
listings_cpu.head()

## Aplicación de modelo de regresión

In [None]:
%%time
linreg_cpu = LinearRegression()
x_train_cpu, x_test_cpu, y_train_cpu, y_test_cpu  = train_test_split(listings_cpu[['northing', 'easting']], listings_cpu['price'], train_size=0.9)
x_test_cpu_index = x_test_cpu.reset_index(drop=True)
y_test_cpu_index = y_test_cpu.reset_index(drop=True)
linreg_cpu.fit(x_train_cpu, y_train_cpu)

In [None]:
# The CPU model is fitted on the columns ['northing', 'easting'], in that
# order, so coef_[0] belongs to northing and coef_[1] to easting. The label
# is written in that same order.
print("Coefficients: [northing, easting]")
print([linreg_cpu.coef_[0], linreg_cpu.coef_[1]])

print("Intercept:")
print(linreg_cpu.intercept_)

In [None]:
%%time
predictions_cpu = linreg_cpu.predict(x_test_cpu_index)
predictions_cpu[0:5]

In [None]:
# Actual prices of the first five held-out rows, for comparison with the
# predictions shown in the previous cell.
y_test_cpu_index.head(5)