# Sección 1

## Carga y tratamiento de datos

In [1]:
import os
import os.path

import cudf
import cupy as cp
import cuml
from cuml.linear_model import LogisticRegression, MBSGDClassifier, MBSGDRegressor
from cuml.multiclass import MulticlassClassifier
from cuml.naive_bayes import MultinomialNB
from cuml.ensemble import RandomForestClassifier
from cuml.svm import SVC
from cuml.metrics.regression import mean_squared_error as mnsq

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mnsq_cpu

%run ../utils/f_northing.py
%run ../utils/f_northing_numpy.py
%run ../utils/f_price_range.py
%run ../utils/f_static_data.py
%run ../utils/f_utils.py

# Cities whose CSV dumps are loaded from ../data/<city>/ by the loading cells.
# The commented lines are alternative selections; enable exactly one.
cities_to_use = ['sevilla']
#cities_to_use = ['shanghai']
#cities_to_use = cities_to_use_1()
#cities_to_use = cities_to_use_2()
# NOTE: these two assignments rebind the helper names (presumably defined by the
# %run'd utility scripts above) to their return values. It only works on re-run
# because the %run lines in this same cell define the helpers again first.
columns_to_use = columns_to_use()
columns_to_fit = columns_to_fit()

# Ask cuML to return cuDF objects from its estimators and utilities.
cuml.set_global_output_type('cudf')

64 bit

In [None]:
%%time
# Load every CSV under ../data/<city>/ into one cuDF dataframe, using float64
# as the working precision for the numeric columns.
float_dtype = 'float64'
# Columns whose dtype is checked per file and forced to `float_dtype` when it
# differs (missing values become -1 before the cast).
float_columns = ['host_total_listings_count', 'bathrooms', 'bedrooms', 'beds']

listings = cudf.DataFrame()

for city in cities_to_use:
    directory = '../data/' + city + '/'
    # isdir (rather than exists) so a stray file with the city name is skipped
    # instead of making os.listdir raise.
    if not os.path.isdir(directory):
        continue
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and anything that depends on it downstream) reproducible.
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.csv'):
            continue
        temp_df = cudf.read_csv(directory + file, usecols = columns_to_use)
        # Helper from the %run'd utils; presumably normalises these columns to
        # object/string dtype -- TODO confirm against f_utils.py.
        standard_object_type(temp_df, ['host_acceptance_rate', 'neighbourhood_cleansed'])
        for column in float_columns:
            if temp_df[column].dtype != float_dtype:
                temp_df[column] = temp_df[column].fillna(-1).astype(float_dtype)
        if listings.size == 0:
            listings = temp_df
        else:
            # Diagnostic only: report columns whose dtype differs from what has
            # been accumulated so far.
            for column in listings.columns:
                if listings[column].dtype != temp_df[column].dtype:
                    print('Found error: '+column+' type '+listings[column].dtype.name+' doesnt match '+temp_df[column].dtype.name)
            # DataFrame.append is deprecated in cuDF; cudf.concat is the
            # supported equivalent.
            listings = cudf.concat([listings, temp_df])

listings = listings.drop_duplicates()
listings = listings.reset_index(drop=True)

# The *_64 helpers come from the %run'd utility scripts; presumably they cast,
# factorize and clean the listed columns in place using 64-bit types.
type_conversion_64(listings, ['host_id', 'accommodates', 'number_of_reviews', 'reviews_per_month', 'minimum_nights', 'maximum_nights', 'availability_30', 'availability_90', 'availability_365', 'number_of_reviews_ltm', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'host_total_listings_count', 'bathrooms', 'bedrooms', 'beds'])
column_factorize_64(listings, ['neighbourhood_cleansed', 'host_response_time', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'property_type', 'room_type', 'instant_bookable'])

clean_format_strings_64(listings, ['host_response_rate', 'host_acceptance_rate'])
clean_format_price_64(listings, ['price'])
# Map each price through priceRange, requesting a float64 output column.
listings['price'] = listings['price'].applymap(priceRange, 'float64')

# Derive northing/easting columns from latitude/longitude on the GPU.
cupy_lat = cp.asarray(listings['latitude'])
cupy_long = cp.asarray(listings['longitude'])
n_cupy_array, e_cupy_array = latlong2osgbgrid_cupy(cupy_lat, cupy_long)
listings['northing'] = cudf.Series(n_cupy_array).astype('float64')
listings['easting'] = cudf.Series(e_cupy_array).astype('float64')

listings.head()

32 bit

In [2]:
%%time
# Load every CSV under ../data/<city>/ into one cuDF dataframe, using float32
# as the working precision for the numeric columns.
float_dtype = 'float32'
# Columns whose dtype is checked per file and forced to `float_dtype` when it
# differs (missing values become -1 before the cast).
float_columns = ['host_total_listings_count', 'bathrooms', 'bedrooms', 'beds']

listings = cudf.DataFrame()

for city in cities_to_use:
    directory = '../data/' + city + '/'
    # isdir (rather than exists) so a stray file with the city name is skipped
    # instead of making os.listdir raise.
    if not os.path.isdir(directory):
        continue
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and the positional [0:2000] sample taken later) reproducible.
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.csv'):
            continue
        temp_df = cudf.read_csv(directory + file, usecols = columns_to_use)
        # Helper from the %run'd utils; presumably normalises these columns to
        # object/string dtype -- TODO confirm against f_utils.py.
        standard_object_type(temp_df, ['host_acceptance_rate', 'neighbourhood_cleansed'])
        for column in float_columns:
            if temp_df[column].dtype != float_dtype:
                temp_df[column] = temp_df[column].fillna(-1).astype(float_dtype)
        if listings.size == 0:
            listings = temp_df
        else:
            # Diagnostic only: report columns whose dtype differs from what has
            # been accumulated so far.
            for column in listings.columns:
                if listings[column].dtype != temp_df[column].dtype:
                    print('Found error: '+column+' type '+listings[column].dtype.name+' doesnt match '+temp_df[column].dtype.name)
            # DataFrame.append is deprecated in cuDF; cudf.concat is the
            # supported equivalent.
            listings = cudf.concat([listings, temp_df])

listings = listings.drop_duplicates()
listings = listings.reset_index(drop=True)

# These helpers come from the %run'd utility scripts; presumably they cast,
# factorize and clean the listed columns in place using 32-bit types.
type_conversion(listings, ['host_id', 'latitude', 'longitude', 'accommodates', 'number_of_reviews', 'reviews_per_month', 'minimum_nights', 'maximum_nights', 'availability_30', 'availability_90', 'availability_365', 'number_of_reviews_ltm', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'host_total_listings_count', 'bathrooms', 'bedrooms', 'beds'])
column_factorize(listings, ['neighbourhood_cleansed', 'host_response_time', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'property_type', 'room_type', 'instant_bookable'])

clean_format_strings(listings, ['host_response_rate', 'host_acceptance_rate'])
clean_format_price(listings, ['price'])
# Map each price through priceRange, requesting a float32 output column, then
# run the shared type conversion over the result.
listings['price'] = listings['price'].applymap(priceRange, 'float32')
type_conversion(listings, ['price'])

# Derive northing/easting columns from latitude/longitude on the GPU.
cupy_lat = cp.asarray(listings['latitude'])
cupy_long = cp.asarray(listings['longitude'])
n_cupy_array, e_cupy_array = latlong2osgbgrid_cupy(cupy_lat, cupy_long)
listings['northing'] = cudf.Series(n_cupy_array).astype('float32')
listings['easting'] = cudf.Series(e_cupy_array).astype('float32')

listings.head()

CPU times: user 2.01 s, sys: 364 ms, total: 2.38 s
Wall time: 2.38 s


Unnamed: 0,host_id,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,neighbourhood_cleansed,latitude,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,reviews_per_month,northing,easting
0,84759.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,1.0,67.0,37.40358,...,10.0,10.0,10.0,10.0,8.0,10.0,1.0,0.15,-1380266.0,47685.261719
1,84759.0,-1.0,-1.0,-1.0,0.0,1.0,1.0,1.0,67.0,37.40358,...,10.0,10.0,10.0,10.0,8.0,10.0,1.0,0.23,-1380266.0,47685.261719
2,84759.0,-1.0,-1.0,100.0,0.0,1.0,1.0,1.0,67.0,37.40358,...,10.0,10.0,10.0,10.0,8.0,10.0,1.0,0.16,-1380266.0,47685.261719
3,84759.0,-1.0,-1.0,60.0,0.0,1.0,1.0,1.0,67.0,37.40358,...,10.0,10.0,10.0,10.0,8.0,10.0,1.0,0.17,-1380266.0,47685.261719
4,84759.0,-1.0,-1.0,60.0,0.0,1.0,1.0,1.0,67.0,37.40358,...,10.0,10.0,10.0,10.0,8.0,10.0,1.0,0.19,-1380266.0,47685.261719


## Aplicación de un algoritmo aún por determinar

In [5]:
# Candidate estimators tried for predicting the price range. The trailing note
# on each line records the outcome observed when it was tried; leave exactly
# one `model = ...` line uncommented.
# NOTE(review): the saved output of the fit cell below shows a QN(loss='softmax')
# model, so the stored outputs were presumably produced with the QN line
# active rather than the CD line -- re-run to refresh them.
#model = MBSGDClassifier(learning_rate='constant', eta0=0.05, epochs=2000, fit_intercept=True, batch_size=1, tol=0.0, penalty='l2', loss='squared_loss', alpha=0.5) # all 0 (presumably every prediction was 0)
#model = MBSGDRegressor(learning_rate='constant', eta0=0.05, epochs=2000, fit_intercept=True, batch_size=1, tol=0.0, penalty='l2', loss='squared_loss', alpha=0.5) # all NaN
#model = MulticlassClassifier(LogisticRegression(), strategy='ovo') # L-BFGS line search failed - not viable
#model = MultinomialNB() # all 0 (presumably every prediction was 0)
#model = cuml.SGD(learning_rate='constant', eta0=0.005, epochs=2000, fit_intercept=True, batch_size=2,tol=0.0, penalty='none', loss='squared_loss') # all NaN
#model = RandomForestClassifier(max_features=1.0, n_bins=8, n_estimators=40) # categories must be consecutive
model = cuml.CD(alpha=0.0) # OK
#model = cuml.QN(loss='softmax') # OK but not ideal
#model = SVC(kernel='poly', degree=2, gamma='auto', C=1) # memory error

In [6]:
%%time
# Train on a small positional sample: the first 2000 listings, using only the
# projected coordinates as features and the price range as target.
X = listings[['northing', 'easting']].iloc[:2000]
y = listings['price'].iloc[:2000]
# train_test_split shuffles by default; a fixed random_state makes the 90/10
# split, and every metric computed from it below, reproducible across runs.
x_train, x_test, y_train, y_test  = cuml.train_test_split(X, y, train_size=0.9, random_state=42)
model.fit(x_train, y_train)

[E] [17:16:52.159354] L-BFGS line search failed
CPU times: user 151 ms, sys: 73.6 ms, total: 225 ms
Wall time: 237 ms


QN(loss='softmax', fit_intercept=True, l1_strength=0.0, l2_strength=0.0, max_iter=1000, tol=0.001, linesearch_max_iter=50, lbfgs_memory=5, verbose=4, handle=<cuml.raft.common.handle.Handle object at 0x7f006a8c2b90>, output_type='cudf')

In [7]:
%%time
# Predict on the held-out features; timed separately from fitting by %%time.
predictions = model.predict(x_test)

CPU times: user 6.68 ms, sys: 0 ns, total: 6.68 ms
Wall time: 5.27 ms


In [8]:
# Drop the index y_test carries over from the split and renumber from 0.
# Presumably needed so it lines up row-by-row with `predictions` when both are
# placed in the same dataframe -- verify that `predictions` has a default index.
y_test = y_test.reset_index(drop=True)

In [9]:
# Side-by-side table of predicted vs. real targets, both cast to float64 so
# they can be compared and summarised with the same dtype.
testdf = cudf.DataFrame()
testdf['pred'] = predictions.astype('float64')
testdf['real'] = y_test.astype('float64')
testdf.head()

Unnamed: 0,pred,real
0,2.0,6.0
1,2.0,1.0
2,2.0,3.0
3,2.0,2.0
4,2.0,3.0


In [10]:
# Summary statistics for both columns; a zero std on 'pred' means the model
# produced the same value for every test row.
testdf.describe(include='all')

Unnamed: 0,pred,real
count,200.0,200.0
mean,2.0,2.175
std,0.0,1.412213
min,2.0,0.0
25%,2.0,1.0
50%,2.0,2.0
75%,2.0,2.0
max,2.0,9.0


In [11]:
# mnsq is cuml.metrics.regression.mean_squared_error (aliased in the imports
# cell); arguments are passed as (real, predicted).
loss = mnsq(testdf['real'], testdf['pred'])
print(loss)

2.015


# Sección 2

## Carga y tratamiento de datos

In [None]:
%%time
# CPU (pandas) counterpart of the GPU loading cell: load every CSV under
# ../data/<city>/ into one dataframe, using float64 as the working precision.
float_dtype = 'float64'
# Columns whose dtype is checked per file and forced to `float_dtype` when it
# differs (missing values become -1 before the cast).
float_columns = ['host_total_listings_count', 'bathrooms', 'bedrooms', 'beds']

listings_cpu = pd.DataFrame()

for city in cities_to_use:
    directory = '../data/' + city + '/'
    # isdir (rather than exists) so a stray file with the city name is skipped
    # instead of making os.listdir raise.
    if not os.path.isdir(directory):
        continue
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and machines.
    for file in sorted(os.listdir(directory)):
        if not file.endswith('.csv'):
            continue
        temp_df_cpu = pd.read_csv(directory + file, usecols = columns_to_use)
        # Helper from the %run'd utils; presumably normalises these columns to
        # object/string dtype -- TODO confirm against f_utils.py.
        standard_object_type(temp_df_cpu, ['host_acceptance_rate', 'neighbourhood_cleansed'])
        for column in float_columns:
            if temp_df_cpu[column].dtype != float_dtype:
                temp_df_cpu[column] = temp_df_cpu[column].fillna("-1").astype(float_dtype)
        if listings_cpu.size == 0:
            listings_cpu = temp_df_cpu
        else:
            # Diagnostic only: report columns whose dtype differs from what has
            # been accumulated so far.
            for column in listings_cpu.columns:
                if listings_cpu[column].dtype != temp_df_cpu[column].dtype:
                    print('Found error: '+column+' type '+listings_cpu[column].dtype.name+' doesnt match '+temp_df_cpu[column].dtype.name)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # equivalent that works on every pandas version.
            listings_cpu = pd.concat([listings_cpu, temp_df_cpu])

listings_cpu = listings_cpu.drop_duplicates()
listings_cpu = listings_cpu.reset_index(drop=True)

# The *_64 helpers come from the %run'd utility scripts; presumably they cast,
# factorize and clean the listed columns in place using 64-bit types.
type_conversion_64(listings_cpu, ['host_id', 'accommodates', 'number_of_reviews', 'reviews_per_month', 'minimum_nights', 'maximum_nights', 'availability_30', 'availability_90', 'availability_365', 'number_of_reviews_ltm', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'host_total_listings_count', 'bathrooms', 'bedrooms', 'beds'])
column_factorize_64(listings_cpu, ['neighbourhood_cleansed', 'host_response_time', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'property_type', 'room_type', 'instant_bookable'])

clean_format_strings_64(listings_cpu, ['host_response_rate', 'host_acceptance_rate'])
clean_format_price_64_cpu(listings_cpu, ['price'])
# Series.apply takes no output dtype: its second positional parameter is
# `convert_dtype`, so the previous apply(priceRange, 'float64') never requested
# float64. Cast explicitly to get the dtype the GPU cell asks for.
listings_cpu['price'] = listings_cpu['price'].apply(priceRange).astype('float64')

# Derive northing/easting columns from latitude/longitude with NumPy.
numpy_lat = listings_cpu['latitude'].to_numpy()
numpy_long = listings_cpu['longitude'].to_numpy()
n_numpy_array, e_numpy_array = latlong2osgbgrid_numpy(numpy_lat, numpy_long)
listings_cpu['northing'] = pd.Series(n_numpy_array).astype('float64')
listings_cpu['easting'] = pd.Series(e_numpy_array).astype('float64')
listings_cpu.head()

## Aplicación de algoritmo

In [None]:
# Clear the interactive namespace without asking for confirmation (-f).
# NOTE(review): this also removes the imports, the helpers loaded with %run and
# the dataframes built above, so any cell after this one must set them up again.
%reset -f