## Hotels Challenge I.

given a database of hotels, and a set of input coordinates, for each coordinate, find the hotel closest to it

a solution is represented by a directory

the directory must contain one [yaml](https://en.wikipedia.org/wiki/YAML) file:
`commands.yaml`

and in the file, on the top level, 4 keys can have values: `setup-env-command`, `etl-command`, `process-command`
and `cleanup-command`. 

- `setup-env-command` sets up the environment where the other commans can run. it can assume the presence of python3.7 and pip
- `etl-command` runs, when the data is already accessible by the solution, in this case in a `hotel_table.csv` in the root of the solution. the etl command can do whatever it wants with the data to prepare it for the process command
- when `process-command` runs, an additional `inputs.json` file is also present in the solution root. your task is to make this command write out the answers to the queries found in inputs into an `outputs.json` file in the root of the solution, as fast as possible. this is the only mandatory value
- `cleanup-command` runs after everything is done


solutions will be avaluated based on:
- scaling with size of input
- scaling with data size

there are 4 levels for evaluation:
- 10k hotels - 1, 2, 5, 10 queries
- 5 queries - 10k, 50k, 100k, 200k hotels
- 50k hotels - 1, 10, 100, 1000 queries
- 500k hotels - 1, 10, 100, 1000 queries

### install package for data downloading and evaluation

In [None]:
#!pip install --upgrade git+https://github.com/endreMBorza/jkg_evaluators

In [62]:
from jkg_evaluators.challenges.data.hotels import get_hotel_data, dump_hotel_input
import shutil
import os

### download practice data

In [None]:
#get_hotel_data()

### select one and move to notebook root

In [63]:
data_size_to_copy = 10000
shutil.copyfile(os.path.join("data", 
                             f"{data_size_to_copy}.csv"), 
                "data.csv")

'data.csv'

### generate some inputs

In [64]:
dump_hotel_input(size=1000, path="inputs.json")

## base solution ETL

In [6]:
%%time
import pandas as pd

data_file_path = "data.csv"

df = pd.read_csv(data_file_path)

df.loc[:, ["lon", "lat", "name"]].to_csv("filtered.csv", index=None)


NameError: name 'kdtree' is not defined

## function solution ETL

In [65]:
%%time
import pandas as pd
data_file_path = "data.csv"

df = pd.read_csv(data_file_path)

df.loc[:, ["lon", "lat", "name"]].to_pickle("filtered.pkl")

Wall time: 60.5 ms


## ETL 3

In [31]:
%%time
import pandas as pd
import pickle
from sklearn.neighbors import KDTree
import numpy as np
data_file_path = "data.csv"

df = pd.read_csv(data_file_path)

df.loc[:, ["lon", "lat", "name"]].to_pickle("filtered.pkl")

tree = KDTree(np.deg2rad(df[['lat', 'lon']].values), metric = 'euclidean', leaf_size = 400)  
try:
    open("tree.obj",'rb')
except:
    
    with open('tree.obj', 'wb') as f:
        pickle.dump(tree, f, pickle.HIGHEST_PROTOCOL)

Wall time: 65 ms


## base solution process

In [None]:
%%time
import pandas as pd
import numpy as np
import json

input_locations = json.load(open('inputs.json', 'r'))

df = pd.read_csv('filtered.csv')

answers = []

for place in input_locations:
    min_distance = np.inf
    closest_place = {}
    for idx,row in df.iterrows():
        distance = ((place['lon']-row['lon']) ** 2 + (place['lat']-row['lat']) ** 2) ** 0.5
        if distance < min_distance:
            min_distance = distance
            closest_place = row[['lon','lat','name']].to_dict()
    answers.append(closest_place.copy())

json.dump(answers,open('outputs.json','w'))

In [None]:
answers

## iterrow solution process

In [None]:
%%time
import pandas as pd
import numpy as np
import json

input_locations = json.load(open('inputs.json', 'r'))

df = pd.read_pickle('filtered.pkl')

answers = []

min_distances = [np.inf] * len(input_locations)
answers = [{}] * len(input_locations)

for idx,row in df.iterrows():
    
    for lidx, place in enumerate(input_locations):
        
        distance = ((place['lon']-row['lon']) ** 2 + (place['lat']-row['lat']) ** 2) ** 0.5
        
        if distance < min_distances[lidx]:
            min_distances[lidx] = distance
            answers[lidx] = row[['lon','lat','name']].to_dict()

json.dump(answers,open('outputs.json','w'))

## With itertuple (and pickle, iterrow optimalized)

In [None]:
%%time
import pandas as pd
import numpy as np
import json

input_locations = json.load(open('inputs.json', 'r'))

df = pd.read_pickle('filtered.pkl')


min_distances = [np.inf] * len(input_locations)
answers = [{}] * len(input_locations)


for row in df.itertuples():

    for lidx, place in enumerate(input_locations):

        distance = ((place['lon']-row[1]) ** 2 + (place['lat']-row[2]) ** 2) ** 0.5

        if distance < min_distances[lidx]:
            min_distances[lidx] = distance
            answers[lidx] = {'lon':row[1],'lat':row[2],'name':row[3]}
    
json.dump(answers,open('outputs.json','w'))

## With k-nearest neighbour

In [43]:
%%time
#good
import numpy as np
from sklearn.neighbors import BallTree
import pandas as pd
import numpy as np
import json

input_locations = json.load(open('inputs.json', 'r'))
df = pd.read_pickle('filtered.pkl')

answers = []
query_all = []

[query_all.append(list(dic.values())) for dic in input_locations]
            
bt = BallTree(np.deg2rad(df[['lat', 'lon']].values), metric = 'euclidean', leaf_size = 400)

dist, ind = bt.query(np.deg2rad(query_all))   

[answers.append(df[['lon', 'lat', 'name']].iloc[i].to_dict('records')[0]) for i in ind]
    
json.dump(answers,open('outputs.json','w'))

Wall time: 2.39 s


In [119]:
%%time
#best1
from sklearn.neighbors import KDTree
import pandas as pd
import numpy as np
import json

input_locations = json.load(open('inputs.json', 'r'))
df = pd.read_pickle('filtered.pkl')
answers, query_all = [], []
[query_all.append(tuple(dic.values())) for dic in input_locations]
tree = KDTree(np.deg2rad(df[['lat', 'lon']].values), metric = 'euclidean')
dist, ind = tree.query(np.deg2rad(query_all))
[answers.append(dict(df.iloc[i[0]])) for i in ind]
json.dump(answers,open('outputs.json','w'))

Wall time: 280 ms


In [115]:
%%time
#best2
from sklearn.neighbors import KDTree
import pandas as pd
import numpy as np
import json

input_locations = json.load(open('inputs.json', 'r'))
df = pd.read_pickle('filtered.pkl')
query_all = list(map(lambda dic: tuple(dic.values()), input_locations))
tree = KDTree(np.deg2rad(df[['lat', 'lon']].values), metric = 'euclidean')
dist, ind = tree.query(np.deg2rad(query_all))
answers = list(map(lambda i: dict(df.iloc[i[0]]), ind))
json.dump(answers,open('outputs.json','w'))

Wall time: 280 ms


## EZ alatt sok faszsÃ¡g van csak

In [None]:
%load_ext Cython

In [None]:
import setuptools

In [None]:
input_locations = json.load(open('inputs.json', 'r'))

df = pd.read_pickle('filtered.pkl')

answers = []

min_distances = [np.inf] * len(input_locations)
answers = [{}] * len(input_locations)

for idx,row in df.iterrows():
    
    for lidx, place in enumerate(input_locations):
        
        distance = ((place['lon']-row['lon']) ** 2 + (place['lat']-row['lat']) ** 2) ** 0.5
        
        if distance < min_distances[lidx]:
            min_distances[lidx] = distance
            answers[lidx] = row[['lon','lat','name']].to_dict()

json.dump(answers,open('outputs.json','w'))

In [None]:
%%cython

cdef int a = 0
for i in range(10):
    a += i
print(a)

In [None]:
%%time
import pandas as pd
import numpy as np
import json

input_locations = json.load(open('inputs.json', 'r'))

df = pd.read_pickle('filtered.pkl')

answers = []

min_distances = [np.inf] * len(input_locations)
answers = [{}] * len(input_locations)

for idx,row in df.iterrows():
    
    for lidx, place in enumerate(input_locations):
        
        distance = ((place['lon']-row['lon']) ** 2 + (place['lat']-row['lat']) ** 2) ** 0.5
        
        if distance < min_distances[lidx]:
            min_distances[lidx] = distance
            answers[lidx] = dict(row[['lon','lat','name']])

json.dump(answers,open('outputs.json','w'))