In [25]:
import pandas as pd
import numpy as np
import geopy.distance
from haversine import haversine

In [26]:
# Points of interest used for scoring; the preview in the next cell shows the
# columns place_id, lat, lon, name, topic and centroid.
dfPlaces = pd.read_csv('../data/externalData/placesOfInterest.csv', encoding='utf-8')
# Rows to be scored. makeScoresForIds reads this frame positionally
# (column 0 as the id, columns 5 and 6 as the coordinates).
dfTrain = pd.read_csv('../data/TRAIN_TEST_corrected2/train_corrected2.csv')

In [27]:
# Show the last 10 places to check the column layout and the topic labels.
dfPlaces.tail(10)

Unnamed: 0,place_id,lat,lon,name,topic,centroid
2389,ChIJR_mVyca1vJURBWmPMGrA41c,-34.570247,-58.444337,Olleros,SUBTES,CABA
2390,ChIJ9YiBmoLLvJURTgPWJPEA1LE,-34.659502,-58.418041,Espacio Verde Ezequiel Demonty,PARQUES,SUR
2391,ChIJ7RNuPKS1vJURiR7kLC7sLro,-34.563292,-58.424077,Cnel. Jordan C. Wysocki Park,PARQUES,OESTE
2392,ChIJ0edzkJ7JvJURV50M-djr97E,-34.644294,-58.476915,Calesita Parque Nicolás Avellaneda,PARQUES,CABA
2393,ChIJRzAK7Zy2vJURHmHHuYWAJv4,-34.550454,-58.467112,Juana Azurduy,METROBUS,CABA
2394,ChIJp6o51pjKvJURJHk8cSWRqpc,-34.588154,-58.398162,Las Heras,SUBTES,OESTE
2395,ChIJkyFHrHe1vJURd0a3ENmtgyg,-34.57496,-58.415052,Monumento a La Carta Magna y las Cuatro Region...,TURISMO,NORTE
2396,ChIJC-COMCjLvJURvGt0t-UtDkY,-34.618231,-58.38158,Independencia,SUBTES,OESTE
2397,ChIJwUyDAjM1o5URhqe69AlgVfM,-34.599536,-58.397953,School of Medicine,SUBTES,NORTE
2398,ChIJ3Y0AxjM1o5URQUk53zruw3o,-34.600735,-58.36684,Tourist Assistance Center - Puerto Madero,TURISMO,NORTE


In [28]:
def _entornoScore(puntaje, d, dmax) :
    
    if (d > dmax) :
        return 0
    
    exp = 3/float(2)
    return puntaje * (1 - (d/float(dmax))**exp )

In [29]:
def entornoScore(lat, lon, matrix, puntajes):
    """Accumulate the surroundings score of the point (`lat`, `lon`).

    Args:
        lat, lon: coordinates of the point being scored.
        matrix: iterable of place rows, read positionally: index 1 is the
            latitude, index 2 the longitude and index 4 the topic.
        puntajes: mapping topic -> {'puntaje': ..., 'dist_max_mts': ...}.

    Returns:
        1 plus the sum of the contributions of every row in `matrix`.

    Raises:
        KeyError: if a row's topic is not a key of `puntajes`.
    """
    COL_LAT, COL_LON, COL_TOPIC = 1, 2, 4
    origin = (lat, lon)

    total = 1  # scores start at 1, so a point with nothing nearby gets 1
    for place in matrix:
        rules = puntajes[place[COL_TOPIC]]
        weight, reach = rules['puntaje'], rules['dist_max_mts']
        place_coords = (place[COL_LAT], place[COL_LON])

        # The haversine() result is scaled by 1000 so it is comparable with
        # dist_max_mts (presumably kilometres -> metres; confirm the
        # library's default unit).
        metres = haversine(place_coords, origin) * 1000.0
        total += _entornoScore(weight, metres, reach)

    return total

In [61]:
def makeScoresForIds(df, places=None):
    """Compute the surroundings score of every row in `df`.

    Args:
        df: DataFrame read positionally: column 0 is the id, columns 5 and 6
            are passed to entornoScore as latitude and longitude
            (NOTE(review): positions are hard-coded; confirm they match the
            layout of the frame being scored).
        places: DataFrame of points of interest in the layout entornoScore
            expects. Defaults to the notebook-level `dfPlaces`.

    Returns:
        DataFrame with columns ['id', 'entorno_score'], one row per distinct
        id. When an id appears more than once, the score of its last row wins.
    """
    # PUNTAJES: full score and maximum reach (in metres) for each topic.
    puntajes = { 'SUBTES'    : { 'puntaje':80,  'dist_max_mts':500 },
                 'SHOPPINGS' : { 'puntaje':100, 'dist_max_mts':900 },
                 'METROBUS'  : { 'puntaje':40,  'dist_max_mts':300 },
                 'TRENES'    : { 'puntaje':65,  'dist_max_mts':500 },
                 'PARQUES'   : { 'puntaje':30,  'dist_max_mts':300 },
                 'TURISMO'   : { 'puntaje':20,  'dist_max_mts':400 },
                 'SEGURIDAD' : { 'puntaje':45,  'dist_max_mts':250 }
               }

    if places is None:
        # Backward-compatible default: use the frame loaded at notebook level.
        places = dfPlaces

    ID, lat, lon = (0, 5, 6)

    # .values replaces DataFrame.as_matrix(), which no longer exists in
    # current pandas; both yield the same 2-D array.
    big_array = df.values
    small_array = places.values

    # Every id is assigned inside the loop, so no NaN pre-initialisation is
    # needed. Progress is not reported: the previous percentage bookkeeping
    # was never printed and has been removed.
    id_scores = {}
    for row in big_array:
        id_scores[row[ID]] = entornoScore(row[lat], row[lon], small_array, puntajes)

    # list(...) because dict.items() is a view on Python 3.
    return pd.DataFrame(list(id_scores.items()), columns=['id', 'entorno_score'])

In [62]:
# Score every row of the training frame against all points of interest.
# NOTE(review): the percentage trace stored as this cell's output is not
# produced by makeScoresForIds as currently written (it contains no print);
# re-run the notebook to refresh the output.
dfScores = makeScoresForIds(dfTrain)

1.00002289065 %	2.0000457813 %	3.00006867195 %	4.00009156261 %	5.00011445326 %	6.00013734391 %	7.00016023456 %	8.00018312521 %	9.00020601586 %	10.0002289065 %	11.0002517972 %	12.0002746878 %	13.0002975785 %	14.0003204691 %	15.0003433598 %	16.0003662504 %	17.0003891411 %	18.0004120317 %	19.0004349224 %	20.000457813 %	21.0004807037 %	22.0005035943 %	23.000526485 %	24.0005493756 %	25.0005722663 %	26.0005951569 %	27.0006180476 %	28.0006409382 %	29.0006638289 %	30.0006867195 %	31.0007096102 %	32.0007325008 %	33.0007553915 %	34.0007782821 %	35.0008011728 %	36.0008240634 %	37.0008469541 %	38.0008698448 %	39.0008927354 %	40.0009156261 %	41.0009385167 %	42.0009614074 %	43.000984298 %	44.0010071887 %	45.0010300793 %	46.00105297 %	47.0010758606 %	48.0010987513 %	49.0011216419 %	50.0011445326 %	51.0011674232 %	52.0011903139 %	53.0012132045 %	54.0012360952 %	55.0012589858 %	56.0012818765 %	57.0013047671 %	58.0013276578 %	59.0013505484 %	60.0013734391 %	61.0013963297 %	62.0014192204 %	63.001442111 %

In [63]:
# Persist the scores; index=False keeps the positional index out of the file.
dfScores.to_csv('../data/externalData/scoresOfInterest.csv', index=False)

In [83]:
# Preview the first 10 scores; a value of exactly 1.0 is the base score with
# no contribution added.
dfScores.head(10)

Unnamed: 0,id,entorno_score
0,0354e83b801ddacbb4cc623d412e932ed078c0a6,369.221623
1,1771e4252915a911b8eb4964cde2790d5623bd33,1.0
2,5afa082f4f2ab589393f90ed8f4195f19693fe15,617.9441
3,42144fb9c20cb5903a13119049e770f31f4ff77f,1.0
4,7ea6baad8af2dafe5f563128a8d027905c64ac0e,1.0
5,6e78a58391e01ef7ea978e256a02943437096a4c,315.243884
6,486c44c6ae60cc40aa7aac085bf20a83a3e68e0e,1.0
7,3310de102f192c4a03a415e3f648799caef02da7,242.498491
8,005481f73ee4472d5c42f326537cc319af2a5f40,742.460857
9,85a35d84cce2d042518af471cb91101df8379881,1.0
