In [1]:
import pandas as pd
import spacy
import os
from tqdm import tqdm
import numpy as np
from collections import Counter

## class part
import overpy
from geopy.geocoders import Nominatim
import math
from geopy.distance import geodesic
from uszipcode import SearchEngine


In [2]:
class location_processor():
  def __init__(self):
    self.api = overpy.Overpass()
    pass

  ## Function that returns closest distnce and number of the facilities
  def NearbyDisAndNum(self,lat, lon, query):
      if lat == None:
        return (None, None)
      else:
        result = self.api.query(query.format(lat, lon))
        closest_distance = math.inf
        for node in result.nodes:
            distance = math.sqrt((float(node.lat) - lat)**2 + (float(node.lon) - lon)**2)
            if distance < closest_distance:
                closest_distance = distance
        return (closest_distance,len(result.nodes))

  ## Function that returns numbers of the facilities
  def NearbyNum(self,lat, lon, query):
      if lat == None:
        result = self.api.query(query.format(lat, lon))
        return len(result.nodes)
      else:
        return None

  ## Function that find zipcode of inputted address
  def get_zipcode(self,lat, lon):
      search = SearchEngine()
      result = search.by_coordinates(lat = lat, lng = lon, returns = 1)
      RealEstateByZip = pd.read_pickle('./utiles/RealEstateByZip.pkl')
      try :
          zip = result[0].zipcode
          real_estate = RealEstateByZip[RealEstateByZip["zipcode"]==int(zip)].iat[0,1]
          return real_estate
      except IndexError:
          return None

  ## Calculate mean(Price) of same accommodates or beds 
  def average_cal(self,flg, num,lat,lon, data_for_cal):
    if num == None or lat == None:
      return None
    
    if flg == "accommodates":
        data_temp = data_for_cal[data_for_cal["accommodates"]==num]
    elif flg == "beds":
        data_temp = data_for_cal[data_for_cal["beds"]==num]
        data_temp = data_temp[(data_temp["latitude"] > lat - 0.01) & (data_temp["latitude"] < lat + 0.01) ]
        data_temp = data_temp[(data_temp["longitude"] > lon - 0.01) & (data_temp["longitude"] < lon + 0.01) ]
    
    return data_temp["price"].mean()

  ## Calculate distance between other two point(lat, lon)
  def distance(self,airbnb_lat, airbnb_lon, sight_lat, sight_lon):
      if airbnb_lat == None:
        return None
      else:
        dis = round(geodesic((airbnb_lat, airbnb_lon), (sight_lat, sight_lon)).km, 2)
        return dis

  def process_airbnb_data(self,df):
      # from google.colab import drive 
      # drive.mount('/content/drive')
      df_temp = pd.read_pickle('./utiles/area_features.pkl')


      extracted_features = pd.merge(df["id"], df_temp.drop(["latitude","longitude"], axis = 1), on = "id", how = "left")
      col = extracted_features.columns
      loc_col = ["Location_" + i for i in col ]
      loc_col[0] = "id"
      extracted_features.columns = loc_col

      return extracted_features
  

  ## Making dict that inculdes all features
  def process_new_data(self, address, ac = None, beds = None):
      ## Set a dict that keeps features calculated by the functions

      if address == None:
        result_dict = {}
      else:
        result_dict = {}
        geolocator = Nominatim(user_agent="gmt89")
        location = geolocator.geocode(address)
        lat = location.latitude
        lon = location.longitude

      ## Set queries that are used by the function "NearbyDisAndNum"
      transport = {
          "bus_stop_500m" : 'node["highway"="bus_stop"](around:500, {0}, {1}); out;',
          "bus_stop_1000m" : 'node["highway"="bus_stop"](around:1000, {0}, {1}); out;',
          "station_500m" : 'node[railway=station](around:500, {0}, {1}); out;',
          "station_1000m" : 'node[railway=station](around:1000, {0}, {1}); out;',
          "cafe_500m": 'node["amenity"="cafe"](around:500, {0}, {1}); out;'
      }

      ## Extract a distance and number 

      for key, value in transport.items():
        if address == None:
            result_dict["Location_"+key+"_dis"] = None
            result_dict["Location_" + key+"_num"] = None
        else:          
          output = self.NearbyDisAndNum(lat,lon, value)
          dis = output[0]
          num = output[1]
          result_dict["Location_"+key+"_dis"] = dis*111
          result_dict["Location_" + key+"_num"] = num
        


      ## Set queries that are used by the function "NearbyNum"
      facility = {
          "restaurant":'node[amenity=restaurant](around:1000, {0}, {1}); out;',
          "supermarket":'node["shop"="supermarket"](around:1000, {0}, {1}); out;'
      }

      ## Extract number of facilities by the function "NearbyNum" 

      for key, value in facility.items():
        if address == None:
          result_dict["Location_" + key+"_num"] = None
        else:          
          output = self.NearbyNum(lat,lon, value)
          result_dict["Location_" + key+"_num"] = output

      ## Extract real estate average price
      if address == None:
        result_dict["Location_real_estate"]  = None
      else:
        result_dict["Location_real_estate"]  = self.get_zipcode(lat, lon)

      ## To calculate mean() of price within same accommodates and beds

      #Preparing master dataset
      data = pd.read_pickle('./utiles/data_with_zip.pkl')

      data_for_cal = data[["id","latitude","longitude","accommodates","beds","price"]]

      #Insert mean of price to result_dict 
      if address ==None:
        result_dict["Location_mean_area_accommodates_price"] = None
        result_dict["Location_mean_area_beds_price"] = None
      else:
        result_dict["Location_mean_area_accommodates_price"] = self.average_cal("accommodates",ac, lat, lon, data_for_cal)
        result_dict["Location_mean_area_beds_price"] = self.average_cal("beds",beds, lat, lon, data_for_cal)

      #Preparing master sightseeng dataset
      df_sightseeing = pd.read_pickle('./utiles/master_sightseeing.pkl')

      #Insert distance bewteen airbnb house and famous sightseeing facility
      for i in range(len(df_sightseeing)):
        if address == None:
          result_dict["Location_"+df_sightseeing.iat[i,0]] = None
        else:
          result_dict["Location_"+df_sightseeing.iat[i,0]] = self.distance(lat, lon, df_sightseeing.iat[i,2], df_sightseeing.iat[i,3])


      # result = pd.DataFrame.from_dict(result_dict)

      df_result = pd.DataFrame(result_dict,index = [0])
      if address == None:
        df_result["Location_transport_most_close_dis"] = None
        df_result["Location_transport_1000m_num"] =  None
        df_result["Location_transport_500m_num"] =  None
      else:       
        df_result["Location_transport_most_close_dis"] = df_result.apply(lambda x : min([x["Location_bus_stop_500m_dis"],x["Location_bus_stop_1000m_dis"],x["Location_station_1000m_dis"],x["Location_station_500m_dis"]]), axis=1)
        df_result["Location_transport_1000m_num"] =  df_result.apply(lambda x : sum([x["Location_bus_stop_1000m_num"],x["Location_station_1000m_num"]]),axis=1)
        df_result["Location_transport_500m_num"] =  df_result.apply(lambda x : sum([x["Location_bus_stop_500m_num"],x["Location_station_500m_num"]]),axis=1)
      df_result = df_result.drop(["Location_bus_stop_500m_dis","Location_bus_stop_500m_num", "Location_bus_stop_1000m_dis",
                                  "Location_bus_stop_1000m_num","Location_station_500m_dis","Location_station_500m_num",
                                  "Location_station_1000m_dis","Location_station_1000m_num"], axis = 1)
      
      return df_result 

    

In [3]:
processor = location_processor()

In [4]:
import pickle
df = pickle.load(open('./utiles/area_features.pkl','rb'))

In [5]:
res = processor.process_airbnb_data(df)

In [6]:
res

Unnamed: 0,id,Location_supermarket_num,Location_restaurant_num,Location_cafe_500m_dis,Location_cafe_500m_num,Location_transport_most_close_dis,Location_transport_500m_num,Location_transport_1000m_num,Location_Hollywood Walk of Fame,Location_Griffith Observatory,...,Location_Melrose Avenue,Location_ACMA,Location_Rodeo Drive,Location_Santa Monica Promenade,Location_Farmers Market,Location_Abbot Kinney Blvd,Location_Los Angeles International Airport,Location_mean_area_accommodates_price,Location_mean_area_beds_price,Location_real_estate
0,65467,3,35,0.264829,3,0.193192,6,46,9.46,12.95,...,6.39,4.96,5.46,10.55,5.70,8.31,52.78,136.350876,130.241379,1663967.3247817142
1,206662,0,32,0.453008,1,0.301758,8,36,0.72,4.62,...,3.96,4.55,6.43,16.90,3.77,16.66,60.77,107.387100,183.716812,1490491.188123733
2,67089,1,36,0.323650,3,0.190330,10,58,11.94,15.60,...,8.38,7.51,6.74,8.32,8.18,5.58,51.84,160.285721,120.010201,1663967.3247817142
3,210489,2,20,0.362765,1,0.118904,13,37,7.35,11.31,...,3.44,3.43,2.14,10.33,3.79,9.53,56.10,506.333344,339.000000,2330891.742962242
4,109,1,8,inf,0,0.090714,13,46,13.82,16.96,...,10.99,9.35,9.86,10.97,10.15,7.11,48.41,213.000000,239.399994,1043530.198522237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40433,774860592421665035,3,3,inf,0,0.035223,13,52,1.24,5.08,...,3.51,3.27,5.95,16.34,2.56,15.75,59.29,109.520287,117.845741,1068719.7443117255
40434,774864578948159716,2,0,inf,0,0.185208,6,31,16.29,19.89,...,12.67,11.83,10.82,7.71,12.52,3.30,48.92,184.172409,224.250000,1251735.3471947794
40435,774866706304798119,0,0,inf,0,0.289222,12,26,9.86,11.69,...,9.34,6.64,9.92,16.33,7.36,13.41,50.54,88.857140,153.956528,823735.0437999574
40436,774872573559544018,1,6,0.122973,1,0.127720,19,56,13.81,17.08,...,10.79,9.32,9.52,10.25,10.10,6.42,48.83,174.600006,146.272720,1043530.198522237


In [8]:
new = processor.process_new_data('test')

In [12]:
set(res.columns) - set(new.columns)

{'id'}

In [13]:
set(new.columns) - set(res.columns)

set()

In [14]:
new

Unnamed: 0,Location_cafe_500m_dis,Location_cafe_500m_num,Location_restaurant_num,Location_supermarket_num,Location_real_estate,Location_mean_area_accommodates_price,Location_mean_area_beds_price,Location_Hollywood Walk of Fame,Location_Griffith Observatory,Location_Universal Studios Hollywood,...,Location_Melrose Avenue,Location_ACMA,Location_Rodeo Drive,Location_Santa Monica Promenade,Location_Farmers Market,Location_Abbot Kinney Blvd,Location_Los Angeles International Airport,Location_transport_most_close_dis,Location_transport_1000m_num,Location_transport_500m_num
0,inf,0,,,,,,12381.03,12378.59,12377.14,...,12383.81,12385.4,12385.42,12392.72,12384.59,12395.29,12438.34,inf,0,0
