In [11]:
pip install overpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
pip install uszipcode

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [55]:
## class part
import overpy
from geopy.geocoders import Nominatim
import math
from geopy.distance import geodesic
import pandas as pd

from uszipcode import SearchEngine

class location_processor():
    """Builds "Location_*" features for Airbnb listings.

    Features for a new address are computed from Overpass (OpenStreetMap)
    queries, a zipcode lookup and several pickled reference tables stored under
    ``DATA_DIR``. Features for already-known listings are read from a
    precomputed table.

    NOTE: the reference tables are loaded with ``pd.read_pickle``; unpickling
    can execute arbitrary code, so only point ``DATA_DIR`` at trusted files.
    """

    # Directory holding the pickled reference tables read by this class.
    DATA_DIR = '/content/drive/MyDrive/Capstone/Scripts/Location_part/Data/'

    # Factor applied to the degree-based distances returned by
    # NearbyDisAndNum to express them (approximately) in kilometres.
    KM_PER_DEGREE = 111

    # Half-width, in degrees, of the latitude/longitude window used by
    # average_cal to decide which listings count as "in the same area".
    AREA_HALF_WIDTH_DEG = 0.01

    def __init__(self):
        """Create the Overpass API client shared by the query helpers."""
        self.api = overpy.Overpass()

    ## Function that returns closest distance and number of the facilities
    def NearbyDisAndNum(self, lat, lon, query):
        """Return ``(closest_distance, count)`` for the nodes matched by ``query``.

        Args:
            lat, lon: coordinates substituted into ``query`` as ``{0}`` and ``{1}``.
            query: Overpass query template.

        Returns:
            ``(None, None)`` when ``lat`` is None. Otherwise a tuple of the
            smallest Euclidean distance, measured in degrees, between
            ``(lat, lon)`` and any returned node (``math.inf`` when no node
            is returned) and the number of returned nodes.
        """
        if lat is None:
            return (None, None)

        result = self.api.query(query.format(lat, lon))
        closest_distance = min(
            (
                math.sqrt((float(node.lat) - lat) ** 2 + (float(node.lon) - lon) ** 2)
                for node in result.nodes
            ),
            default=math.inf,
        )
        return (closest_distance, len(result.nodes))

    ## Function that returns numbers of the facilities
    def NearbyNum(self, lat, lon, query):
        """Return the number of nodes matched by ``query`` around ``(lat, lon)``.

        Returns None, without querying, when ``lat`` is None. (The original
        condition was inverted: it queried only when ``lat`` was None and
        returned None for every real coordinate.)
        """
        if lat is None:
            return None

        result = self.api.query(query.format(lat, lon))
        return len(result.nodes)

    ## Function that finds the zipcode of the given coordinates and looks it up
    def get_zipcode(self, lat, lon):
        """Look up the real-estate table entry for the zipcode nearest to ``(lat, lon)``.

        Returns:
            The value at column position 1 of the first row of
            ``RealEstateByZip.pkl`` whose ``"zipcode"`` equals the zipcode found
            (presumably the real-estate price -- verify against the table), or
            None when no zipcode is found or the table has no matching row.
        """
        search = SearchEngine()
        result = search.by_coordinates(lat=lat, lng=lon, returns=1)
        if not result:
            # No zipcode near these coordinates: nothing to look up.
            return None

        RealEstateByZip = pd.read_pickle(self.DATA_DIR + 'RealEstateByZip.pkl')
        try:
            zipcode = result[0].zipcode
            matches = RealEstateByZip[RealEstateByZip["zipcode"] == int(zipcode)]
            return matches.iat[0, 1]
        except (IndexError, TypeError, ValueError):
            # IndexError: no row for this zipcode.
            # TypeError / ValueError: the zipcode is missing or not numeric.
            return None

    ## Calculate mean(Price) of same accommodates or beds
    def average_cal(self, flg, num, lat, lon, data_for_cal):
        """Mean price of nearby listings that share the same ``flg`` value.

        Args:
            flg: column to match on, ``"accommodates"`` or ``"beds"``.
            num: value that column must equal.
            lat, lon: centre of the area window.
            data_for_cal: DataFrame with ``flg``, ``"latitude"``,
                ``"longitude"`` and ``"price"`` columns.

        Returns:
            None when ``num`` or ``lat`` is None; otherwise the mean of
            ``"price"`` over the matching rows (NaN when nothing matches).

        Raises:
            ValueError: if ``flg`` is not one of the supported columns.
        """
        if num is None or lat is None:
            return None
        if flg not in ("accommodates", "beds"):
            raise ValueError('flg must be "accommodates" or "beds", got {!r}'.format(flg))

        half_width = self.AREA_HALF_WIDTH_DEG
        # The area window is applied for both flags; originally it was only
        # applied for "beds", so the "accommodates" mean ignored the location.
        in_area = (
            (data_for_cal["latitude"] > lat - half_width)
            & (data_for_cal["latitude"] < lat + half_width)
            & (data_for_cal["longitude"] > lon - half_width)
            & (data_for_cal["longitude"] < lon + half_width)
        )
        data_temp = data_for_cal[(data_for_cal[flg] == num) & in_area]
        return data_temp["price"].mean()

    ## Calculate distance between two points (lat, lon)
    def distance(self, airbnb_lat, airbnb_lon, sight_lat, sight_lon):
        """Geodesic distance in kilometres, rounded to 2 decimals.

        Returns None when ``airbnb_lat`` is None.
        """
        if airbnb_lat is None:
            return None
        return round(geodesic((airbnb_lat, airbnb_lon), (sight_lat, sight_lon)).km, 2)

    def process_airbnb_data(self, df):
        """Return the precomputed area features for the listings in ``df``.

        ``df["id"]`` is left-joined with ``area_features.pkl`` (minus its
        ``"latitude"`` / ``"longitude"`` columns); every resulting column
        except ``"id"`` is prefixed with ``"Location_"``.
        """
        df_temp = pd.read_pickle(self.DATA_DIR + 'area_features.pkl')

        extracted_features = pd.merge(
            df["id"],
            df_temp.drop(["latitude", "longitude"], axis=1),
            on="id",
            how="left",
        )
        extracted_features.columns = [
            name if name == "id" else "Location_" + name
            for name in extracted_features.columns
        ]
        return extracted_features

    ## Making dict that includes all features
    def process_new_data(self, address, ac=None, beds=None):
        """Compute the location features for a single new address.

        Args:
            address: free-text address to geocode, or None.
            ac: number of accommodates used for the area mean price, or None.
            beds: number of beds used for the area mean price, or None.

        Returns:
            A one-row DataFrame of ``"Location_*"`` columns. When ``address``
            is None, or the geocoder finds no match for it, every feature is
            None.
        """
        lat = lon = None
        if address is not None:
            geolocator = Nominatim(user_agent="gmt89")
            location = geolocator.geocode(address)
            # geocode() returns None when the address cannot be resolved;
            # treat that like a missing address instead of crashing.
            if location is not None:
                lat = location.latitude
                lon = location.longitude
        has_location = lat is not None

        ## Dict that keeps the features calculated by the functions
        result_dict = {}

        ## Queries that are used by the function "NearbyDisAndNum"
        transport = {
            "bus_stop_500m": 'node["highway"="bus_stop"](around:500, {0}, {1}); out;',
            "bus_stop_1000m": 'node["highway"="bus_stop"](around:1000, {0}, {1}); out;',
            "station_500m": 'node[railway=station](around:500, {0}, {1}); out;',
            "station_1000m": 'node[railway=station](around:1000, {0}, {1}); out;',
            "cafe_500m": 'node["amenity"="cafe"](around:500, {0}, {1}); out;'
        }

        ## Extract a distance and number (both None when there is no location)
        for key, value in transport.items():
            dis, num = self.NearbyDisAndNum(lat, lon, value)
            result_dict["Location_" + key + "_dis"] = (
                None if dis is None else dis * self.KM_PER_DEGREE
            )
            result_dict["Location_" + key + "_num"] = num

        ## Queries that are used by the function "NearbyNum"
        facility = {
            "restaurant": 'node[amenity=restaurant](around:1000, {0}, {1}); out;',
            "supermarket": 'node["shop"="supermarket"](around:1000, {0}, {1}); out;'
        }

        ## Extract number of facilities by the function "NearbyNum"
        for key, value in facility.items():
            result_dict["Location_" + key + "_num"] = self.NearbyNum(lat, lon, value)

        ## Extract real estate average price
        result_dict["Location_real_estate"] = (
            self.get_zipcode(lat, lon) if has_location else None
        )

        ## Mean of price within the same accommodates and beds
        if has_location:
            # The master dataset is only loaded when it can actually be used.
            data = pd.read_pickle(self.DATA_DIR + 'data_with_zip.pkl')
            data_for_cal = data[["id", "latitude", "longitude", "accommodates", "beds", "price"]]
            result_dict["Location_mean_area_accommodates_price"] = self.average_cal(
                "accommodates", ac, lat, lon, data_for_cal)
            result_dict["Location_mean_area_beds_price"] = self.average_cal(
                "beds", beds, lat, lon, data_for_cal)
        else:
            result_dict["Location_mean_area_accommodates_price"] = None
            result_dict["Location_mean_area_beds_price"] = None

        ## Distance between the address and each famous sightseeing facility.
        ## Column 0 is used as the feature name, columns 2 and 3 as the
        ## facility's coordinates (presumably latitude, longitude -- verify).
        df_sightseeing = pd.read_pickle(self.DATA_DIR + 'master_sightseeing.pkl')
        for i in range(len(df_sightseeing)):
            result_dict["Location_" + df_sightseeing.iat[i, 0]] = self.distance(
                lat, lon, df_sightseeing.iat[i, 2], df_sightseeing.iat[i, 3])

        ## Aggregate the bus stop / station features into "transport" features
        if has_location:
            result_dict["Location_transport_most_close_dis"] = min(
                result_dict["Location_bus_stop_500m_dis"],
                result_dict["Location_bus_stop_1000m_dis"],
                result_dict["Location_station_1000m_dis"],
                result_dict["Location_station_500m_dis"],
            )
            result_dict["Location_transport_1000m_num"] = (
                result_dict["Location_bus_stop_1000m_num"]
                + result_dict["Location_station_1000m_num"]
            )
            result_dict["Location_transport_500m_num"] = (
                result_dict["Location_bus_stop_500m_num"]
                + result_dict["Location_station_500m_num"]
            )
        else:
            result_dict["Location_transport_most_close_dis"] = None
            result_dict["Location_transport_1000m_num"] = None
            result_dict["Location_transport_500m_num"] = None

        df_result = pd.DataFrame(result_dict, index=[0])
        # The per-type bus stop / station columns are only inputs to the
        # aggregated transport columns and are not part of the output.
        df_result = df_result.drop(
            ["Location_bus_stop_500m_dis", "Location_bus_stop_500m_num",
             "Location_bus_stop_1000m_dis", "Location_bus_stop_1000m_num",
             "Location_station_500m_dis", "Location_station_500m_num",
             "Location_station_1000m_dis", "Location_station_1000m_num"],
            axis=1,
        )

        return df_result

    

In [56]:
## Confirming process_new_data

# The processor reads its reference tables from Google Drive, so mount it first.
from google.colab import drive
drive.mount('/content/drive')

# Example input: an address plus optional listing attributes.
address = "200 N Spring St. Los Angeles"
accommodates, beds = None, 4

loc = location_processor()
result_new = loc.process_new_data(address, ac=accommodates, beds=beds)

# Show the resulting one-row feature table as the cell output.
result_new

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
# Confirming process_airbnb_data
# NOTE(review): `df` is not defined in any cell visible here; it must already
# exist in the kernel and have an "id" column (process_airbnb_data reads
# df["id"]). On a fresh kernel this cell fails unless `df` is loaded first.
loc = location_processor()
result_list = loc.process_airbnb_data(df)

# Display the merged "Location_*" feature table as the cell output.
result_list