In [10]:
import math
import numpy as np
import pandas as pd

## distance

In [2]:
## returns great-circle (haversine) distance in km -- NOT a planar/Euclidean distance
def distance(address, landmark):
    """Return the haversine distance in kilometres between two coordinates.

    Parameters
    ----------
    address : tuple
        ``(lat, lon)`` of the first point, in decimal degrees.
    landmark : tuple
        ``(lat, lon)`` of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance measured along the surface of a sphere of radius 6371 km.
    """
    lat1, lon1 = address
    lat2, lon2 = landmark
    p = 0.017453292519943295  # math.pi / 180: converts degrees to radians
    # Haversine term, written with cosines instead of sin^2(x / 2).
    a = (0.5 - math.cos((lat2 - lat1) * p) / 2
         + math.cos(lat1 * p) * math.cos(lat2 * p)
         * (1 - math.cos((lon2 - lon1) * p)) / 2)
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/asin raise ValueError.
    a = min(1.0, max(0.0, a))

    return 12742 * math.asin(math.sqrt(a))  # 2 * R; R = 6371 km

In [3]:
# Sanity check: distance between two campus landmarks, displayed as the cell output.
distance(
    (42.278165, -83.734749),  # CCTC
    (42.291151, -83.717377),  # Pierpont
)

2.031583762765907

In [4]:
# Load the geocoded listings, discard the 'Unnamed: 0' column (presumably an
# index saved by an earlier export -- verify), and shorten the coordinate
# column names to 'lat' / 'lon'.
df = (
    pd.read_csv('../data/geocoded_data/coords.csv', lineterminator='\n')
    .drop(columns='Unnamed: 0')
    .rename(columns={'latitude': 'lat', 'longitude': 'lon'})
)

In [5]:
# Keep only the listings that have both an address and a latitude.
# NOTE(review): the earlier comment here said "no price or coordinates", yet
# the filter inspects 'address' and 'lat' only -- confirm whether 'price'
# and/or 'lon' were meant to be checked as well.
df = df.dropna(subset=['address', 'lat'])

In [6]:
# Campus landmarks and their (lat, lon) coordinates in decimal degrees.
# The two lists are paired by position: landmark_coords[i] belongs to landmarks[i].
landmarks = ['CCTC', 'Pierpont', 'Stadium', 'Union', 'Ross', 'IM', 'NCRB', 'CCRB', 'UgLi']
landmark_coords = [(42.278165, -83.734749),
                   (42.291151, -83.717377),
                   (42.265865, -83.748684),
                   (42.275177, -83.741517),
                   (42.278046, -83.738220),
                   (42.269382, -83.749068),
                   (42.295156, -83.714627),
                   (42.281104, -83.733603),
                   (42.275654, -83.737176)]

# zip() silently truncates to the shorter list, so fail loudly if the two
# parallel lists ever get out of sync.
assert len(landmarks) == len(landmark_coords), 'each landmark needs exactly one coordinate pair'

# Add one distance_to_<landmark> column per landmark, holding the value that
# distance() returns for every listing's (lat, lon).
for landmark, coord in zip(landmarks, landmark_coords):
    df[f'distance_to_{landmark}'] = [
        distance((lat, lon), coord) for lat, lon in zip(df.lat, df.lon)
    ]

In [7]:
# Preview the first rows to check that the distance_to_* columns were added.
df.head()

Unnamed: 0,address,price,bed,bath,area,company,neighborhood,laundry,pets,parking,...,lon,distance_to_CCTC,distance_to_Pierpont,distance_to_Stadium,distance_to_Union,distance_to_Ross,distance_to_IM,distance_to_NCRB,distance_to_CCRB,distance_to_UgLi
0,3050 Birch Hollow Dr,1095.0,2.0,1.0,876,,,1.0,1.0,1.0,...,-83.718505,4.557801,5.80206,3.886862,4.448382,4.637215,4.213861,6.254798,4.846131,4.358015
1,912 Rose Ave,2000.0,4.0,2.5,2000,,,1.0,1.0,1.0,...,-83.738165,1.762333,3.614058,0.942182,1.434291,1.726546,1.177845,4.113428,2.100388,1.462828
3,1505 Natalie Lane near Kipling Drive,1385.0,1.0,1.0,800,,,1.0,1.0,1.0,...,-83.683362,4.577147,2.815139,6.21557,5.21968,4.846902,6.056612,2.575089,4.372745,4.87206
4,1505 Natalie Lane near Kipling Drive,1185.0,1.0,1.0,800,,,1.0,1.0,1.0,...,-83.683362,4.577147,2.815139,6.21557,5.21968,4.846902,6.056612,2.575089,4.372745,4.87206
6,1505 Natalie Lane near Kipling Drive,1375.0,1.0,1.0,800,,,1.0,1.0,0.0,...,-83.683362,4.577147,2.815139,6.21557,5.21968,4.846902,6.056612,2.575089,4.372745,4.87206


## neighborhood

In [8]:
import geopandas as gp
from shapely.geometry import Point, Polygon, shape

# Neighborhood boundary geometries read from the GeoJSON map file.
gs = gp.GeoSeries.from_file('../map/map2.geojson')

# Display names for those geometries. calc_neighborhood pairs `gs` and `names`
# by position via zip(), so this order is assumed to match the feature order
# in the file -- TODO confirm against map2.geojson.
names = [
    'North Ingalls', 'Old Fourth Ward', 'Old West Side', 'Germantown',
    'Elbel', 'Yost', 'East Packard', 'Tappan',
    'North Burns Park', 'Oxbridge', 'South University', 'West Murfin',
    'Northside',
]

In [11]:
def calc_neighborhood(listing):
    """Return the name of the neighborhood that contains a listing.

    Parameters
    ----------
    listing : mapping-like
        Anything supporting ``listing['lon']`` and ``listing['lat']``
        (e.g. a DataFrame row).

    Returns
    -------
    str
        The entry of the module-level ``names`` paired (by position) with the
        first geometry in ``gs`` that the point lies within, or
        ``'Other Surrounding Areas'`` when no geometry matches.
    """
    # Point takes x first, so longitude is passed before latitude.
    location = Point(listing['lon'], listing['lat'])
    # Lazy generator: geometries are tested in order and the search stops at
    # the first hit, exactly like an early-returning loop.
    hits = (
        label
        for region, label in zip(gs, names)
        if location.within(shape(region))
    )
    return next(hits, 'Other Surrounding Areas')

# Classify every listing row and store the labels in the 'neighborhood' column
# (assigned positionally, in row order).
neighborhoods = [calc_neighborhood(row) for _, row in df.iterrows()]

df['neighborhood'] = np.array(neighborhoods)

In [15]:
# List the distinct neighborhood labels that were assigned.
df['neighborhood'].unique()

array(['Other Surrounding Areas', 'Old Fourth Ward', 'Old West Side',
       'Germantown', 'Northside', 'North Ingalls', 'North Burns Park',
       'Elbel', 'South University', 'East Packard', 'Oxbridge', 'Tappan',
       'Yost', 'West Murfin'], dtype=object)

In [74]:
# Save the enriched frame (listings plus the distance_to_* and neighborhood
# columns added above) as JSON for the modelling step.
df.to_json('../data/model_data/data.json')