# Housing Days On Market Wrangling For Geo-Coordinates

## Python Package(s) Used

In [None]:
from geopy.distance import vincenty
import numpy as np
import pandas as pd
import time

## Data Fetching

In [None]:
# Property listings: the main table every distance feature is computed for
df_main = pd.read_csv('output_csv-1-7-17.csv')

# School locations (public, independent, capital gains), loaded in that order.
# Charter_Schools.csv is intentionally not loaded: it has no lat/long columns.
df_public_school, df_ind_school, df_cap_gain_school = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Public_Schools.csv',
        'Independent_Schools.csv',
        'Capital_Gains_Schools.csv',
    )
)

# Grocery store locations
df_grocery = pd.read_csv('Grocery_Store_Locations.csv')

# Metro bus stops and regional Metro station entrances
df_metro_bus, df_metro_station = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Metro_Bus_Stops.csv',
        'Metro_Station_Entrances_Regional.csv',
    )
)

## Data Wrangling

In [None]:
# Preview the first five rows of the main listings table
df_main.head(n=5)

In [None]:
# These three files expose their coordinates as X/Y; give them explicit
# Longitude/Latitude names so the coordinate columns can be matched by name.
xy_to_lon_lat = {'X': 'Longitude', 'Y': 'Latitude'}

df_ind_school = df_ind_school.rename(columns=xy_to_lon_lat)
df_grocery = df_grocery.rename(columns=xy_to_lon_lat)
df_metro_station = df_metro_station.rename(columns=xy_to_lon_lat)

In [None]:
# Pair latitude with longitude in each DataFrame so every row carries a single
# (lat, long) tuple in a 'coord' column for the distance calculations.
# zip() is wrapped in list() because on Python 3 it returns a lazy iterator
# rather than a list of tuples; list(zip(...)) yields the same column on
# Python 2 and Python 3.
df_main['coord'] = list(zip(df_main['PropertyLatitude'], df_main['PropertyLongitude']))

df_public_school['coord'] = list(zip(df_public_school['LATITUDE'], df_public_school['LONGITUDE']))
df_ind_school['coord'] = list(zip(df_ind_school['Latitude'], df_ind_school['Longitude']))
df_cap_gain_school['coord'] = list(zip(df_cap_gain_school['LATITUDE'], df_cap_gain_school['LONGITUDE']))

df_grocery['coord'] = list(zip(df_grocery['Latitude'], df_grocery['Longitude']))
df_metro_bus['coord'] = list(zip(df_metro_bus['LATITUDE'], df_metro_bus['LONGITUDE']))
df_metro_station['coord'] = list(zip(df_metro_station['Latitude'], df_metro_station['Longitude']))

In [None]:
# Output frame that the distance/count columns are added to.
# .copy() makes it independent of df_main, so adding columns to it does not
# trigger pandas' SettingWithCopyWarning for writing to a slice.
df_main_distances = df_main[['Address', 'PropertyLongitude', 'PropertyLatitude']].copy()

In [None]:
# Shortest distance mapping for grocery stores:
# for every property, the distance (km) to the nearest grocery store and the
# number of grocery stores within 1 km.

# Read the reference coordinates once instead of once per property
grocery_coords = list(df_grocery['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in grocery_coords]
    # min() raises on an empty list; record NaN when there is nothing to measure against
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_grocery_km'] = lst_distance
df_main_distances['count_grocery_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_grocery.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for metro station entrances:
# for every property, the distance (km) to the nearest station entrance and
# the number of entrances within 1 km.

# Read the reference coordinates once instead of once per property
metro_station_coords = list(df_metro_station['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in metro_station_coords]
    # min() raises on an empty list; record NaN when there is nothing to measure against
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_metro_station_km'] = lst_distance
df_main_distances['count_metro_station_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_metro_station.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Keep only the public schools whose STATUS is 'Active'.
# The explicit .copy() makes this an independent frame, so modifying it later
# does not trigger pandas' SettingWithCopyWarning.
df_public_school_active = df_public_school[df_public_school['STATUS'] == 'Active'].copy()

In [None]:
# List the distinct facility-use labels present among the active schools
df_public_school_active.FACUSE.unique()

In [None]:
# Normalise the doubled slash in the 'High School//Specialized' label.
# The result is rebound instead of using inplace=True: in-place modification
# of a frame obtained by filtering can emit pandas' SettingWithCopyWarning.
df_public_school_active = df_public_school_active.replace(
    ['High School//Specialized'], ['High School/Specialized'])

In [None]:
# Re-check the distinct facility-use labels among the active schools
df_public_school_active.FACUSE.unique()

In [None]:
# Split the active public schools into one DataFrame per FACUSE label
facuse = df_public_school_active['FACUSE']

df_public_school_mid = df_public_school_active[facuse == 'Middle School']
df_public_school_elem = df_public_school_active[facuse == 'Elementary School']
df_public_school_ye = df_public_school_active[facuse == 'Youth Engagement']
df_public_school_high = df_public_school_active[facuse == 'High School']
df_public_school_high_specialized = df_public_school_active[facuse == 'High School/Specialized']
df_public_school_edu_campus = df_public_school_active[facuse == 'Education Campus']
df_public_school_arts_center = df_public_school_active[facuse == 'Arts Center']
df_public_school_elem_specialized = df_public_school_active[facuse == 'Elementary School/Specialized']
df_public_school_special_ed = df_public_school_active[facuse == 'Special Education']

In [None]:
# Shortest distance mapping for middle schools:
# for every property, the distance (km) to the nearest middle school and the
# number of middle schools within 1 km.

# Read the reference coordinates once instead of once per property
mid_school_coords = list(df_public_school_mid['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in mid_school_coords]
    # The FACUSE filter may match no schools; min() would raise on an empty list
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_public_school_mid_km'] = lst_distance
df_main_distances['count_public_school_mid_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_public_school_mid.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for elementary schools:
# for every property, the distance (km) to the nearest elementary school and
# the number of elementary schools within 1 km.

# Read the reference coordinates once instead of once per property
elem_school_coords = list(df_public_school_elem['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in elem_school_coords]
    # The FACUSE filter may match no schools; min() would raise on an empty list
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_public_school_elem_km'] = lst_distance
df_main_distances['count_public_school_elem_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_public_school_elem.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for youth engagement schools:
# for every property, the distance (km) to the nearest youth engagement school
# and the number of such schools within 1 km.

# Read the reference coordinates once instead of once per property
ye_school_coords = list(df_public_school_ye['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in ye_school_coords]
    # The FACUSE filter may match no schools; min() would raise on an empty list
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_public_school_ye_km'] = lst_distance
df_main_distances['count_public_school_ye_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_public_school_ye.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for high schools:
# for every property, the distance (km) to the nearest high school and the
# number of high schools within 1 km.

# Read the reference coordinates once instead of once per property
high_school_coords = list(df_public_school_high['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in high_school_coords]
    # The FACUSE filter may match no schools; min() would raise on an empty list
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_public_school_high_km'] = lst_distance
df_main_distances['count_public_school_high_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_public_school_high.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for specialized high schools:
# for every property, the distance (km) to the nearest specialized high school
# and the number of such schools within 1 km.

# Read the reference coordinates once instead of once per property
high_specialized_coords = list(df_public_school_high_specialized['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in high_specialized_coords]
    # The FACUSE filter may match no schools; min() would raise on an empty list
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_public_school_high_specialized_km'] = lst_distance
df_main_distances['count_public_school_high_specialized_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_public_school_high_specialized.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for education campus schools:
# for every property, the distance (km) to the nearest education campus and
# the number of education campuses within 1 km.

# Read the reference coordinates once instead of once per property
edu_campus_coords = list(df_public_school_edu_campus['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in edu_campus_coords]
    # The FACUSE filter may match no schools; min() would raise on an empty list
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_public_school_edu_campus_km'] = lst_distance
df_main_distances['count_public_school_edu_campus_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_public_school_edu_campus.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for arts center schools:
# for every property, the distance (km) to the nearest arts center and the
# number of arts centers within 1 km.

# Read the reference coordinates once instead of once per property
arts_center_coords = list(df_public_school_arts_center['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in arts_center_coords]
    # The FACUSE filter may match no schools; min() would raise on an empty list
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_public_school_arts_center_km'] = lst_distance
df_main_distances['count_public_school_arts_center_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_public_school_arts_center.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for specialized elementary schools:
# for every property, the distance (km) to the nearest specialized elementary
# school and the number of such schools within 1 km.

# Read the reference coordinates once instead of once per property
elem_specialized_coords = list(df_public_school_elem_specialized['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in elem_specialized_coords]
    # The FACUSE filter may match no schools; min() would raise on an empty list
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_public_school_elem_specialized_km'] = lst_distance
df_main_distances['count_public_school_elem_specialized_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_public_school_elem_specialized.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for special education schools:
# for every property, the distance (km) to the nearest special education
# school and the number of such schools within 1 km.

# Read the reference coordinates once instead of once per property
special_ed_coords = list(df_public_school_special_ed['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in special_ed_coords]
    # The FACUSE filter may match no schools; min() would raise on an empty list
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_public_school_special_ed_km'] = lst_distance
df_main_distances['count_public_school_special_ed_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_public_school_special_ed.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for capital gains schools:
# for every property, the distance (km) to the nearest capital gains school
# and the number of such schools within 1 km.

# Read the reference coordinates once instead of once per property
cap_gain_coords = list(df_cap_gain_school['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in cap_gain_coords]
    # min() raises on an empty list; record NaN when there is nothing to measure against
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_cap_gain_school_km'] = lst_distance
df_main_distances['count_cap_gain_school_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_cap_gain_school.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Shortest distance mapping for independent schools:
# for every property, the distance (km) to the nearest independent school and
# the number of independent schools within 1 km.

# Read the reference coordinates once instead of once per property
ind_school_coords = list(df_ind_school['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in ind_school_coords]
    # min() raises on an empty list; record NaN when there is nothing to measure against
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_ind_school_km'] = lst_distance
df_main_distances['count_ind_school_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_ind_school.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
### This one is expected to take a very long time (51,000 addresses*10,500 bus stops)
# Shortest distance mapping for metro bus stops:
# for every property, the distance (km) to the nearest bus stop and the number
# of bus stops within 1 km.

# Read the reference coordinates once instead of once per property
metro_bus_coords = list(df_metro_bus['coord'])

lst_distance = []
lst_count = []

# Start clock for run time
start = time.time()

for prop_coord in df_main['coord']:
    dists_km = [vincenty(prop_coord, ref_coord).km for ref_coord in metro_bus_coords]
    # min() raises on an empty list; record NaN when there is nothing to measure against
    lst_distance.append(min(dists_km) if dists_km else np.nan)
    lst_count.append(sum(1 for d in dists_km if d <= 1))

df_main_distances['distance_metro_bus_km'] = lst_distance
df_main_distances['count_metro_bus_km'] = lst_count
# Snapshot the frame as it stands after adding these columns
df_main_distances.to_csv('df_main_distances_metro_bus.csv')

# Print run time (includes the CSV write); call syntax works on Python 2 and 3
print("\nProcessing took {:0.3f} seconds\n".format(time.time() - start))

In [None]:
# Preview the first five nearest-bus-stop distances
df_main_distances['distance_metro_bus_km'].head(n=5)

In [None]:
# Write the complete distances table, with every column added so far, to disk
df_main_distances.to_csv(path_or_buf='df_main_distances.csv')