<a href="https://colab.research.google.com/github/dylanjrt/blueberry/blob/main/blueberry_for_GitHub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Housekeeping

In [None]:
# Distance
!pip install haversine
!pip install python-google-places
!pip install -U googlemaps
from haversine import haversine, Unit
from googleplaces import GooglePlaces
import random
import googlemaps

# Key used for every Google Maps / Google Places client created below.
# NOTE(review): API_KEY is not defined anywhere in the visible source; it must
# be assigned (e.g. in an earlier, private cell) before this cell runs.
api_key = API_KEY

# Data Manipulation
import requests
import pandas as pd
import numpy as np
import csv

# Region Identification
import json
from shapely.geometry import shape, Point



# 🍇 **blueberry** 🍇



## Regional:
First we download the region-based data, including bounding polygons for the neighbourhood regions, income and population density, civics and equity, housing, and safety. These attributes were selected based on their overall impact on homeless populations.


In [None]:

# CKAN Download helper
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"


def get_ckan(package, limit=4000):
    """Load the first datastore-active resource of a CKAN package.

    Args:
        package: Parsed JSON response of the ``package_show`` endpoint; its
            ``["result"]["resources"]`` list is scanned in order.
        limit: Maximum number of records requested from ``datastore_search``.
            Defaults to 4000, the value that was previously hard-coded; larger
            tables are cut off at this many rows.

    Returns:
        A DataFrame built from the resource's records, or None when the
        package has no datastore-active resource.
    """
    # Separate name so the module-level package_show ``url`` is not shadowed.
    search_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/datastore_search"
    for resource in package["result"]["resources"]:
        if not resource["datastore_active"]:
            continue
        query = {"id": resource["id"], "limit": limit}
        data = requests.get(search_url, params=query).json()
        return pd.DataFrame(data["result"]["records"])
    return None
          
### ADDITIONAL ATTRIBUTES
# Builds two frames: df_hoods (neighbourhood polygons) and df_region (one row
# per neighbourhood id with the regional attributes merged in below).

# Regional Bounding Polygons for Toronto's Neighbourhoods

# Fetch the package metadata, then load its first datastore-active resource.
params = { "id": "4def3f65-2a65-4a4f-83c4-b2a4aed72d46"}
package = requests.get(url, params = params).json()
print(package["result"])
df_hoods = get_ckan(package)

# Gathering Income and Density values for each hood

params = { "id": "6e19a90f-971c-46b3-852c-0c48c436d1fc"}	
package = requests.get(url, params = params).json()
print(package["result"])
df_region = get_ckan(package)

# Transpose so the downloaded columns become rows, drop the first six of those
# rows, and turn the old index into a regular column.
df_region = df_region.transpose()
df_region = (df_region.iloc[6:]).reset_index()
# Keep the columns that came from downloaded records 0, 7 and 944; they are
# renamed to id, density and income on the next line.
# NOTE(review): these are positional magic numbers and record 944 is only
# present because get_ckan requests up to 4000 rows; confirm against the dataset.
df_region = df_region[[0,7,944]]
df_region.columns = ['id','density','income']
df_region['id'] = df_region['id'].astype(int)
df_region = df_region.sort_values(by = 'id')

# CIVICS & EQUITY INDICATORS
# Third sheet of the workbook (sheet_name = 2); its first data row holds the
# real column names, so promote it to the header and then drop it.
df_temp = pd.read_excel('https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/f62b0d1d-dc2d-4e0e-a9d3-aee112b9c400', sheet_name = 2)
df_temp.columns = df_temp.iloc[0]
df_temp = df_temp.rename(columns= {'Neighbourhood Id': "id"})
df_temp = df_temp.iloc[1:]

# Left merge: every civics/equity row is kept, density/income attached by id.
df_region = df_temp.merge(df_region, left_on="id", right_on="id", how = 'left')

# HOUSING
# The 'Neighbourhood' column is used as the join id here; 'RGI' becomes 'subsidized'.
df_temp = pd.read_excel('https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/30aa3bdd-7c64-416b-984d-3391c2c9599a')
df_temp = df_temp.rename(columns= {'Neighbourhood': "id", "RGI":"subsidized"})

df_region = df_region.merge(df_temp, left_on="id", right_on="id", how = 'left')


# SAFETY
# Only the 2019 break-and-enter, homicide and theft-over columns are kept.
params = { "id": "fc4d95a6-591f-411f-af17-327e6c5d03c7"}
package = requests.get(url, params = params).json()
print(package["result"])
df_temp = get_ckan(package)
df_temp = df_temp[['Hood_ID','BreakandEnter_2019','Homicide_2019','TheftOver_2019']]
df_temp = df_temp.rename(columns= {'Hood_ID': 'id'})
# Cast to int so the key has the same type as the id column built above.
df_temp['id'] = df_temp['id'].astype(int)

df_region = df_region.merge(df_temp, left_on="id", right_on="id", how = 'left')
df_region['id'] = df_region['id'].astype(int)



After merging all the datasets, we are left with a 140x14 dataframe representing the 140 neighbourhood regions and 14 features.

In [None]:
# Show the dtype of each column of the merged regional table.
df_region.dtypes

## Record Based Data

To generate the bulk of the info for each record, we needed to scrape several resource datasets; we stored the name of each source dataset in the `type` column for future modelling. Most of the data was taken from a broad study on youth suffering from homelessness and the resources available to them.

In [None]:
# YOUTH SERVICES

# 24 Sheets: sheet 0 is the legend, sheets 1-23 hold the service records.
youth_services_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/735c2177-513b-49dd-b4bc-6435d6a80efe/resource/5413c3d7-6c97-4437-987d-e47036f69324/download/wellbeing-toronto-youth-services-data-excel.xlsx"

# Download the workbook once; passing a list of sheet indices returns a dict
# of DataFrames keyed by those indices.
sheets = pd.read_excel(youth_services_url, sheet_name=list(range(24)))

# The text after the "\xa0 " separator in each legend entry is used as the
# service-type label of the corresponding sheet.
leg = sheets[0].dropna()
l = leg['LEGEND'].str.split("\xa0 ", n=1, expand=True)[1]

frames = []
for i in range(1, 24):
    # .copy() so that adding the 'type' column does not write into a slice.
    df_temp = sheets[i][['AgencyName', 'Address', 'Neighbourhood']].copy()
    df_temp['type'] = l.iloc[i]
    frames.append(df_temp)

# DataFrame.append was removed in pandas 2.0; stack all sheets with one concat.
df_main = pd.concat(frames)
  




## BULK LOADING:
# `params` and `types` are parallel lists: params[k] is the CKAN package id
# whose records get the label types[k].
params = [
    # ADULT EDUCATION UPGRADING
    {"id": "c01b9ad1-0720-4f4c-ab35-743f55756b85"},
    # SUBSTANCE USE TREATMENT
    {"id": "4db2d9d9-6590-41e6-bf2b-b9188436d044"},
    # ALTERNATIVE ADULT EDUCATION
    {"id": "9308a7e1-3781-45fd-95c7-582b2030f2c1"},
    # TRANSITIONAL HOUSING
    {"id": "cefad70f-2deb-425f-81d1-7d56cf682e65"},
    # LEGAL JUSTICE SUPPORT
    {"id": "ca757aba-734e-4a4f-8c63-07396abcb1fd"},
    # ABORIGINAL YOUTH
    {"id": "ee43541f-220c-41f1-af52-cadf5de1dd9b"},
    # SEXUAL HEALTH
    {"id": "0edbbd59-37e4-4d43-9d79-ac7b5d24db3d"},
    # FINANCIAL SERVICES
    {"id": "8dbb3143-416c-4f2e-ab67-c4af7d2d5edf"},
    # EDUCATIONAL SERVICES
    # NOTE(review): same package id as SEXUAL HEALTH above, so the same records
    # are loaded twice under two labels; confirm the intended id.
    {"id": "0edbbd59-37e4-4d43-9d79-ac7b5d24db3d"},
    # LGBTQ
    {"id": "bb40b7c9-a37d-46be-a89b-c23273d86c85"},
    # EMPLOYMENT
    {"id": "764c1564-0761-44b0-9b3a-5b2e914e66fb"},
    # MENTAL HEALTH
    {"id": "c9f4bc42-32b0-4198-a2a0-abd26a5f2a6b"},
    # REFUGEE HOUSING
    # NOTE(review): same package id as MENTAL HEALTH above; confirm the
    # intended id.
    {"id": "c9f4bc42-32b0-4198-a2a0-abd26a5f2a6b"},
    # HOUSING EVICTION HELP
    {"id": "279f11b4-aaf8-4275-b6af-fdcf679ecc2f"},
]
types = [
    "ADULT EDUCATION UPGRADING",
    "SUBSTANCE USE TREATMENT",
    "ALTERNATIVE ADULT EDUCATION",
    "TRANSITIONAL HOUSING",
    "LEGAL JUSTICE SUPPORT",
    "ABORIGINAL YOUTH",
    "SEXUAL HEALTH",
    "FINANCIAL SERVICES",
    "EDUCATIONAL SERVICES",
    "LGBTQ",
    "EMPLOYMENT",
    "MENTAL HEALTH",
    "REFUGEE HOUSING",
    "HOUSING EVICTION HELP",
]

# Dataset stacker
# Collect the frames and concatenate once: DataFrame.append was removed in
# pandas 2.0, and appending inside a loop copies the growing frame every time.
frames = [df_main]
for query, label in zip(params, types):
    package = requests.get(url, params=query).json()
    print(package["result"])
    df_temp = get_ckan(package)
    if df_temp is None:
        # get_ckan returns None when the package has no datastore-active
        # resource; fail with a message that names the dataset.
        raise ValueError("No datastore-active resource found for " + label)
    # Align the column names with the youth-services records already in df_main.
    df_temp = df_temp[['AGENCY_NAME', 'ADDRESS_FULL', 'NEIGHBOURHOOD']].rename(
        columns={'AGENCY_NAME': 'AgencyName', 'ADDRESS_FULL': 'Address', 'NEIGHBOURHOOD': 'Neighbourhood'}
    )
    df_temp['type'] = label
    print(label)
    frames.append(df_temp)

df_main = pd.concat(frames)

# # Extracted neighbourhood id:
# df_main['id'] = df_main['Neighbourhood'].str.split(", ", n = 1, expand = True)[1].astype(int)

# SHELTER DATA
params = {"id": "8a6eceb2-821b-4961-a29d-758f3087732d"}
package = requests.get(url, params=params).json()
print(package["result"])
df_temp = get_ckan(package)
# Map the shelter columns onto the shared schema; note that the city name ends
# up in the 'Neighbourhood' column for these records.
df_temp = df_temp[['SHELTER_NAME', 'SHELTER_ADDRESS', 'SHELTER_CITY']]
df_temp = df_temp.rename(columns={'SHELTER_NAME': 'AgencyName', 'SHELTER_ADDRESS': 'Address', 'SHELTER_CITY': 'Neighbourhood'})
# Keep one row per distinct name/address/city combination; .copy() so that
# adding 'type' never writes into a view of the downloaded frame.
df_temp = df_temp.drop_duplicates().copy()
df_temp['type'] = "SHELTER"

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df_main = pd.concat([df_main, df_temp])

# Remove NA values:
df_main = df_main.dropna(how='any')

# The stacked frames carry their own row labels; renumber from 0.
df_main = df_main.reset_index(drop=True)

## Address Attributes:

For each address, look up its rating and compute the distance to the nearest hospital, bus station, subway station and homeless shelter.

In [None]:
def get_lat_lng(string_addr):
    """Geocode an address with the Google Maps client.

    " Toronto Ontario" is appended to the address before the lookup.

    Args:
        string_addr: Address to geocode; converted with ``str()``.

    Returns:
        A ``(lat, lng)`` tuple taken from the first geocoding result, or None
        when there is no result or the lookup fails.
    """
    try:
        query = str(string_addr) + " Toronto Ontario"
        gmaps = googlemaps.Client(key=api_key)
        result = gmaps.geocode(query)
        if result:
            location = result[0]['geometry']['location']
            return location['lat'], location['lng']
        return None
    except Exception:
        # Best-effort lookup: any ordinary failure yields "no coordinates".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # through so a long geocoding run can still be stopped.
        return None


def get_placeID(string_addr):
    """Return the ``place_id`` of the first geocoding result for an address.

    The address is passed to the geocoder unchanged. Returns None when the
    geocoder yields no result.
    """
    matches = googlemaps.Client(key=api_key).geocode(string_addr)
    return matches[0]['place_id'] if matches else None

def process(lat, lng, string_addr, radius=50):
    """Find the first place matching a keyword near the given coordinates.

    Args:
        lat: Latitude of the search centre.
        lng: Longitude of the search centre.
        string_addr: Keyword handed to the nearby search.
        radius: Radius forwarded to the nearby search.

    Returns:
        The first place of the search result, after ``get_details()`` has been
        called on it, or None when the search failed or found nothing.
    """
    google_places = GooglePlaces(api_key)
    query_result = get_list_loc(google_places, lat, lng, string_addr, radius)
    # get_list_loc returns None when the request fails; without this check the
    # ``.places`` access below would raise AttributeError.
    if query_result is None or not query_result.places:
        return None
    place = query_result.places[0]
    place.get_details()
    return place

def get_rating(string_addr):
    """Return the rating of the first place found at an address.

    Args:
        string_addr: Text that is both geocoded and used as the search keyword.

    Returns:
        The ``rating`` attribute of the matched place, or None when the
        address cannot be geocoded or no place is found.
    """
    # get_lat_lng signals failure by returning None; test for that explicitly
    # rather than relying on a bare ``except:`` around the tuple unpacking.
    coords = get_lat_lng(string_addr)
    if coords is None:
        return None
    lat, lng = coords
    place = process(lat, lng, string_addr)
    if place:
        return place.rating
    return None


def get_nearest(string_addr, nearest):
    """Return the distance in metres from an address to the nearest match.

    Args:
        string_addr: Address to measure from; it is geocoded first.
        nearest: Search keyword for the kind of place to find, e.g. "hospital".

    Returns:
        The distance computed by ``get_distance`` between the address and the
        first matching place, or None when the address cannot be geocoded or
        no place is found.
    """
    # get_lat_lng signals failure by returning None; test for that explicitly
    # rather than relying on a bare ``except:`` around the tuple unpacking.
    coords = get_lat_lng(string_addr)
    if coords is None:
        return None
    lat, lng = coords
    nearest_item = process(lat, lng, nearest)
    if not nearest_item:
        return None
    destination = nearest_item.geo_location
    return get_distance(lat, lng, destination['lat'], destination['lng'])
    

def get_list_loc(google_places, lat, lng, string_addr, radius):
    """Run a distance-ranked nearby search around a coordinate.

    Args:
        google_places: Client object exposing ``nearby_search``.
        lat: Latitude of the search centre.
        lng: Longitude of the search centre.
        string_addr: Keyword to search for.
        radius: Radius passed through to ``nearby_search``.

    Returns:
        Whatever ``nearby_search`` returns, or None when the call raises.
    """
    try:
        return google_places.nearby_search(
            lat_lng={'lat': lat, 'lng': lng},
            keyword=string_addr,
            radius=radius,
            rankby="distance",
        )
    except Exception:
        # Best-effort: a failed search means "nothing found". Catching
        # Exception rather than using a bare ``except:`` keeps
        # KeyboardInterrupt and SystemExit working.
        return None

def get_distance(lat, lng, dest_lat, dest_lng):
    """Return the haversine distance, in metres, between two coordinates.

    ``(lat, lng)`` is the origin and ``(dest_lat, dest_lng)`` the destination.
    """
    origin = (lat, lng)
    destination = (dest_lat, dest_lng)
    return haversine(origin, destination, unit=Unit.METERS)

def get_region(lat, lng):
    """Return the neighbourhood code of the polygon containing a coordinate.

    Every row of ``df_hoods`` is tested in order.

    Args:
        lat: Latitude of the point.
        lng: Longitude of the point.

    Returns:
        ``AREA_SHORT_CODE`` of the first containing neighbourhood as an int,
        or None when no polygon contains the point.
    """
    # Imported here so the block does not depend on the file's import cell.
    import ast

    # The point is built with longitude first, then latitude.
    pnt = Point(lng, lat)
    for _, hood in df_hoods.iterrows():
        # The geometry column holds the polygon as text. It comes from a remote
        # download, so it is parsed as a literal instead of being executed
        # with eval().
        poly = shape(ast.literal_eval(hood['geometry']))
        if poly.contains(pnt):
            return int(hood.AREA_SHORT_CODE)
    return None

In [None]:
## Generate Nearest to metrics:
# Rating of the agency itself, searched by "<agency name> <address>".
df_main["rating"] = df_main.apply(
    lambda row: get_rating(row['AgencyName'] + " " + row["Address"]),
    axis=1,
)

# Output column -> search keyword for each distance-to-nearest metric.
nearest_queries = {
    "hospital": "hospital",
    "busStop": "bus station",
    "subwayStop": "subway station",
    "shelters": "homeless shelter",
}
for column, keyword in nearest_queries.items():
    # Bind keyword as a default argument so each lambda keeps its own value.
    df_main[column] = df_main.apply(
        lambda row, keyword=keyword: get_nearest(row["Address"], keyword),
        axis=1,
    )

In [None]:
# Add some lat, lng fields
# "LL" holds whatever get_lat_lng returns for the row's address: a (lat, lng)
# tuple, or None when geocoding produced nothing.
df_main["LL"] = df_main.apply(lambda row: get_lat_lng(row["Address"]), axis=1)

In [None]:
# Get the region of all lat, lng fields in main df
# get_lat_lng returns None for addresses it could not geocode; those rows get
# no region id instead of failing on None[0].
df_main['id'] = df_main.apply(
    lambda row: get_region(row["LL"][0], row["LL"][1])
    if isinstance(row["LL"], tuple)
    else None,
    axis=1,
)

In [None]:
## Merge regional data into the main df:
# Left join on the shared "id" key: every record row is kept and the regional
# attributes of its neighbourhood are attached.
df_main = pd.merge(df_main, df_region, how='left', on="id")

In [None]:
# Replace empty or 0 ratings with NA
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection, which is not guaranteed to update df_main itself.
df_main['rating'] = df_main['rating'].replace(['', 0], np.nan)
df_main = df_main.dropna(how='any')  # remove those NA fields

In [None]:
# Display
# Reload Colab's data_table extension, then show the final frame as the cell output.
%reload_ext google.colab.data_table
df_main

Unnamed: 0,AgencyName,Address,Neighbourhood_x,type,rating,hospital,busStop,subwayStop,shelters,LL,id,Neighbourhood_y,City Grants Funding $,Neighbourhood Equity Score,Salvation Army Donors,Walk Score,Watermain Breaks,density,income,Units,subsidized,BreakandEnter_2019,Homicide_2019,TheftOver_2019
0,Toronto Council Fire Native Cultural Centre,"439 Dundas St E\nToronto, ON M5A 2B1","Moss Park, 73",Disability support and services,4.6,892.275700,29.565170,16.749336,159.448646,"(43.659154, -79.3661795)",73,Moss Park,526252,50.11,317,95,6,14753,17025,3399.0,2926.0,234,3,25
1,Native Child and Family Services of Toronto,"179 Dowling Ave\nToronto, ON M6K 3B2","South Parkdale, 85",Disability support and services,4,877.944346,127.483561,166.737903,2286.731789,"(43.639268, -79.4392481)",85,South Parkdale,196602,33.1,204,83,5,9583,18595,1600.0,1011.0,89,1,22
2,Native Canadian Centre of Toronto,"16 Spadina Rd\nToronto, ON M5R 2S7","Annex, 95",Disability support and services,4.6,1552.735926,81.102020,129.163926,1693.825102,"(43.6681907, -79.4049466)",95,Annex,553355,78.6,1401,94,11,10863,26295,1436.0,830.0,212,1,42
3,Native Child and Family Services of Toronto,"30 College St\nToronto, ON M5G 1K2","Bay Street Corridor, 76",Disability support and services,3.8,261.999038,104.630671,116.599809,331.479337,"(43.66135449999999, -79.3845235)",76,Bay Street Corridor,2893324,77.07,1576,99,8,14097,23945,754.0,313.0,206,3,73
4,Miziwe Biik Aboriginal Employment and Training,"167 Gerrard St E\nToronto, ON M5A 2E4","Moss Park, 73",Disability support and services,4.6,452.409461,144.205396,704.109089,66.829661,"(43.6606612, -79.37388150000001)",73,Moss Park,526252,50.11,317,95,6,14753,17025,3399.0,2926.0,234,3,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1524,YMCA House,7 Vanauley Street,Toronto,SHELTER,3.3,776.394846,490.723608,129.116932,75.141010,"(43.6488364, -79.3982293)",78,Kensington-Chinatown,1.0544e+06,50.7,210,97,1,11806,16150,2064.0,1643.0,278,0,37
1525,YMCA Sprott House,21 Walmer Rd.,Toronto,SHELTER,4.3,1561.813360,131.537920,148.738290,1747.254561,"(43.6678959, -79.4055245)",95,Annex,553355,78.6,1401,94,11,10863,26295,1436.0,830.0,212,1,42
1526,YWCA - First Stop Woodlawn,80 Woodlawn Ave. East,Toronto,SHELTER,2,2559.681082,216.768964,276.556507,19.698942,"(43.6846797, -79.3898217)",98,Rosedale-Moore Park,38672,83.78,2101,84,8,4500,18000,520.0,99.0,99,0,14
1527,Youth Without Shelter,6 Warrendale Court,Etobicoke,SHELTER,4.3,1642.977489,146.993687,146.993687,0.000000,"(43.7362929, -79.58042689999999)",2,Mount Olive-Silverstone-Jamestown,238172,29.29,222,61,8,7291,25740,1288.0,1181.0,36,3,9
