In [1]:
# Dependencies
import pandas as pd
import requests
from pprint import pprint

In [2]:
# Base url for all toronto open data
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"

# Datasets are called "packages". Each package can contain many "resources".
# To retrieve the metadata for this package and its resources, use the package name in this page's URL:
url = base_url + "/api/3/action/package_show"
p = {"id": "major-crime-indicators"}
# get package resources
# raise_for_status makes an HTTP error fail here instead of surfacing later as a confusing KeyError,
# and the timeout keeps a stalled connection from hanging the notebook
response = requests.get(url, params = p, timeout = 30)
response.raise_for_status()
package = response.json()
# Look at resources and collect the ids of those that are datastore_active
active_ids = [
    resource["id"]
    for resource in package["result"]["resources"]
    if resource.get("datastore_active")
]
# Stop with a clear message when nothing qualifies; otherwise resource_id would be
# undefined (or a stale value left in the kernel by an earlier run of this cell)
if not active_ids:
    raise LookupError(f"No datastore_active resource found in package {p['id']!r}")
# If several resources qualify, the last one listed is used
resource_id = active_ids[-1]
# using that id and limit 0, get just the metadata first to find total items
# based on this algorithm:
# https://support.smartbear.com/qacomplete/docs/developer/api/rest/api/reference/paging.html
limit = 0

# build url for retrieve data
url = base_url + "/api/3/action/datastore_search"
# include resource id and limit in url parameters
p = {
    'id': resource_id,
    'limit': limit
}
# get metadata, and pprint it to find total items
response = requests.get(url, params = p, timeout = 30)
response.raise_for_status()
metadata = response.json()
pprint(metadata)

{'help': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/help_show?name=datastore_search',
 'result': {'_links': {'next': '/api/3/action/datastore_search?id=af452875-cfdd-4596-a08a-7b93b65ea4f0&limit=0&offset=0',
                       'start': '/api/3/action/datastore_search?id=af452875-cfdd-4596-a08a-7b93b65ea4f0&limit=0'},
            'fields': [{'id': '_id', 'type': 'int'},
                       {'id': 'EVENT_UNIQUE_ID',
                        'info': {'notes': 'Offence Number'},
                        'type': 'text'},
                       {'id': 'REPORT_DATE',
                        'info': {'notes': 'Date Offence was Reported'},
                        'type': 'date'},
                       {'id': 'OCC_DATE',
                        'info': {'notes': 'Date of Offence'},
                        'type': 'date'},
                       {'id': 'REPORT_YEAR',
                        'info': {'notes': 'Year Offence was Reported'},
                        'type': 'f

In [3]:
# Layout of the 'result' item in the response:
#   'total'   -> how many items the dataset holds
#   'fields'  -> one entry per column describing that column
#   'records' -> the actual data, as a list of dictionaries

# Keep the 'fields' description as a dataframe and save it to csv,
# so it can be consulted later without sending another request
field_descriptions = metadata['result']['fields']
col_data = pd.DataFrame(field_descriptions)
col_data.to_csv(
    'resources/column_metadata.csv',
    header = True,
    index = False,
)
col_data

Unnamed: 0,id,type,info
0,_id,int,
1,EVENT_UNIQUE_ID,text,{'notes': 'Offence Number'}
2,REPORT_DATE,date,{'notes': 'Date Offence was Reported'}
3,OCC_DATE,date,{'notes': 'Date of Offence'}
4,REPORT_YEAR,float8,{'notes': 'Year Offence was Reported'}
5,REPORT_MONTH,text,{'notes': 'Month Offence was Reported'}
6,REPORT_DAY,float8,{'notes': 'Day of the Month Offence was Report...
7,REPORT_DOY,float8,{'notes': 'Day of the Year Offence was Reported'}
8,REPORT_DOW,text,{'notes': 'Day of the Week Offence was Reported'}
9,REPORT_HOUR,float8,{'notes': 'Hour Offence was Reported'}


In [4]:



# The 'total' entry of the result section is the number of items in the dataset
result_section = metadata['result']
total_items = result_section['total']
total_items

323296

In [5]:
# Build the paging parameters from scratch so this cell can be re-run on its own.
# From documentation, we know the max limit is 32000, so we request pages of that size.
# offset starts at 0 and moves forward by the number of records actually received.
p = {
    'id': resource_id,
    'limit': 32000,
    'offset': 0
}

# create list to contain data
data_list = []

# number of requests sent so far
n = 0

# While offset is under total_items there are still records to fetch.
# The loop itself reaches the end of the dataset, so no extra request is needed afterwards.
while p['offset'] < total_items:
    n += 1
    # log number of request and parameters to be used
    print(f"request #{n} with parameters: {p}")
    # fail on an HTTP error rather than on a missing key, and never wait forever
    response = requests.get(url, params = p, timeout = 120)
    response.raise_for_status()
    # get data that is inside 'result' and 'records'
    data = response.json()['result']['records']
    # an empty page means the server has nothing more to give; stop instead of looping on
    if not data:
        break
    # extend data list
    data_list.extend(data)
    # advance by what was received, so a server-side page cap smaller than
    # the requested limit cannot make us skip records
    p['offset'] += len(data)

# Create dataframe with data
df = pd.DataFrame(data_list)
# print total length of dataframe, to confirm we got everything
print(f"\nThe total items in dataset is {total_items} and we got {len(df)}.")

request #1 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 0}


request #2 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 32000}
request #3 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 64000}
request #4 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 96000}
request #5 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 128000}
request #6 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 160000}
request #7 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 192000}
request #8 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 224000}
request #9 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 256000}
request #10 with parameters: {'id': 'af452875-cfdd-4596-a08a-7b93b65ea4f0', 'limit': 32000, 'offset': 288000}
request #11 with para

In [6]:
# Preview the first five rows of the downloaded data
df.head(n = 5)

Unnamed: 0,_id,EVENT_UNIQUE_ID,REPORT_DATE,OCC_DATE,REPORT_YEAR,REPORT_MONTH,REPORT_DAY,REPORT_DOY,REPORT_DOW,REPORT_HOUR,...,LOCATION_TYPE,PREMISES_TYPE,UCR_CODE,UCR_EXT,OFFENCE,MCI_CATEGORY,HOOD_158,NEIGHBOURHOOD_158,HOOD_140,NEIGHBOURHOOD_140
0,1,GO-20141262074,2014-01-01,1998-06-01,2014,January,1,1,Wednesday,12.0,...,"Apartment (Rooming House, Condo)",Apartment,1480,110,Administering Noxious Thing,Assault,38,Lansing-Westgate,38,Lansing-Westgate (38)
1,2,GO-20141260701,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,3.0,...,"Commercial Dwelling Unit (Hotel, Motel, B & B,...",Commercial,2120,200,B&E,Break and Enter,70,South Riverdale,70,South Riverdale (70)
2,3,GO-20141260889,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,4.0,...,"Apartment (Rooming House, Condo)",Apartment,1430,100,Assault,Assault,74,North St.James Town,74,North St.James Town (74)
3,4,GO-20141260973,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,4.0,...,"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,2130,210,Theft Over,Theft Over,NSA,NSA,NSA,NSA
4,5,GO-20141261050,2014-01-01,2014-01-01,2014,January,1,1,Wednesday,4.0,...,"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,1430,100,Assault,Assault,69,Blake-Jones,66,Danforth (66)


In [7]:
# Save the full dataset as csv: header row included, dataframe index left out
df.to_csv(
    'resources/mci_all.csv',
    header = True,
    index = False,
)
