In [1]:
import pandas as pd
import requests
import numpy as np

In [2]:
# Build a one-dimensional, labelled Series from a {label: value} mapping;
# the mapping's insertion order becomes the index order
s = pd.Series({'a': 5, 'b': -15, 'c': 10, 'd': 20})
s

a     5
b   -15
c    10
d    20
dtype: int64

In [3]:
# Build a small two-dimensional table (DataFrame) from a dict of equal-length
# lists: each key becomes a column, each list position becomes a row.
# NOTE(review): the Population figures look like capital-city counts rather
# than provincial totals -- confirm against the data source.
data = {
    'Province': ['Ontario', 'Manitoba', 'British Columbia'],
    'Capital': ['Toronto', 'Winnipeg', 'Victoria'],
    'Population': [2731571, 749534, 92141],
}

# Column order follows the dict's key order: Province, Capital, Population
census = pd.DataFrame(data, columns=list(data))
census.head()

Unnamed: 0,Province,Capital,Population
0,Ontario,Toronto,2731571
1,Manitoba,Winnipeg,749534
2,British Columbia,Victoria,92141


In [4]:
# Take a positional row slice: everything from the second row onward
census.iloc[1:]

Unnamed: 0,Province,Capital,Population
1,Manitoba,Winnipeg,749534
2,British Columbia,Victoria,92141


In [5]:
# Boolean-mask filtering: keep only the rows whose Population exceeds 100,000
census.loc[census['Population'].gt(100000)]

Unnamed: 0,Province,Capital,Population
0,Ontario,Toronto,2731571
1,Manitoba,Winnipeg,749534


In [6]:
# Return a copy of the rows ordered alphabetically (A -> Z) by Province;
# the original row labels are kept, so the index shows the reordering
census.sort_values('Province', ascending=True)

Unnamed: 0,Province,Capital,Population
2,British Columbia,Victoria,92141
1,Manitoba,Winnipeg,749534
0,Ontario,Toronto,2731571


In [7]:
# Rank the values within each column independently (1.0 = smallest);
# text columns are ranked alphabetically and ties would share their average rank
census.rank(axis=0, method='average')

Unnamed: 0,Province,Capital,Population
0,3.0,1.0,3.0
1,2.0,3.0,2.0
2,1.0,2.0,1.0


In [8]:
# Print a structural summary of the dataframe: the index range, each column's
# non-null count and dtype, and the approximate memory usage
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Province    3 non-null      object
 1   Capital     3 non-null      object
 2   Population  3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [9]:
# Column-wise minimum: text columns are compared alphabetically, numeric
# columns numerically, so the result mixes strings and a number (dtype: object)
census.min()
# For the column-wise maximum, call census.max() instead

Province      British Columbia
Capital                Toronto
Population               92141
dtype: object

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max); only the numeric
# column (Population) is summarised, the text columns are left out
census.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Population
count,3.0
mean,1191082.0
std,1373998.0
min,92141.0
25%,420837.5
50%,749534.0
75%,1740552.0
max,2731571.0


In [11]:
# Get the dataset metadata by passing the package id to the package_show endpoint
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
params = { "id": "99ff3657-b2e7-4005-a6fd-c36838ccc96d"}
# A timeout keeps the cell from hanging forever if the portal is unreachable
response = requests.get(url, params = params, timeout = 30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON
response.raise_for_status()
package = response.json()
# The metadata lives under package["result"]; its "resources" list is what the
# data-loading step iterates over

In [12]:
# Get the data by passing the resource id to the datastore_search endpoint.
# Only the FIRST datastore-active resource of the package is loaded.
# NOTE(review): datastore_search is paginated (CKAN documents a default limit
# of 100 rows), so `df` likely holds only the first page -- pass "limit" /
# "offset" in the params if the full table is needed.
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/datastore_search"
df = None
for resource in package["result"]["resources"]:
    if not resource["datastore_active"]:
        continue
    p = { "id": resource["id"] }
    # A timeout keeps the cell from hanging forever if the portal is unreachable
    response = requests.get(url, params = p, timeout = 30)
    # Surface HTTP failures here rather than as a confusing KeyError below
    response.raise_for_status()
    # NOTE: this rebinds `data` (earlier the census dict) to the JSON payload
    data = response.json()
    df = pd.DataFrame(data["result"]["records"])
    break

# Without this guard a package with no datastore-active resource would leave
# `df` unset and the failure would only show up in a later cell
if df is None:
    raise ValueError("package has no datastore-active resource to load")

In [13]:
# Preview the first five records loaded from the datastore
df.head(n=5)

Unnamed: 0,_id,School Name,Enrolled population,DTP coverage rate (%),DTP Religious exemption rate (%),MMR coverage rate (%),MMR Religious exemption rate (%),Lat,Lng
0,1,A Y JACKSON SECONDARY SCHOOL,1070,91.0,1.0,95.7,1.0,43.805261,-79.366555
1,2,ACADEMIE ALEXANDRE-DUMAS,110,88.2,1.8,90.0,1.8,43.762419,-79.179765
2,3,ADAM BECK JUNIOR PUBLIC SCHOOL,247,91.9,3.6,94.3,3.6,43.683152,-79.288488
3,4,AFRICENTRIC ALTERNATIVE SCHOOL,66,71.2,19.7,72.7,19.7,43.745424,-79.488261
4,5,AGINCOURT COLLEGIATE INSTITUTE,1253,90.7,1.0,97.8,1.0,43.788874,-79.27891


## For everything else, use a cheatsheet: 
https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf