# Ontario sunshine list 2020

In [1]:
import json
import os

import numpy as np
import pandas as pd
import requests

# Render floats with thousands separators (e.g. 1,234,567.5) in displayed frames.
pd.set_option('display.float_format', '{:,}'.format)

In [129]:
# Load the 2020 disclosure CSV. The .str calls further down rely on Salary and
# Benefits having been read as text.
raw = pd.read_csv('../raw/RAW 2020 ONTARIO SUNSHINE LIST.csv', encoding="latin-1", thousands=',')

raw["Sector"] = raw["Sector"].str.strip()
raw["Name"] = raw["First Name"] + " " + raw["Last Name"]

# Turn the money columns into floats by removing "$" and "," as literal
# characters. regex=False is passed on both calls: "$" is a regex
# metacharacter, and relying on the implicit default behaves differently
# across pandas versions (older releases warn about it).
for money_col in ("Salary", "Benefits"):
    raw[money_col] = (
        raw[money_col]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

# Drop every sector whose name contains "Second". na=False keeps the mask
# strictly boolean, so a missing Sector keeps its row instead of making `~` fail.
raw = raw[~raw["Sector"].str.contains("Second", na=False)]

# Highest earners first.
raw = raw.sort_values('Salary', ascending=False)

# Binning into brackets
min_value = raw['Salary'].min()
max_value = raw['Salary'].max()

# Five evenly spaced edges between the lowest and highest salary, each rounded
# to the nearest 10,000.
bins = np.round(np.linspace(min_value, max_value, 5), -4)
# Rounding to the nearest 10,000 could push the first edge above the lowest
# salary and leave those rows without a bracket, so the first edge is rounded
# down instead.
bins[0] = np.floor(min_value / 10000) * 10000
# Fixed 5,000,000 top edge: the highest salary still falls in a bracket even
# when the rounded maximum ends up below it.
bins = np.append(bins, 5000000)

# One label per pair of consecutive edges (len(bins) - 1 labels), built
# directly instead of indexing past the end of `bins` and swallowing the error.
labels = [
    "$" + str(round(lower)) + " - $" + str(round(upper))
    for lower, upper in zip(bins[:-1], bins[1:])
]
print(labels)

# include_lowest=True makes the first bracket closed on the left, so a salary
# equal to the lowest edge is binned.
raw["Earnings Bracket"] = pd.cut(raw['Salary'], bins=bins, labels=labels, include_lowest=True)

display(raw)

  raw["Salary"] = raw["Salary"].str.replace(pat="$", repl="").str.replace(pat=",", repl="", regex=False).astype(float)
  raw["Benefits"] = raw["Benefits"].str.replace(pat="$", repl="").str.replace(pat=",", repl="", regex=False).astype(float)


['$100000 - $380000', '$380000 - $660000', '$660000 - $950000', '$950000 - $1230000', '$1230000 - $5000000']


Unnamed: 0,Sector,Last Name,First Name,Salary,Benefits,Employer,Job Title,Year,Name,Earnings Bracket
126331,Ontario Power Generation,Hartwick,Kenneth,1228517.0,7539.84,Ontario Power Generation,President and Chief Executive Officer,2020,Kenneth Hartwick,$950000 - $1230000
128457,Ontario Power Generation,Miniere,Dominique,1131767.04,3922.56,Ontario Power Generation,"President, Nuclear",2020,Dominique Miniere,$950000 - $1230000
126080,Ontario Power Generation,Granville,Sean,901606.32,2450.88,Ontario Power Generation,Chief Operations Officer and Chief Nuclear Off...,2020,Sean Granville,$660000 - $950000
128014,Ontario Power Generation,Martelli,Michael,850332.27,3499.2,Ontario Power Generation,Chief Projects Officer,2020,Michael Martelli,$660000 - $950000
65303,Hospitals & Boards of Public Health,Smith,Kevin,844992.2,87502.71,University Health Network,President and Chief Executive Officer,2020,Kevin Smith,$660000 - $950000
...,...,...,...,...,...,...,...,...,...,...
189456,Universities,Elrayes,Maged,100000.0,0.0,University Of Western Ontario,Clinical Fellow,2020,Maged Elrayes,$100000 - $380000
134869,Other Public Sector Employers,Hyslop,Carolyn,100000.0,0.0,Canadian Canoe Museum,Executive Director,2020,Carolyn Hyslop,$100000 - $380000
135149,Other Public Sector Employers,Karn,Denyse,100000.0,830.52,Toronto Festival Of Arts Culture And Creativity,"Director, Production",2020,Denyse Karn,$100000 - $380000
137936,Other Public Sector Employers,Rubino,Rosaleen,100000.0,0.0,Rosaleen Rubino Physiotherapy Professional Cor...,Director,2020,Rosaleen Rubino,$100000 - $380000


## Binning data

In [130]:
# Number of people in each earnings bracket (rows) for every sector (columns).
bracket_counts = raw.pivot_table(
    values="Name",
    index="Earnings Bracket",
    columns="Sector",
    aggfunc='count',
)

# Same table with each sector's counts expressed as a percentage of that
# sector's total, rounded to two decimals.
percent_counts = pd.DataFrame()
for sector in bracket_counts.columns:
    sector_counts = bracket_counts[sector]
    sector_share = sector_counts / sector_counts.sum()
    percent_counts[sector] = (sector_share * 100).round(2)

display(percent_counts)
# The raw counts (not the percentages) are what gets copied to the clipboard.
bracket_counts.to_clipboard()

Unnamed: 0_level_0,Colleges,Crown Agencies,Government of Ontario - Judiciary,Government of Ontario - Legislative Assembly and Offices,Government of Ontario - Ministries,Hospitals & Boards of Public Health,Municipalities & Services,Ontario Power Generation,Other Public Sector Employers,School Boards,Universities
Earnings Bracket,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
$100000 - $380000,99.95,99.6,100.0,100.0,99.93,99.57,99.99,99.71,99.73,100.0,99.65
$380000 - $660000,0.05,0.36,0.0,0.0,0.07,0.4,0.01,0.22,0.27,0.0,0.35
$660000 - $950000,0.0,0.03,0.0,0.0,0.0,0.03,0.0,0.05,0.0,0.0,0.0
$950000 - $1230000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0
$1230000 - $5000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [131]:
# Export one worksheet per sector to exports/sunshine_list.xlsx.
categories = raw["Sector"].unique()
sheets = []

for category in categories:
    sector_rows = raw[raw["Sector"] == category]
    # .assign() builds a new frame, so the combined column is never written
    # into a filtered slice of `raw` (this is what raised SettingWithCopyWarning).
    sector_sheet = sector_rows.assign(
        **{"Name/Title": "**" + sector_rows["Name"] + "**<br>" + sector_rows["Job Title"]}
    )
    sheets.append(sector_sheet[["Name/Title", "Employer", "Salary", "Benefits"]])

# The context manager saves and closes the workbook on exit, also when a write
# fails part-way; ExcelWriter.save() no longer exists in pandas 2.x.
with pd.ExcelWriter('exports/sunshine_list.xlsx') as writer:
    for category, sheet in zip(categories, sheets):
        # Sheet names are cut to 30 characters, as before.
        sheet.to_excel(writer, index=False, sheet_name=f'{category}'[:30])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Name/Title"] = "**" + df["Name"] + "**<br>" + df["Job Title"]


## Gender analysis

Bring in a list of 250,000+ first-name records, each labelled male or female, so we can estimate a likely gender from each employee's first name.

In [153]:
# Download the name/gender records from the Back4App API into a DataFrame.
url = 'https://parseapi.back4app.com/classes/Complete_List_Names?limit=500000'

# NOTE(review): the credentials can now be supplied through the environment.
# The literals are kept only as fallbacks so the notebook still runs as before;
# a master key should not live in a shared notebook, so prefer setting the
# variables and removing the literals.
headers = {
    'X-Parse-Application-Id': os.environ.get(
        'BACK4APP_APPLICATION_ID', 'zsSkPsDYTc2hmphLjjs9hz2Q3EXmnSxUyXnouj1I'
    ),
    'X-Parse-Master-Key': os.environ.get(
        'BACK4APP_MASTER_KEY', '4LuCXgPPXXO2sU5cXm6WwpwzaKyZpo3Wpj4G4xXK'
    ),
}

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces an HTTP error here rather than as a confusing
# KeyError on "results" below.
response = requests.get(url, headers=headers, timeout=120)
response.raise_for_status()
data = response.json()

# "results" is a list of records; build the frame from it directly instead of
# re-serialising to a JSON string and parsing that again.
names = pd.DataFrame(data["results"])

display(names)

Unnamed: 0,objectId,Name,Gender,createdAt,updatedAt
0,lSxg9sIUv9,Will,male,2020-01-23T23:31:09.261Z,2020-01-23T23:31:09.261Z
1,Ypp4vpokki,James,male,2020-01-23T23:31:09.241Z,2020-01-23T23:31:09.241Z
2,tZYFQ1Yjyj,Samuel,male,2020-01-23T23:31:09.260Z,2020-01-23T23:31:09.260Z
3,LxQmByBQ1y,John,male,2020-01-23T23:31:09.240Z,2020-01-23T23:31:09.240Z
4,2S3LArF0dw,George,male,2020-01-23T23:31:09.241Z,2020-01-23T23:31:09.241Z
...,...,...,...,...,...
257995,kAzsJnXLBn,Diya,female,2020-01-23T23:43:40.706Z,2020-01-23T23:43:40.706Z
257996,1Do0BIVl3n,Kenley,female,2020-01-23T23:43:40.706Z,2020-01-23T23:43:40.706Z
257997,X5boZIpAnZ,Elianna,female,2020-01-23T23:43:40.706Z,2020-01-23T23:43:40.706Z
257998,u6izCGBJUz,Iyana,female,2020-01-23T23:43:40.706Z,2020-01-23T23:43:40.706Z


In [192]:
# How often each (Name, Gender) pair occurs in the downloaded list.
pair_counts = names.value_counts(subset=["Name", "Gender"]).reset_index()

# One row per name, one column per gender; a missing combination counts as 0.
name_count = pair_counts.pivot(index="Name", columns="Gender").fillna(0)
# The pivot leaves a two-level column header; keep only the gender level.
name_count.columns = name_count.columns.droplevel(0)

# Share of a name's records that are labelled male.
male_records = name_count["male"]
all_records = name_count["male"] + name_count["female"]
name_count["chance_male"] = male_records / all_records

# A name is called Male at >= 65% male records, Female at <= 35%, and is left
# unclassified (pd.NA) in between.
mostly_male = name_count['chance_male'] >= 0.65
mostly_female = name_count['chance_male'] <= 0.35
name_count["Gender"] = np.where(
    mostly_male,
    'Male',
    np.where(mostly_female, 'Female', pd.NA),
)

name_count.to_clipboard()

# Lookup table with just the name and its assigned gender.
gender_list = name_count[["Gender"]].reset_index()


display(gender_list)

Gender,Name,Gender.1
0,Aaden,Male
1,Aaliyah,Female
2,Aarav,Male
3,Aaron,Male
4,Ab,Male
...,...,...
6777,Zula,Female
6778,malece,Male
6779,maled,Male
6780,malesie,Male


In [194]:
# Columns kept in the final table. Both merged frames carry a "Name" column,
# so the full name from `raw` comes out of the merge as "Name_x".
output_columns = ["Sector", "Name_x", "Job Title", "Employer", "Salary", "Benefits", "Earnings Bracket", "Gender"]

# Left join on first name: every salary row is kept, and rows without a match
# in the name list get a missing Gender, which is then shown as an empty string.
with_gender = (
    raw.merge(gender_list, how="left", left_on='First Name', right_on="Name")
    .drop_duplicates()
    .loc[:, output_columns]
    .assign(Gender=lambda frame: frame["Gender"].fillna(""))
)

display(with_gender)
with_gender.to_clipboard()


Unnamed: 0,Sector,Name_x,Job Title,Employer,Salary,Benefits,Earnings Bracket,Gender
0,Ontario Power Generation,Kenneth Hartwick,President and Chief Executive Officer,Ontario Power Generation,1228517.0,7539.84,$950000 - $1230000,Male
1,Ontario Power Generation,Dominique Miniere,"President, Nuclear",Ontario Power Generation,1131767.04,3922.56,$950000 - $1230000,
2,Ontario Power Generation,Sean Granville,Chief Operations Officer and Chief Nuclear Off...,Ontario Power Generation,901606.32,2450.88,$660000 - $950000,Male
3,Ontario Power Generation,Michael Martelli,Chief Projects Officer,Ontario Power Generation,850332.27,3499.2,$660000 - $950000,Male
4,Hospitals & Boards of Public Health,Kevin Smith,President and Chief Executive Officer,University Health Network,844992.2,87502.71,$660000 - $950000,Male
...,...,...,...,...,...,...,...,...
205714,Universities,Maged Elrayes,Clinical Fellow,University Of Western Ontario,100000.0,0.0,$100000 - $380000,
205715,Other Public Sector Employers,Carolyn Hyslop,Executive Director,Canadian Canoe Museum,100000.0,0.0,$100000 - $380000,Female
205716,Other Public Sector Employers,Denyse Karn,"Director, Production",Toronto Festival Of Arts Culture And Creativity,100000.0,830.52,$100000 - $380000,
205717,Other Public Sector Employers,Rosaleen Rubino,Director,Rosaleen Rubino Physiotherapy Professional Cor...,100000.0,0.0,$100000 - $380000,
