## Importing our sunshine list data

In [30]:
import pandas as pd
import re
from sklearn.tree import DecisionTreeClassifier
from zipfile import ZipFile
import glob

In [32]:
all_files = glob.glob("../../raw/genderpaygap/*.csv")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0, encoding="utf-8", encoding_errors='ignore')
    df.columns = df.columns.str.lower()
    df = df.rename(columns={"calendar year": "year", "salary paid": "salary", "taxable benefits": "benefits"})
    df["year"] = int(re.search("[0-9]{4}", filename, re.IGNORECASE).group(0))
    li.append(df)

raw = pd.concat(li, axis=0, ignore_index=True)

raw.sample(5)

Unnamed: 0,sector,last name,first name,salary,benefits,employer,job title,year,_docid
49270,Municipalities and Services,SCOTT,GORDON,114120.01,717.06,City of Toronto - Police Service,Detective,2012,
554269,Government of Ontario - Ministries,Maceachern,Jeffery,"$107,721.76",$142.56,Community Safety and Correctional Services / S...,Investigator / Enquêteur,2017,
648926,School Boards,Calligan,Marlene,"$103,724.33",$0.00,Toronto District School Board,Vice Principal Elementary,2017,
1184179,Universities,Liang,Ben,"$205,998.06",$151.08,University Of Toronto,Professor of Electrical and Computer Engineering,2020,
1368903,School Boards,Huot,Paul,102432.4,86.73,Greater Essex County District School Board,Elementary Teacher,2021,174188.0


In [33]:
data = raw.copy()

data.columns = data.columns.str.lower()

for label, content in data[["sector", "last name", "first name", "employer", "job title"]].items():
    data[label] = (data[label]
                        .str.upper()
                        .str.replace("\s(AND){1}\s", " & ", regex=True)
                        .str.replace("\-", "–", regex=True)
                        .str.replace("*", "", regex=False)
                )
    
data["salary"] = (data["salary"]
                  .astype(str)
                  .str.replace("\$|,", "", regex=True)
                  .astype(float)
                  )
    
data.sample(5)

Unnamed: 0,sector,last name,first name,salary,benefits,employer,job title,year,_docid
1423263,UNIVERSITIES,GHARABAGHI,BAHRAM,176574.74,805.84,UNIVERSITY OF GUELPH,PROFESSOR & ASSOCIATE DIRECTOR,2021,228548.0
1246024,HOSPITALS & BOARDS OF PUBLIC HEALTH,HULSMAN,DAVID,105570.75,477.98,HALTON HEALTHCARE SERVICES CORPORATION,BUSINESS OPERATIONS CONSULTANT,2021,51309.0
39220,MUNICIPALITIES & SERVICES,YETMAN,BRIAN,113970.98,831.14,CITY OF CAMBRIDGE,"CAPTAIN, FIRE FIGHTER",2012,
779227,SCHOOL BOARDS,FAWCETT,KAY,121815.52,$ 516.01,BLUEWATER DISTRICT SCHOOL BOARD,SECONDARY PRINCIPAL,2018,
1151940,SCHOOL BOARDS,LLOYD,ERIKA,111646.08,$0.00,TORONTO DISTRICT SCHOOL BOARD,VICE PRINCIPAL ELEMENTARY,2020,


In [34]:
data["first_name_cleaned"] = (data["first name"]
                              .str.replace("\s+[A-Z]+\.*", "", regex=True)
                              .str.upper()
                              .dropna()
                              )

## Training the model

In [35]:
li = []

file = ZipFile('data/names.zip')

for i in file.namelist():
       df = pd.read_csv(file.open(i), header=None)
       df.columns = ["Name", "Gender", "Count"]
       year = re.search("[0-9]{4}", i)[0]
       df["Year"] = year
       li.append(df)
       

names = pd.concat(li)
names = names.loc[names["Year"] == "2020"]

names.head()

Unnamed: 0,Name,Gender,Count,Year
0,Olivia,F,17535,2020
1,Emma,F,15581,2020
2,Ava,F,13084,2020
3,Charlotte,F,13003,2020
4,Sophia,F,12976,2020


In [44]:
pivot = names.pivot(columns="Gender", index="Name", values="Count").fillna(0)
pivot = pivot[pivot.sum(axis=1) > 50]

pivot["chance_male"] = pivot["M"] / (pivot.sum(axis=1))
pivot = pivot.sort_values("chance_male", ascending=False)

pivot.loc[pivot["chance_male"] >= 0.6, "gender"] = "MALE"
pivot.loc[pivot["chance_male"] <= 0.4, "gender"] = "FEMALE"
pivot = pivot.fillna("UNKNOWN").loc[:, ["gender"]].reset_index()

pivot

Gender,Name,gender
0,Aaden,MALE
1,Jairo,MALE
2,Jakhi,MALE
3,Jakob,MALE
4,Jakobe,MALE
...,...,...
5447,Klani,FEMALE
5448,Klara,FEMALE
5449,Ashly,FEMALE
5450,Ashanti,FEMALE
