In [74]:
import numpy as np
import math
from datascience import *
from scipy import stats

# Welcome to IAS-150's Data Science Module

Today we will be examing a data set that the UN produces every year, called the Gender Inequality Index. This is the UN's annual ranking of 188 countries in  terms of gender _inequality_. 

### Load in the UN Gender Inequality Index (GII) data from 2016
Note: the table has been modified slightly from its original format for ease of use. The original table can be found at: http://hdr.undp.org/en/composite/GII

In [75]:
data = Table.read_table('GII_data-numbers.csv')

## Clean Data:
##### (Only pay attention if you're interested)
Right now, all of the values that look like numbers are actually being stored as _strings_, which, in Python, are usually how ASCII characters are stored. Thus, you can't do normal mathematical operations on strings. We have to change these strings into _floats_, or floating-point decimals. That's what the code below does.

In [76]:
un = Table()
for label in data.labels:
    clean_col = make_array()
    for i in np.arange(len(data.column(label))):
        if data.column(label).item(i) == '..':
            clean_col = np.append(clean_col, np.nan)
        elif label == 'Country' or label == 'HDI rank':
            clean_col = np.append(clean_col, data.column(label).item(i))   
        else:
            clean_col = np.append(clean_col, float(data.column(label).item(i)))
    un.append_column(label, clean_col)
un

HDI rank,Country,Value (2015),Rank (2015),"Maternal mortality ratio (deaths per 100,000 live births)","Adolescent birth rate (births per 1,000 women ages 15–19)",Share of seats in parliament (% held by women),% Female population with at least some secondary education,% Male population with at least some secondary education,Female Labour force participation rate,Male Labour force participation rate
1,Norway,0.05,6,5,5.9,39.64,96.07,94.6,61.18,68.53
2,Australia,0.12,24,6,14.13,30.53,91.37,91.53,58.57,70.92
2,Switzerland,0.04,1,5,2.95,28.86,96.07,97.36,62.68,74.85
4,Germany,0.07,9,6,6.69,36.86,96.38,96.96,54.53,66.43
5,Denmark,0.04,2,6,4.04,37.43,89.08,98.53,58.04,66.16
5,Singapore,0.07,11,10,3.82,23.91,75.52,81.92,58.24,76.43
7,Netherlands,0.04,3,7,3.99,36.44,86.18,90.28,57.53,70.24
8,Ireland,0.13,26,8,10.43,19.91,86.76,82.22,52.38,67.82
9,Iceland,0.05,5,3,6.07,41.27,100.0,97.18,70.66,77.5
10,Canada,0.1,18,7,9.77,28.27,100.0,100.0,60.97,70.28


The 'Value' and 'Rank' columns describe how each country did last year (2015). As you can see, the lower the value, the higher the rank. Thus, in 2016, Norway had the lowest value, although the raw value for 2016 is not shown in this table.

In [77]:
# repeat process for just Asia data from prof. Change - might end up only using this in the end
data = Table.read_table('asia-numbers.csv')
asia = Table()
for label in data.labels:
    clean_col = make_array()
    for i in np.arange(len(data.column(label))):
        if data.column(label).item(i) == '..':
            clean_col = np.append(clean_col, np.nan)
        elif label == 'Country' or label == 'HDI rank':
            clean_col = np.append(clean_col, data.column(label).item(i))   
        else:
            clean_col = np.append(clean_col, float(data.column(label).item(i)))
    asia.append_column(label, clean_col)
asia
#table is already sorted by HDI rank. HDI != GII
# might need to add column 'unemployment rate (male:female ratio)' that is 1/unemployment (female:male ratio)
# same for sex ratio

Country,Overall HDI Rank (2015),Within Region HDI Rank (2015),"Maternal mortality ratio (deaths per 100,000 live births)","Adolescent birth rate (births per 1,000 women ages 15–19)",Share of seats in parliament (% held by women),% Female population with at least some secondary education,% Male population with at least some secondary education,Female Labour force participation rate,Male Labour force participation rate,Unemployment Rate (female to male ratio),Sex Ratio (male to female),"Suicide Rate (per 100,000 males)","Suicide Rate (per 100,000 females)",Mandatory paid maternity leave (days)
Korea (Republic of),10,1,11,1.62,16.33,88.84,94.56,50.01,71.82,0.94,1.07,41.7,18.0,90
Singapore,11,2,10,3.82,23.91,75.52,81.92,58.24,76.43,1.13,1.07,9.8,5.3,105
Cyprus,21,3,7,4.99,12.5,77.01,82.67,57.47,70.16,0.89,1.07,7.7,1.5,126
Japan,21,3,5,4.07,11.58,93.04,90.64,49.12,70.16,0.89,1.06,26.9,10.1,98
China,37,4,27,7.31,23.62,69.81,79.42,63.58,77.93,0.79,1.16,7.1,8.7,128
Kazakhstan,42,5,12,27.89,20.13,99.68,100.0,66.07,76.98,1.34,1.06,40.6,9.3,126
United Arab Emirates,46,6,6,29.66,22.5,77.42,64.49,41.9,91.56,3.13,1.05,3.9,1.7,45
Bahrain,48,7,15,13.48,15.0,61.58,55.56,39.2,85.39,8.28,1.04,11.6,2.9,60
Saudi Arabia,50,8,12,8.82,19.87,63.31,72.13,20.06,79.13,6.89,1.03,0.6,0.2,70
Mongolia,53,9,44,15.67,14.47,89.67,85.83,56.47,68.79,1.04,1.03,16.3,3.7,120


### Note: 
The index we are interested in is NOT shown in this table.

### How the GII is calculated:
![](gii_breakdown.png)

The Gender Inequality Index (GII) reflects gender-based disadvantage in three dimensions—reproductive health, empowerment and the labour market—for as many countries as data of reasonable quality allow. It shows the loss in potential human development due to inequality between female and male achievements in these dimensions. It ranges from 0, where women and men fare equally, to 1, where one gender fares as poorly as possible in all measured dimensions. (taken from UNDP technical notes)

In [78]:
np.mean(un.column(5))

nan

In [79]:
np.nanmean(un.column(5))

47.867595628415302

In [80]:
un.show()

HDI rank,Country,Value (2015),Rank (2015),"Maternal mortality ratio (deaths per 100,000 live births)","Adolescent birth rate (births per 1,000 women ages 15–19)",Share of seats in parliament (% held by women),% Female population with at least some secondary education,% Male population with at least some secondary education,Female Labour force participation rate,Male Labour force participation rate
1,Norway,0.05,6.0,5.0,5.9,39.64,96.07,94.6,61.18,68.53
2,Australia,0.12,24.0,6.0,14.13,30.53,91.37,91.53,58.57,70.92
2,Switzerland,0.04,1.0,5.0,2.95,28.86,96.07,97.36,62.68,74.85
4,Germany,0.07,9.0,6.0,6.69,36.86,96.38,96.96,54.53,66.43
5,Denmark,0.04,2.0,6.0,4.04,37.43,89.08,98.53,58.04,66.16
5,Singapore,0.07,11.0,10.0,3.82,23.91,75.52,81.92,58.24,76.43
7,Netherlands,0.04,3.0,7.0,3.99,36.44,86.18,90.28,57.53,70.24
8,Ireland,0.13,26.0,8.0,10.43,19.91,86.76,82.22,52.38,67.82
9,Iceland,0.05,5.0,3.0,6.07,41.27,100.0,97.18,70.66,77.5
10,Canada,0.1,18.0,7.0,9.77,28.27,100.0,100.0,60.97,70.28


In [81]:
# functions for computing gii

choose 2 categories from MMR, ADR, suicide rate, sex ratio

{come back to this whole idea later, for now, work on building fns for calculating GII as UN did originally, using only that data}

note: scipy gmean can take as many inputs as given - not limiting/hard-coding in that sense 

In [82]:
def fhealth(mmr, abr):
    #return np.sqrt((10 / mmr) * (1 / abr))
    # either of these should work
    return stats.mstats.gmean([10 / mmr, 1 / abr])

In [83]:
mhealth = 1

In [84]:
def empowerment(pr, se):
    # return
    return stats.mstats.gmean([pr, se])

In [85]:
# testing empowerment function on Norway
empowerment(un.column(6).item(0), un.column(7).item(0))

61.710734884621132

In [86]:
flabor = ...
mlabor = ...

In [87]:
# def within_female_across_dimensions(...):
#     # gmean of gmeans
#     return stats.mstats.gmean([fhealth(..., ...), empowerment(..., ...), flabor])
def within_female_across_dimensions(hlth, emp, lbr):    # inputs will be fhealth(..., ...), empowerment(..., ...), flabor
    # gmean of gmeans
    return stats.mstats.gmean([hlth, emp, lbr])

In [88]:
# def within_male_across_dimensions(...):
#     return stats.mstats.gmean([mhealth, empowerment(..., ...), mlabor])
def within_male_across_dimensions(hlth, emp, lbr):
    return stats.mstats.gmean([hlth, emp, lbr])

In [89]:
# generalize above functions
# gender is male or female
# def within_gender_across_dimensions(gender):
#     ...

In [90]:
# def across_gender_within_dimension(...):
#     # harmonic mean
#     return stats.mstats.hmean([within_female_across_dimensions(...), within_male_across_dimensions(...)])
# could have same inputs as within_gender fns
# or have those fns as inputs


# use fns as inputs
def across_gender_within_dimension(female, male):
    # harmonic mean
    return stats.mstats.hmean([female, male])

In [91]:
def step4():
    
    return stats.mstats.gmean([np.mean([fhealth(mmr, abr), mhealth]), np.mean([empowerment(fpr, fse), empowerment(mpr, mse)]), np.mean([flabor, mlabor])])


In [92]:
gii_column = make_array()
for i in np.arange(188): # row indices 0-187
    row = un.row(i)
    mmr = row.item(4)
    abr = row.item(5)
    fpr = row.item(6)   # female representation, male = 100 - female
    mpr = 100 - fpr
    # need fse and mse
    fse = row.item(7) 
    mse = row.item(8)
    flabor = row.item(9)
    mlabor = row.item(10)
    gii = 1 - (across_gender_within_dimension(...) / step4(...))
    gii_column = np.append(gii_column, gii)

TypeError: across_gender_within_dimension() missing 1 required positional argument: 'male'

In [113]:
# THIS WOULD WORK (I THINK) IF ALL DATA WAS THERE
def gii_calculator():
    def step4():
        return stats.mstats.gmean([np.mean([fhealth(mmr, abr), mhealth]), np.mean([empowerment(fpr, fse), empowerment(mpr, mse)]), np.mean([flabor, mlabor])]) 
    gii_column = make_array()
    for i in np.arange(188): # row indices 0-187
        row = un.row(i)
        mmr = row.item(4)
        abr = row.item(5)
        fpr = row.item(6)   # female representation, male = 100 - female
        mpr = 100 - fpr
        # need fse and mse
        fse = row.item(7) 
        mse = row.item(8)
        flabor = row.item(9)
        mlabor = row.item(10)
        gii = 1 - (
        across_gender_within_dimension(
            within_female_across_dimensions(fhealth(mmr, abr), empowerment(fpr, fse), flabor), 
            within_male_across_dimensions(mhealth, empowerment(mpr, mse), mlabor)
        )
        / step4()
        )
        #print(gii)
        gii_column = np.append(gii_column, gii)
    un_with_gii = un.copy().with_column("GII (2016)", gii_column)
    return un_with_gii

In [96]:
# testing to see if it works on norway
def norway():
    def step4():
        return stats.mstats.gmean([np.mean([fhealth(mmr, abr), mhealth]), np.mean([empowerment(fpr, fse), empowerment(mpr, mse)]), np.mean([flabor, mlabor])]) 
    i = 0
    row = un.row(i)
    print(row)
    mmr = row.item(4)
    abr = row.item(5)
    fpr = row.item(6)   # female representation, male = 100 - female
    mpr = 100 - fpr
            # need fse and mse
    fse = row.item(7)
    mse = row.item(8)
    flabor = row.item(9)
    mlabor = row.item(10)
    gii = 1 - (across_gender_within_dimension(within_female_across_dimensions(fhealth(mmr, abr), empowerment(fpr, fse), flabor), within_male_across_dimensions(mhealth, empowerment(mpr, mse), mlabor)) / step4())
    return gii
norway()
# it works on norway!

Row(HDI rank=1.0, Country='Norway', Value (2015)=0.050000000000000003, Rank (2015)=6.0, Maternal mortality ratio (deaths per 100,000 live births)=5.0, Adolescent birth rate (births per 1,000 women ages 15–19)=5.9000000000000004, Share of seats in parliament (% held by women)=39.640000000000001, % Female population with at least some secondary education=96.069999999999993, % Male population with at least some secondary education=94.599999999999994, Female Labour force participation rate=61.18, Male Labour force participation rate=68.530000000000001)


0.024154182315289874

In [112]:
gii_calculator()

0.0241541823153
0.0882297989784
0.0208440932259
0.042221249609
0.0232016930674
0.068227259404
0.0300663933013
0.112948726781
0.00939301773934
0.0775362034454
0.203160300589


  if np.all(a > 0):


ValueError: Harmonic mean only defined if all elements greater than zero