In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 

In [2]:
data1 = pd.read_csv('total_df.csv')
data1.head()

Unnamed: 0.1,Unnamed: 0,country,year,ratio
0,109,Afghanistan,2020,0.806813
1,0,Afghanistan,2014,0.603998
2,1,Albania,2012,0.867946
3,110,Albania,2020,0.934241
4,111,Argentina,2021,0.746342


In [3]:
data1.shape

(246, 4)

In [4]:
data1 = data1.drop(['Unnamed: 0'], axis=1)
data1.head()

Unnamed: 0,country,year,ratio
0,Afghanistan,2020,0.806813
1,Afghanistan,2014,0.603998
2,Albania,2012,0.867946
3,Albania,2020,0.934241
4,Argentina,2021,0.746342


In [5]:
usa = data1[data1['country']=='United States']
usa

Unnamed: 0,country,year,ratio
236,United States,2021,0.789898
237,United States,2011,0.750033


In [6]:
country_year_counts = data1.groupby('country')['year'].count().reset_index()
country_year_counts.head()

Unnamed: 0,country,year
0,Afghanistan,2
1,Albania,2
2,Argentina,2
3,Armenia,2
4,Australia,2


In [7]:
country_year_counts['year'].describe()

count    137.000000
mean       1.795620
std        0.404727
min        1.000000
25%        2.000000
50%        2.000000
75%        2.000000
max        2.000000
Name: year, dtype: float64

In [8]:
# ok there are countries that we just have 1 year for

data1['country'].nunique()

137

In [9]:
singles = country_year_counts[country_year_counts['year']==1]
singles.shape

(28, 2)

In [10]:
singles['country'].values

array(['Barbados', 'Brunei Darussalam', 'Burkina Faso', 'Burundi',
       'Cameroon', 'Cape Verde', 'Chad', 'Comoros',
       'Congo, Democratic Republic of the', 'Djibouti', 'Eswatini',
       'Ethiopia', 'Georgia', 'Guinea', 'Guinea-Bissau', 'Kenya',
       "Lao People's Democratic Republic", 'Lebanon', 'Lesotho', 'Malawi',
       'Marshall Islands', 'Nepal', 'Réunion', 'Sierra Leone', 'Suriname',
       'Tonga', 'Uzbekistan', 'Yemen'], dtype=object)

In [11]:
# sort data1 by country and year 

data1_sorted = data1.sort_values(['country','year'])
data1_sorted.head()

Unnamed: 0,country,year,ratio
1,Afghanistan,2014,0.603998
0,Afghanistan,2020,0.806813
2,Albania,2012,0.867946
3,Albania,2020,0.934241
5,Argentina,2011,0.746261


In [13]:
# merge data1 to itself on country, use _start and _end as suffixes 

data2 = data1_sorted.merge(data1_sorted, on='country', suffixes=('_start','_end'))
data2.head()

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end
0,Afghanistan,2014,0.603998,2014,0.603998
1,Afghanistan,2014,0.603998,2020,0.806813
2,Afghanistan,2020,0.806813,2014,0.603998
3,Afghanistan,2020,0.806813,2020,0.806813
4,Albania,2012,0.867946,2012,0.867946


In [14]:
data2.shape

(464, 5)

In [16]:
# select only rows where year_start is before year_end 

data2 = data2[data2['year_start'] < data2['year_end']]
data2.head()

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end
1,Afghanistan,2014,0.603998,2020,0.806813
5,Albania,2012,0.867946,2020,0.934241
9,Argentina,2011,0.746261,2021,0.746342
13,Armenia,2013,0.634726,2020,0.701357
17,Australia,2011,0.66247,2020,0.702274


In [17]:
data2.shape

(109, 5)

In [18]:
# look at the changes in total ratio, over how long, and what is the relative rate of change 

data2['delta_year'] = data2['year_end'] - data2['year_start']
data2.head()

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year
1,Afghanistan,2014,0.603998,2020,0.806813,6
5,Albania,2012,0.867946,2020,0.934241,8
9,Argentina,2011,0.746261,2021,0.746342,10
13,Armenia,2013,0.634726,2020,0.701357,7
17,Australia,2011,0.66247,2020,0.702274,9


In [19]:
data2['delta_ratio'] = data2['ratio_end'] - data2['ratio_start']
data2.head()

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio
1,Afghanistan,2014,0.603998,2020,0.806813,6,0.202815
5,Albania,2012,0.867946,2020,0.934241,8,0.066295
9,Argentina,2011,0.746261,2021,0.746342,10,8.1e-05
13,Armenia,2013,0.634726,2020,0.701357,7,0.066631
17,Australia,2011,0.66247,2020,0.702274,9,0.039804


In [20]:
# are there any NaNs? if they haven't been dropped already, this is the time to do so 

data2.isna().sum()

country        0
year_start     0
ratio_start    0
year_end       0
ratio_end      0
delta_year     0
delta_ratio    0
dtype: int64

In [21]:
# this is to compare each country to its own starting point 
# a way to normalize since the different countries have different start and end years 

data2['delta_ratio_rel'] = 100 * data2['delta_ratio'] / data2['ratio_start']
data2.head()

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio,delta_ratio_rel
1,Afghanistan,2014,0.603998,2020,0.806813,6,0.202815,33.578697
5,Albania,2012,0.867946,2020,0.934241,8,0.066295,7.638155
9,Argentina,2011,0.746261,2021,0.746342,10,8.1e-05,0.010858
13,Armenia,2013,0.634726,2020,0.701357,7,0.066631,10.497684
17,Australia,2011,0.66247,2020,0.702274,9,0.039804,6.00844


In [22]:
# this is a proxy measure of how fast/slow each country is to approaching gender parity  
# the unit is "% gap closed per year"

data2['change_rate'] = data2['delta_ratio_rel'] / data2['delta_year']
data2.head()

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio,delta_ratio_rel,change_rate
1,Afghanistan,2014,0.603998,2020,0.806813,6,0.202815,33.578697,5.596449
5,Albania,2012,0.867946,2020,0.934241,8,0.066295,7.638155,0.954769
9,Argentina,2011,0.746261,2021,0.746342,10,8.1e-05,0.010858,0.001086
13,Armenia,2013,0.634726,2020,0.701357,7,0.066631,10.497684,1.499669
17,Australia,2011,0.66247,2020,0.702274,9,0.039804,6.00844,0.667604


In [23]:
# gut checks: where is the US? where is Poland? 

data2[data2['country']=='United States']

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio,delta_ratio_rel,change_rate
447,United States,2011,0.750033,2021,0.789898,10,0.039865,5.315105,0.53151


In [24]:
data2[data2['country']=='Poland']

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio,delta_ratio_rel,change_rate
335,Poland,2011,0.869717,2020,0.830105,9,-0.039612,-4.554606,-0.506067


In [29]:
# instead of plotting change rate 
# maybe the better way to show progress (and how slow it is), is to estimate how many more years it will take 
# to reach gender pay parity 
# caveat: this is an oversimplified calculation 
# assuming change rate is constant, i.e. countries don't speed up, slow down, or revert directions 

data2['multiplier'] = data2['change_rate'].apply(lambda x: (x/100) + 1)
data2.head()

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio,delta_ratio_rel,change_rate,multiplier
1,Afghanistan,2014,0.603998,2020,0.806813,6,0.202815,33.578697,5.596449,1.055964
5,Albania,2012,0.867946,2020,0.934241,8,0.066295,7.638155,0.954769,1.009548
9,Argentina,2011,0.746261,2021,0.746342,10,8.1e-05,0.010858,0.001086,1.000011
13,Armenia,2013,0.634726,2020,0.701357,7,0.066631,10.497684,1.499669,1.014997
17,Australia,2011,0.66247,2020,0.702274,9,0.039804,6.00844,0.667604,1.006676


In [30]:
data2['multiplier'].describe()

count    109.000000
mean       1.015981
std        0.082688
min        0.860877
25%        0.998439
50%        1.006048
75%        1.015503
max        1.805550
Name: multiplier, dtype: float64

In [31]:
# the ones with values < 1 means the change rate was NEGATIVE 
# i.e., their most recent female:male income ratio is worse compared to their previous timepoint 

data2b = data2[data2['multiplier']>1]
data2b.head()

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio,delta_ratio_rel,change_rate,multiplier
1,Afghanistan,2014,0.603998,2020,0.806813,6,0.202815,33.578697,5.596449,1.055964
5,Albania,2012,0.867946,2020,0.934241,8,0.066295,7.638155,0.954769,1.009548
9,Argentina,2011,0.746261,2021,0.746342,10,8.1e-05,0.010858,0.001086,1.000011
13,Armenia,2013,0.634726,2020,0.701357,7,0.066631,10.497684,1.499669,1.014997
17,Australia,2011,0.66247,2020,0.702274,9,0.039804,6.00844,0.667604,1.006676


In [33]:
data2b.shape

(76, 10)

In [34]:
# wow this number dropped a lot; will revisit the other countries in a bit 
# continue calculations re: when the country will reach gender parity 
# let's treat the change_rate like a fixed annual interest rate 
# if we start an account with $100, the interest rate is 10%, and we don't add more money to the account 
# then after 1 year, we have 1.1 * $100
# after 2 years, we have 1.1 * 1.1 * $100 
# so the formula is 
# (starting number) * (multiplier ** years)
# in this case, we use the ratio_end, multipler, and target number of 1 (female income = male income) to solve for years
# years = log(1/starting number) / log(multiplier)

data2b['years_to_parity'] = ((np.log10(1/data2b['ratio_end']))/np.log10(data2b['multiplier']))
data2b.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2b['years_to_parity'] = ((np.log10(1/data2b['ratio_end']))/np.log10(data2b['multiplier']))


Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio,delta_ratio_rel,change_rate,multiplier,years_to_parity
1,Afghanistan,2014,0.603998,2020,0.806813,6,0.202815,33.578697,5.596449,1.055964,3.942064
5,Albania,2012,0.867946,2020,0.934241,8,0.066295,7.638155,0.954769,1.009548,7.158306
9,Argentina,2011,0.746261,2021,0.746342,10,8.1e-05,0.010858,0.001086,1.000011,26945.764376
13,Armenia,2013,0.634726,2020,0.701357,7,0.066631,10.497684,1.499669,1.014997,23.831349
17,Australia,2011,0.66247,2020,0.702274,9,0.039804,6.00844,0.667604,1.006676,53.116803


In [37]:
data2b = data2b.drop('years_to_parity', axis=1)

In [38]:
data2b.head()

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio,delta_ratio_rel,change_rate,multiplier
1,Afghanistan,2014,0.603998,2020,0.806813,6,0.202815,33.578697,5.596449,1.055964
5,Albania,2012,0.867946,2020,0.934241,8,0.066295,7.638155,0.954769,1.009548
9,Argentina,2011,0.746261,2021,0.746342,10,8.1e-05,0.010858,0.001086,1.000011
13,Armenia,2013,0.634726,2020,0.701357,7,0.066631,10.497684,1.499669,1.014997
17,Australia,2011,0.66247,2020,0.702274,9,0.039804,6.00844,0.667604,1.006676


In [39]:
data2b['multiplier'].describe()

count    76.000000
mean      1.030128
std       0.093621
min       1.000011
25%       1.005591
50%       1.011613
75%       1.024771
max       1.805550
Name: multiplier, dtype: float64

In [42]:
data2b[data2b['multiplier']==data2b['multiplier'].max()]

Unnamed: 0,country,year_start,ratio_start,year_end,ratio_end,delta_year,delta_ratio,delta_ratio_rel,change_rate,multiplier
418,Timor-Leste,2013,0.920257,2016,3.144198,3,2.22394,241.664998,80.554999,1.80555


In [43]:
# I guess we wouldn't use those with ratios > 1 at the end either 

data2b = data2b[data2b['ratio_end']<1]
data2b.shape

(66, 10)