This exercise uses the Kaggle dataset https://www.kaggle.com/datasets/asaniczka/gender-wage-gap-in-the-usa-1973-2022, which examines wage gaps in the USA from 1973 to 2022. The dataset was pulled from https://www.epi.org/data/#?subject=wagegap-bw, which provides comprehensive and accurate information about the American labor force. I will examine pay differences between genders over time, in tandem with racial factors. The notebook presents the results as visualizations, including bar charts, line charts, and scatter plots.

In [992]:
#Bring in necessary packages/libraries
import pandas as pd
import plotly.express as px
import numpy as np


In [993]:
#Package the dataset into a dataframe
wages = pd.read_csv('gender_wage_gap.csv')


In [994]:
wages.sort_values(by='year')


Unnamed: 0,year,men_median,men_average,women_median,women_average,white_men_median,white_men_average,black_men_median,black_men_average,hispanic_men_median,hispanic_men_average,white_women_median,white_women_average,black_women_median,black_women_average,hispanic_women_median,hispanic_women_average
49,1973,24.0,26.96,15.1,17.31,24.98,27.93,19.29,21.09,18.67,20.67,15.36,17.57,13.38,15.83,13.78,15.39
48,1974,23.7,26.48,14.88,17.01,24.55,27.34,19.02,20.84,18.6,21.14,15.22,17.23,13.46,15.68,13.26,15.42
47,1975,24.08,26.46,15.08,17.24,24.68,27.37,19.15,20.6,18.43,20.39,15.32,17.45,13.41,16.14,13.5,15.07
46,1976,23.69,26.73,15.22,17.64,24.32,27.54,19.19,21.57,18.56,21.07,15.42,17.79,13.97,16.73,13.77,15.84
45,1977,23.92,26.73,15.11,17.44,24.94,27.66,18.7,20.84,18.61,21.53,15.33,17.57,14.06,16.91,13.25,15.45
44,1978,24.27,26.86,15.17,17.5,25.08,27.81,19.74,21.38,18.34,21.01,15.4,17.68,14.0,16.76,13.74,15.29
43,1979,24.11,27.08,15.28,17.73,25.51,28.03,19.15,22.0,18.83,21.56,15.47,17.91,14.46,16.95,13.53,15.82
42,1980,23.88,26.54,15.21,17.54,25.08,27.51,18.7,21.4,18.03,21.05,15.41,17.71,14.12,16.85,13.64,15.83
41,1981,23.38,26.38,15.29,17.55,24.6,27.37,18.9,21.31,17.85,20.78,15.41,17.73,14.36,16.78,13.55,15.68
40,1982,23.39,26.6,15.12,17.85,24.6,27.65,18.24,20.97,17.76,20.71,15.44,18.06,14.44,16.8,13.94,16.02


In [995]:
# Look at changes over time year/demographic of interest
def wage_over_time(data,demo):
    """Plot one wage column against year as a line chart and display it.

    data: table containing a 'year' column and the column named by ``demo``.
    demo: name of the wage column to plot; it is also used in the y-axis title.

    Returns the result of the figure's ``show()`` call.
    """
    axis_titles = dict(xaxis_title='Year', yaxis_title=f'{demo} wage (hourly)')
    figure = px.line(data, x='year', y=demo)
    figure.update_layout(**axis_titles)
    return figure.show()


In [996]:
wage_over_time(wages,'women_average')


In [997]:
wage_over_time(wages,'hispanic_women_average')


In [998]:
wage_over_time(wages,'white_men_average')


In [64]:
#Let's look at the income difference (in dollars) between any two groups in a particular year


In [999]:
def income_diff(data,year,column1,column2):
    """Return ``column1`` minus ``column2`` for one year, rounded to 2 decimals.

    data: table with a 'year' column plus the two columns being compared.
    year: value matched (with ==) against the 'year' column.
    column1, column2: names of the columns to subtract.

    Only the first row matching ``year`` is used.  When no row matches,
    the function returns None.
    """
    rows_for_year = data[data['year'] == year]  # keep only the requested year
    gaps = (rows_for_year[column1] - rows_for_year[column2]).tolist()
    if not gaps:
        return None
    return round(gaps[0], 2)
    

In [1000]:
income_diff(wages,1973,'men_average','women_average')
#print(income_diff(wages,1990,'white_women_average','black_women_average'))
#print(income_diff(wages,1999,'white_men_average','black_men_average'))


9.65

In [1001]:
(income_diff(wages,1980,'white_men_average','black_women_average'))


10.66

In [1002]:
# Look at wage differences within a fixed time period.
# For this example the window is 2010-2020 inclusive (year > 2009 and year < 2021).
wages_2010on = wages.loc[(wages['year'] >2009) & (wages['year']<2021)]

# A function that handles any time period is written later in the notebook.


In [1003]:
wages_2010on.sort_values(by='year')


Unnamed: 0,year,men_median,men_average,women_median,women_average,white_men_median,white_men_average,black_men_median,black_men_average,hispanic_men_median,hispanic_men_average,white_women_median,white_women_average,black_women_median,black_women_average,hispanic_women_median,hispanic_women_average
12,2010,23.34,30.85,19.47,24.68,26.08,33.69,18.99,23.72,16.93,22.39,20.32,25.92,17.45,21.9,14.95,19.38
11,2011,22.76,29.99,19.34,24.34,25.24,32.73,18.35,23.26,16.44,21.42,20.34,25.64,16.98,21.55,15.11,19.18
10,2012,22.87,30.45,19.01,24.16,25.46,33.34,18.02,23.57,16.64,21.58,20.28,25.58,16.52,21.29,14.79,18.95
9,2013,22.63,30.49,18.87,24.38,25.19,33.42,18.73,23.45,16.45,21.82,20.24,25.79,16.89,21.42,15.01,19.05
8,2014,22.43,30.08,18.63,24.45,24.89,32.89,18.0,23.02,17.26,21.96,20.12,25.87,16.32,21.46,14.83,19.17
7,2015,23.19,31.52,19.07,25.11,25.73,34.57,18.4,24.0,17.78,22.97,20.77,26.66,16.64,21.65,14.93,19.74
6,2016,23.44,32.18,19.44,25.58,26.34,35.12,18.42,24.53,18.3,23.9,21.14,27.22,17.13,21.86,15.72,20.18
5,2017,23.78,32.27,19.69,25.91,26.36,35.3,18.15,24.27,18.45,24.05,21.48,27.71,17.22,21.81,15.75,20.54
4,2018,23.42,33.19,19.73,26.21,26.79,36.55,18.66,24.67,18.77,24.55,21.75,28.01,17.34,22.55,16.24,20.6
3,2019,24.04,33.65,20.42,26.83,27.39,36.84,19.31,25.18,19.39,25.08,22.01,28.41,18.08,23.17,17.08,21.5


In [635]:
import regex as re

#Use a regex to pick out the average-wage columns for men and for women.
#We know the relevant column names contain 'women' or 'men' and end in 'average'.
#Instead of listing them all out (4 each), we can write a regular expression to find the columns containing the
#gender averages


In [1004]:
#ladies first
women = '.*?wo.*age$' 


In [1005]:
#function to take in the regex pattern and return columns of interest


In [1006]:
def take_regex(dataset,pattern):
    """Return the labels of ``dataset`` that match ``pattern``.

    dataset: any iterable of labels; iterating a DataFrame yields its
        column labels, which is how the notebook uses it.
    pattern: regular expression (string) tested against each label.

    A label is kept when the pattern produces at least one non-empty match
    somewhere in it.  The complete label is returned, once, in the original
    order, so the result can be used directly to select columns even when
    the pattern is not anchored or contains capture groups.  Labels that
    are not strings are skipped instead of raising a TypeError.
    """
    compiled = re.compile(pattern)  # compile once, reuse for every label
    matched = []
    for label in dataset:
        if not isinstance(label, str):
            continue  # a text pattern cannot match a non-text label
        # Patterns such as '.*med.*' also produce zero-length matches;
        # only a non-empty match counts as a hit.
        if any(hit.group(0) for hit in compiled.finditer(label)):
            matched.append(label)
    return matched


In [1007]:
ladies_avg = take_regex(wages_2010on,women)


In [1008]:
#sanity check
ladies_avg


['women_average',
 'white_women_average',
 'black_women_average',
 'hispanic_women_average']

In [1009]:
def demo_line(data1,data_demo):
    """Plot several wage columns against year on one line chart and display it.

    data1: table containing a 'year' column and every column in ``data_demo``.
    data_demo: iterable of column names, each drawn as its own line.

    Returns the result of the figure's ``show()`` call.
    """
    columns_to_plot = list(data_demo)
    figure = px.line(data1, x='year', y=columns_to_plot)
    figure.update_layout(xaxis_title='Year', yaxis_title='Hourly Wage')
    return figure.show()


In [1010]:
# Plot the women's average-wage columns; the list produced by take_regex above is named ladies_avg.
demo_line(wages_2010on,ladies_avg)


In [908]:
men = '.*?(?<!wo)men.*age$' 


In [1011]:
fellas_avg = take_regex(wages_2010on,men)


In [1012]:
demo_line(wages_2010on,fellas_avg)


In [911]:
#lets look at all races now


In [912]:
race_patt = '^[bwh][^o].*[mw].*av.*'


In [913]:
all_race_avg = take_regex(wages_2010on,race_patt)


In [1013]:
demo_line(wages_2010on,all_race_avg)


In [1014]:
#function to handle time intervals [looking at data between a specified timeframe]
def wages_over_time(data,year1,year2):
    """Return the rows of ``data`` whose year lies in [year1, year2], sorted by year.

    data: table with a 'year' column.
    year1, year2: inclusive lower and upper bounds of the window.
    """
    years = data['year']
    in_window = (years >= year1) & (years <= year2)  # both ends inclusive
    return data.loc[in_window].sort_values(by='year')


In [1015]:
wot_7378 = wages_over_time(wages,1973,1978)


In [1016]:
wot_7378


Unnamed: 0,year,men_median,men_average,women_median,women_average,white_men_median,white_men_average,black_men_median,black_men_average,hispanic_men_median,hispanic_men_average,white_women_median,white_women_average,black_women_median,black_women_average,hispanic_women_median,hispanic_women_average
49,1973,24.0,26.96,15.1,17.31,24.98,27.93,19.29,21.09,18.67,20.67,15.36,17.57,13.38,15.83,13.78,15.39
48,1974,23.7,26.48,14.88,17.01,24.55,27.34,19.02,20.84,18.6,21.14,15.22,17.23,13.46,15.68,13.26,15.42
47,1975,24.08,26.46,15.08,17.24,24.68,27.37,19.15,20.6,18.43,20.39,15.32,17.45,13.41,16.14,13.5,15.07
46,1976,23.69,26.73,15.22,17.64,24.32,27.54,19.19,21.57,18.56,21.07,15.42,17.79,13.97,16.73,13.77,15.84
45,1977,23.92,26.73,15.11,17.44,24.94,27.66,18.7,20.84,18.61,21.53,15.33,17.57,14.06,16.91,13.25,15.45
44,1978,24.27,26.86,15.17,17.5,25.08,27.81,19.74,21.38,18.34,21.01,15.4,17.68,14.0,16.76,13.74,15.29


In [1017]:
#I want to calculate the mean of all median series between the timeframe (1973-1978)
#We can use the previous regex function to help with that
# function accepts a dataset and pattern to match on
all_meds = take_regex(wot_7378,'.*med.*')


In [1018]:
all_meds


['men_median',
 'women_median',
 'white_men_median',
 'black_men_median',
 'hispanic_men_median',
 'white_women_median',
 'black_women_median',
 'hispanic_women_median']

In [1019]:
# Keep only the median columns, then take the mean of each one (rounded to 2 decimals)
meds_7378 = wot_7378[all_meds]
median_avgs = [round(np.mean(meds_7378[col]), 2) for col in meds_7378]
    

In [1020]:
meds_7378


Unnamed: 0,men_median,women_median,white_men_median,black_men_median,hispanic_men_median,white_women_median,black_women_median,hispanic_women_median
49,24.0,15.1,24.98,19.29,18.67,15.36,13.38,13.78
48,23.7,14.88,24.55,19.02,18.6,15.22,13.46,13.26
47,24.08,15.08,24.68,19.15,18.43,15.32,13.41,13.5
46,23.69,15.22,24.32,19.19,18.56,15.42,13.97,13.77
45,23.92,15.11,24.94,18.7,18.61,15.33,14.06,13.25
44,24.27,15.17,25.08,19.74,18.34,15.4,14.0,13.74


In [1021]:
median_avgs


[23.94, 15.09, 24.76, 19.18, 18.54, 15.34, 13.71, 13.55]

In [775]:
#p = list(zip(all_meds, median_avgs))


In [1022]:
#Create a dataframe containing the median categories and the actual values
#Prepping for a bar chart**
meds = pd.DataFrame({'Median_avgs':median_avgs},index=all_meds)


In [803]:
med_bar = px.bar(meds, y='Median_avgs',color = all_meds,color_discrete_sequence=px.colors.qualitative.Bold,title= 'Median averages (hourly pay) between 1973-1978')
med_bar.update_layout(xaxis_title='Median Categories', yaxis_title = 'Median Pay (avg)')
med_bar.show()

In [804]:
#lets create a function to look at multiple instances of the previous example


In [1023]:
wage_8090 = wages_over_time(wages,1980,1990)

#the next important item is figuring out what groups we want to look at


In [1024]:
wage_8090


Unnamed: 0,year,men_median,men_average,women_median,women_average,white_men_median,white_men_average,black_men_median,black_men_average,hispanic_men_median,hispanic_men_average,white_women_median,white_women_average,black_women_median,black_women_average,hispanic_women_median,hispanic_women_average
42,1980,23.88,26.54,15.21,17.54,25.08,27.51,18.7,21.4,18.03,21.05,15.41,17.71,14.12,16.85,13.64,15.83
41,1981,23.38,26.38,15.29,17.55,24.6,27.37,18.9,21.31,17.85,20.78,15.41,17.73,14.36,16.78,13.55,15.68
40,1982,23.39,26.6,15.12,17.85,24.6,27.65,18.24,20.97,17.76,20.71,15.44,18.06,14.44,16.8,13.94,16.02
39,1983,22.96,26.63,15.4,17.97,24.41,27.67,17.98,21.26,17.26,20.69,15.67,18.19,14.23,16.95,13.76,15.96
38,1984,23.12,26.63,15.66,18.13,24.17,27.72,18.21,21.16,17.33,20.7,15.95,18.4,14.2,16.93,13.58,15.91
37,1985,23.23,26.88,15.77,18.42,24.79,28.18,17.9,20.86,17.15,20.63,16.02,18.7,14.62,17.31,13.64,16.23
36,1986,23.47,27.56,16.05,18.9,25.36,28.94,18.24,21.41,17.67,20.93,16.36,19.21,15.04,17.57,13.8,16.56
35,1987,23.55,27.65,16.25,19.13,24.72,29.11,18.32,21.57,17.31,20.61,16.81,19.49,14.98,17.77,14.12,16.66
34,1988,23.61,27.82,16.59,19.38,24.1,29.34,18.18,21.97,16.86,20.49,16.92,19.77,14.98,18.02,14.08,16.51
33,1989,22.8,26.66,16.49,19.4,24.09,28.16,17.52,20.66,16.44,20.04,17.01,19.81,15.13,17.94,13.8,16.54


In [1025]:
#looking at black men and black women (average)
blacks = take_regex(wage_8090,'^bl.*rage$')


In [1026]:
blacks


['black_men_average', 'black_women_average']

In [839]:
def demo_years(data,demo,label='Average hourly rate between 1980-1990'):
    """Average each selected column over all rows of ``data``.

    data: table containing the columns listed in ``demo``.
    demo: list of column names; it also becomes the index of the result.
    label: name of the single output column.  The default keeps the
        original hard-coded text; pass a different label when ``data``
        covers another period.

    Returns a one-column DataFrame indexed by ``demo`` holding each
    column's mean rounded to 2 decimals.
    """
    selected = data[demo]
    means = [round(np.mean(selected[col]), 2) for col in selected]
    return pd.DataFrame({label: means}, index=demo)


In [851]:
blk_8090 = demo_years(wage_8090,blacks)


In [895]:
def demoBar(dataset,demo,year1,year2):
    """Draw a bar chart of every column of ``dataset``, coloured by group, and display it.

    dataset: table whose columns are plotted on the y axis.
    demo: sequence of group names; used for bar colours and in the title.
    year1, year2: years quoted in the chart title (they do not filter the data).

    The title names every group in ``demo`` joined by ' and ', so one group
    or more than two no longer fail or get dropped.  It reads "Average"
    to agree with the y-axis label; the previous "Cumulative" wording
    contradicted it.

    Returns the result of the figure's ``show()`` call.
    """
    group_names = ' and '.join(str(name) for name in demo)
    bar = px.bar(
        dataset,
        y=list(dataset),
        color=demo,
        color_discrete_sequence=px.colors.qualitative.Bold,
        title=f'Average hourly pay for {group_names} between {year1} and {year2}',
    )
    bar.update_layout(xaxis_title='Race/Gender Categories', yaxis_title='Hourly Pay (Avg)')
    return bar.show()


In [1048]:
# blk_8090 was computed from the 1980-1990 slice (wage_8090), so the title years must say 1980 and 1990.
demoBar(blk_8090,blacks,1980,1990)


In [None]:
# Inline version of the bar chart for blk_8090.
# `demo` only exists inside demoBar, so the group list `blacks` is used directly;
# y must name columns of blk_8090 (the group labels are its index, not its columns);
# the years match the 1980-1990 slice that blk_8090 was computed from.
bar = px.bar(blk_8090, y=[i for i in blk_8090],color = blacks,color_discrete_sequence=px.colors.qualitative.Bold,
title = f'Average hourly pay for {blacks[0]} and {blacks[1]} between 1980 and 1990')
bar.update_layout(xaxis_title='Race/Gender Categories', yaxis_title = 'Hourly Pay (Avg)')
bar.show()

In [1029]:
def genderdiff_wages(data,list_of_cols):
    """Build a line chart of the given wage columns against year.

    data: table containing a 'year' column and every column in ``list_of_cols``.
    list_of_cols: column names, each drawn as its own line.

    ``show()`` is not called here; the function returns the result of the
    final ``update_layout`` call.
    """
    # NOTE(review): dtick=0 with linear tick mode is unusual -- confirm the intended tick spacing.
    tick_settings = dict(tickmode='linear', dtick=0)
    figure = px.line(data, x='year', y=list_of_cols)
    figure.update_layout(xaxis=tick_settings)
    # NOTE(review): other charts in this notebook label this axis 'Hourly Wage' -- confirm 'Salary' is intended.
    return figure.update_layout(xaxis_title='Year', yaxis_title='Salary')


In [1030]:
#Call a function within a function-- looking between a specified timeframe and a list of columns
#We can look at a few examples
genderdiff_wages(wages_over_time(wages,2012,2022),['white_men_average','white_women_average'])


In [1031]:
genderdiff_wages(wages_over_time(wages,2000,2015),['black_men_average','black_women_average'])


In [1032]:
genderdiff_wages(wages_over_time(wages,2020,2022),['white_women_average','hispanic_women_average'])


In [1033]:
#lets look at the differences between groups in a specified year


In [934]:
def wages_by_year(data,year,col_list):
    """Scatter-plot the chosen wage columns for a single year.

    data: table containing a 'year' column and every column in ``col_list``.
    year: the year to plot; the x axis spans one year either side of it.
    col_list: column names, each drawn as its own marker series.

    Returns the figure without calling ``show()``.
    """
    single_year = data.loc[data['year'] == year]
    x_window = [year - 1, year + 1]
    layout_options = dict(
        xaxis_title='Year',
        yaxis_title='Hourly Wage',
        xaxis=dict(tickmode='linear', tick0=year, dtick=0),
    )
    figure = px.scatter(single_year, x='year', y=col_list, range_x=x_window)
    figure.update_traces(marker_size=12)
    figure.update_layout(**layout_options)
    return figure


In [1034]:
wages_by_year(wages,2022,['men_average','women_average'])


In [1035]:
wages_by_year(wages,2000,[i for i in all_race_avg])


In [1036]:
wages_by_year(wages,2022,[i for i in all_race_avg])


You can see how the spacing of the gaps changed from 2000 to 2022 between [white women and black men] and between [black women and hispanic men]. Meanwhile, white men and hispanic women remained the highest and lowest (respectively) in average hourly wage in both years.

Let's calculate the wage gap.
According to https://www.epi.org/data/#?subject=wagegap-mf, 
'The gender wage gap is the percent by which hourly wages of female workers are less than hourly wages of 
male workers. It is also often expressed as a wage ratio (women’s share of men’s wages) by subtracting the 
gap from 100 percent.'

In [936]:
#Lets try this out first and then create a function


In [1037]:
gen_gap = wages[['year','men_average','women_average']].sort_values(by='year')


In [1038]:
gen_gap


Unnamed: 0,year,men_average,women_average
49,1973,26.96,17.31
48,1974,26.48,17.01
47,1975,26.46,17.24
46,1976,26.73,17.64
45,1977,26.73,17.44
44,1978,26.86,17.5
43,1979,27.08,17.73
42,1980,26.54,17.54
41,1981,26.38,17.55
40,1982,26.6,17.85


In [1039]:
# Wage gap (%) per the EPI definition: 100 minus women's average wage as a percentage of men's,
# rounded to one decimal place.
gen_gap['wage_gap (%)'] = round(100 - (gen_gap['women_average']/gen_gap['men_average'])*100,1)


In [1040]:
gen_gap
#reference: https://www.epi.org/data/#/?subject=wagegap-mf

Unnamed: 0,year,men_average,women_average,wage_gap (%)
49,1973,26.96,17.31,35.8
48,1974,26.48,17.01,35.8
47,1975,26.46,17.24,34.8
46,1976,26.73,17.64,34.0
45,1977,26.73,17.44,34.8
44,1978,26.86,17.5,34.8
43,1979,27.08,17.73,34.5
42,1980,26.54,17.54,33.9
41,1981,26.38,17.55,33.5
40,1982,26.6,17.85,32.9


In [1041]:
px.scatter(gen_gap, x='year', y='wage_gap (%)',title='Wage gap between men and women from 1973-2022')
#large drop from 1988 to 1989 (a dip of about 3 percentage points)


In [1043]:
race_gap = wages[['year','white_men_average','black_men_average']].sort_values(by='year')


In [1044]:
race_gap


Unnamed: 0,year,white_men_average,black_men_average
49,1973,27.93,21.09
48,1974,27.34,20.84
47,1975,27.37,20.6
46,1976,27.54,21.57
45,1977,27.66,20.84
44,1978,27.81,21.38
43,1979,28.03,22.0
42,1980,27.51,21.4
41,1981,27.37,21.31
40,1982,27.65,20.97


In [1045]:
# Wage gap (%) of black men relative to white men: 100 minus black men's average wage
# as a percentage of white men's, rounded to one decimal place.
race_gap['wage_gap (%)'] = round(100 - (race_gap['black_men_average']/race_gap['white_men_average'])*100,1)


In [1046]:
race_gap


Unnamed: 0,year,white_men_average,black_men_average,wage_gap (%)
49,1973,27.93,21.09,24.5
48,1974,27.34,20.84,23.8
47,1975,27.37,20.6,24.7
46,1976,27.54,21.57,21.7
45,1977,27.66,20.84,24.7
44,1978,27.81,21.38,23.1
43,1979,28.03,22.0,21.5
42,1980,27.51,21.4,22.2
41,1981,27.37,21.31,22.1
40,1982,27.65,20.97,24.2


In [1047]:
px.scatter(race_gap, x='year', y='wage_gap (%)',title='Wage gap between white men and black men from 1973-2022')