In [67]:
import pandas as pd
import altair as alt

In [68]:
# Load the survey responses from the CSV file; the path is relative to the
# notebook's working directory.
data = pd.read_csv("DataVizCensus2020-AnonymizedResponses.csv")

In [69]:
# Lift the cap on displayed columns so wide frames are shown without truncation.
pd.set_option("display.max_columns", None)

In [70]:
# Give the two long survey-question headers the short names used by the rest
# of the notebook.
data = data.rename(columns={
    'How many years of professional experience do you have?': 'Experience',
    'What is your yearly pay (converted to US dollars)?': 'Yearly Pay (USD)',
})

In [71]:
# Collapse the survey's fine-grained pay brackets into five wider buckets.
# Each key is the label written back to the column; its value lists the
# original survey answers folded into that bucket.
pay_buckets = {
    'Less than $40,000': ["Less than $20,000",
                          "$20,000 - $39,000"],
    '$40,000-$79,000': ["$40,000 - $59,000",
                        "$60,000 - $79,000"],
    '$80,000-$119,000': ["$80,000 - $99,000",
                         "$100,000 - $119,000"],
    '$120,000-$179,000': ["$120,000 - $139,000",
                          "$140,000 - $159,000",
                          "$160,000 - $179,000"],
    'More than $180,000': ["$180,000 - $199,000",
                           "$200,000 - $219,000",
                           "$220,000 - $239,000",
                           "More than $240,000"],
}
for bucket_label, survey_answers in pay_buckets.items():
    in_bucket = data['Yearly Pay (USD)'].isin(survey_answers)
    data.loc[in_bucket, 'Yearly Pay (USD)'] = bucket_label

In [72]:
# Fold the two most senior experience bands into a single '21-30' band.
senior_bands = data['Experience'].isin(['21-25', '26-30'])
data.loc[senior_bands, 'Experience'] = '21-30'

In [73]:
# Numeric rank for each pay bucket (0 = lowest), stored in the 'order' column.
# Rows whose pay matches none of the buckets receive no value here.
pay_buckets_ascending = [
    'Less than $40,000',
    '$40,000-$79,000',
    '$80,000-$119,000',
    '$120,000-$179,000',
    'More than $180,000',
]
for pay_rank, pay_label in enumerate(pay_buckets_ascending):
    data.loc[data['Yearly Pay (USD)'] == pay_label, 'order'] = pay_rank

In [74]:
# Use the shorter header 'Gender' for the 'gender_collapsed' column.
data = data.rename(columns={'gender_collapsed': 'Gender'})

In [151]:
%%html
<!-- Pull the 'Jost' font family from Google Fonts into the notebook page. -->
<style>
@import url('https://fonts.googleapis.com/css2?family=Jost&display=swap');
</style>

In [152]:
def contest_theme():
    """Return the chart theme configuration as a nested dict.

    The result has a single top-level 'config' key holding:
      * 'view' - default view size (height 300, width 250),
      * 'mark' - default mark 'color' and 'fill', both 'black',
      * 'font' - the font family name 'Jost'.

    A fresh dict is built on every call.
    """
    view_size = dict(height=300, width=250)
    mark_defaults = dict(color='black', fill='black')
    settings = dict(view=view_size, mark=mark_defaults, font='Jost')
    return {'config': settings}
# Register the theme function with Altair under the name 'contest_theme'.
alt.themes.register('contest_theme', contest_theme)
# Switch Altair to the newly registered theme. As the last expression in the
# cell, the value returned by enable() is echoed as the cell's output.
alt.themes.enable('contest_theme')

ThemeRegistry.enable('contest_theme')

In [76]:
# Pay buckets paired with their bar colors, lowest bucket first; the greens
# run from lightest to darkest. `domain` and `colors` are the two parallel
# lists derived from these pairs.
pay_palette = [
    ('Less than $40,000', '#afff6a'),
    ('$40,000-$79,000', '#8dd951'),
    ('$80,000-$119,000', '#6cb337'),
    ('$120,000-$179,000', '#4c8f1e'),
    ('More than $180,000', '#2d6d00'),
]
domain = [bucket for bucket, _ in pay_palette]
colors = [shade for _, shade in pay_palette]

In [161]:
# Pool 'Woman' and 'NB/Trans' respondents under one combined label.
pooled_genders = data['Gender'].isin(['Woman', 'NB/Trans'])
data.loc[pooled_genders, 'Gender'] = 'Woman, NB/Trans'

In [162]:
def make_chart(data, f=None):
    """Build a 100%-stacked bar chart of pay bucket by years of experience.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Experience', 'order' and 'Yearly Pay (USD)',
        plus the column named by ``f`` when it is given.
    f : str, optional
        Name of a column to facet the chart by. When omitted (or falsy) a
        single unfaceted chart is returned.

    Returns
    -------
    alt.Chart
        Bars of size 35, one per 'Experience' value, stacked by 'order' and
        colored by 'Yearly Pay (USD)'. The color scale reads the
        module-level ``domain`` and ``colors`` lists.
    """
    groups = ['Experience', 'order', 'Yearly Pay (USD)']
    denom = ['Experience']
    tooltip = ['Experience',
               'Yearly Pay (USD)',
               'sum(respondents)',
               alt.Tooltip('Percentage of respondents', format='.1%')]
    if f:
        groups.append(f)
        denom.append(f)
        tooltip.insert(0, f)

    # One row per plotted bar segment. groupby drops rows with a missing key,
    # so respondents without a pay bucket never reach the summary and no
    # separate not-null filter is needed.
    summary = data.groupby(groups).size().reset_index(name='respondents')

    # Share of each segment within its bar. The denominator is taken from the
    # summary itself (not from a count over all rows of `data`) so that the
    # tooltip value matches the height drawn by stack="normalize", which
    # normalizes over exactly these rows.
    bar_totals = summary.groupby(denom)['respondents'].transform('sum')
    summary['Percentage of respondents'] = summary['respondents'] / bar_totals

    encodings = {
        'x': alt.X('Experience',
                   sort=['5 or less', '5-10', '11-15', '16-20', '21-30']),
        'y': alt.Y('sum(respondents)',
                   stack="normalize",
                   axis=alt.Axis(format='%'),
                   title='Percentage of respondents'),
        'color': alt.Color('Yearly Pay (USD)',
                           scale=alt.Scale(domain=domain, range=colors),
                           sort=domain),
        'order': alt.Order('order'),
        'tooltip': tooltip,
    }
    if f:
        encodings['facet'] = f

    return alt.Chart(summary).mark_bar(size=35).encode(**encodings)

In [165]:
# Overall chart: no facet column is passed, so the result is a single chart.
chart = make_chart(data)

In [166]:
# Write the overall chart to 'overall.html'.
chart.save('overall.html')

In [167]:
# Same chart, faceted by the 'Gender' column.
gender = make_chart(data, f='Gender')

In [168]:
# Write the gender-faceted chart to 'gender.html'.
gender.save('gender.html')

In [169]:
# Same chart, faceted by the answer to the survey question on membership of a
# historically disadvantaged or underrepresented racial or ethnic group; the
# full question text is the column name.
underrep = make_chart(data, f='Do you consider yourself a member of a historically disadvantaged or underrepresented racial or ethnic group?')

In [170]:
# Write the underrepresented-group chart to 'underrep.html'.
underrep.save('underrep.html')

In [158]:
# Keep only respondents whose highest completed education is one of the three
# degree levels listed below. The membership test is a vectorized `isin` on
# the column rather than a Python lambda applied row by row, and it yields a
# boolean Series even when `data` has no rows.
education_dropped = data.loc[
    data['What is the highest level of education you have completed?'].isin([
        'Bachelors Degree',
        'Masters Degree (or similar graduate degree)',
        'Ph.D.',
    ])
]

In [171]:
# Same chart built from `education_dropped`, faceted by highest completed
# education level.
education = make_chart(education_dropped, f='What is the highest level of education you have completed?')

In [172]:
# Write the education-faceted chart to 'education.html'.
education.save('education.html')

# Making Gain$ in Data Viz

Respondents with more years of professional experience tend to have higher salaries, but how does that trend vary across other factors?

In [173]:
# Display the overall chart as this cell's output.
chart

### By Gender

We see a similar pattern among male respondents. However, women and nonbinary/trans respondents do not see the same salary gains with experience as their male counterparts.

In [174]:
# Display the gender-faceted chart as this cell's output.
gender

### Being from a historically disadvantaged or underrepresented racial or ethnic group

Salary gains are similar across respondents who are and are not members of a historically disadvantaged or underrepresented racial or ethnic group. However, salary gains are somewhat less consistent for those who answered 'Yes'. This may be due, in part, to the survey having notably fewer respondents who answered 'Yes'.

In [175]:
# Display the underrepresented-group chart as this cell's output.
underrep

### By level of education

Respondents with more advanced degrees do not necessarily earn more money with experience. (Note: Many education categories were dropped due to small sample sizes.)

In [176]:
# Display the education-faceted chart as this cell's output.
education