In [186]:
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd
import plotly.express as px
from great_tables import (
    GT, md, google_font, style, loc # use fonts.google.com to select fonts
)




# Ask kagglehub for the newest published version of the dataset and report
# where the files ended up on disk.
dataset_handle = "samithsachidanandan/the-global-ai-ml-data-science-salary-for-2025"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /Users/eileenlabrutto/.cache/kagglehub/datasets/samithsachidanandan/the-global-ai-ml-data-science-salary-for-2025/versions/1


In [2]:
# Build the CSV location from the current user's home directory rather than
# hard-coding one machine's absolute path (which also embeds a username).
jobs_csv_path = Path.home() / 'Desktop' / 'S25DATA502' / 'jobs_in_data.csv'

# Raw job postings; every later cell derives its tables from this frame.
df = pd.read_csv(jobs_csv_path)
df


Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
0,2023,Data DevOps Engineer,Data Engineering,EUR,88000,95012,Germany,Mid-level,Full-time,Hybrid,Germany,L
1,2023,Data Architect,Data Architecture and Modeling,USD,186000,186000,United States,Senior,Full-time,In-person,United States,M
2,2023,Data Architect,Data Architecture and Modeling,USD,81800,81800,United States,Senior,Full-time,In-person,United States,M
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M
...,...,...,...,...,...,...,...,...,...,...,...,...
9350,2021,Data Specialist,Data Management and Strategy,USD,165000,165000,United States,Senior,Full-time,Remote,United States,L
9351,2020,Data Scientist,Data Science and Research,USD,412000,412000,United States,Senior,Full-time,Remote,United States,L
9352,2021,Principal Data Scientist,Data Science and Research,USD,151000,151000,United States,Mid-level,Full-time,Remote,United States,L
9353,2020,Data Scientist,Data Science and Research,USD,105000,105000,United States,Entry-level,Full-time,Remote,United States,S


In [5]:
# Mean USD salary for each work year, with display-ready column labels.
df_av_sal_year = (
    df
    .groupby('work_year')['salary_in_usd']
    .mean()
    .reset_index()
    .rename(columns={'work_year': 'Work Year', 'salary_in_usd': 'Salary (US Dollars)'})
)
df_av_sal_year

Unnamed: 0,Work Year,Salary (US Dollars)
0,2020,105878.859155
1,2021,106483.64467
2,2022,135467.501836
3,2023,155132.591708


In [None]:
# Bar chart of the mean USD salary per work year.
fig1 = px.bar(
    df_av_sal_year,
    x='Work Year',
    y='Salary (US Dollars)',
    title='<b>Data science jobs boast increasing salaries</b>',
    subtitle="From 2020 to 2023, average salaries in data science have increased from 106k to 155k.",
    height=600,
    width=750,
    color_discrete_sequence=['#ffde57', '#4584b6'],
    template='simple_white',
    # NOTE(review): the y-axis starts at 100k instead of 0, which visually
    # exaggerates the differences between bars — confirm this is intended.
    range_y=[100000, 160000],
)
fig1.update_layout(
    # Google Fonts lists this family as 'Bebas Neue'; the previous 'Bebe Neue'
    # spelling matches no family there, so the body font could never resolve.
    font_family='Bebas Neue',
    title_font_family='Young Serif',
    title_font_size=28,
    font_color='#646464',
    font_size=16,
)
# Label exactly the four years present in the data; as the cell's last
# expression this also renders the figure.
fig1.update_xaxes(tickvals=[2020, 2021, 2022, 2023])

In [9]:
# Distinct job categories, in order of first appearance.
pd.unique(df['job_category'])

array(['Data Engineering', 'Data Architecture and Modeling',
       'Data Science and Research', 'Machine Learning and AI',
       'Data Analysis', 'Leadership and Management',
       'BI and Visualization', 'Data Quality and Operations',
       'Data Management and Strategy', 'Cloud and Database'], dtype=object)

In [53]:
# Tally postings for every (work year, job category) pair and give the
# resulting columns reader-friendly labels.
df_job = (
    df.loc[:, ['work_year', 'job_category']]
    .groupby('work_year')
    .value_counts()
    .reset_index()
    .rename(columns={
        'work_year': 'Work Year',
        'job_category': 'Type of Job',
        'count': 'Number of Jobs',
    })
)

df_job

Unnamed: 0,Work Year,Type of Job,Number of Jobs
0,2020,Data Science and Research,29
1,2020,Data Engineering,17
2,2020,Data Analysis,15
3,2020,Machine Learning and AI,10
4,2021,Data Science and Research,72
5,2021,Data Engineering,45
6,2021,Machine Learning and AI,37
7,2021,Data Analysis,28
8,2021,Leadership and Management,9
9,2021,Data Architecture and Modeling,5


In [57]:
# Line chart of yearly job counts, one coloured line per job category.
fig2 = px.line(df_job, x='Work Year', y='Number of Jobs', color='Type of Job', height=800)

# Put x-axis ticks only on the four years; being the last expression,
# this call also renders the figure.
fig2.update_xaxes(tickvals=[2020, 2021, 2022, 2023])

In [62]:
# Flag the 'Data Science and Research' rows (True) versus every other
# category (False).
# One vectorized comparison yields a proper boolean column; the previous pair
# of masked .loc assignments first created a partly-missing object column and
# then back-filled it. .assign builds a new frame instead of mutating the one
# an earlier cell displayed.
df_job = df_job.assign(
    is_flag=df_job['Type of Job'].eq('Data Science and Research')
)

# Sanity check: how many rows fall on each side of the flag.
print(df_job['is_flag'].value_counts())

display(df_job.head())


is_flag
False    27
True      4
Name: count, dtype: int64


Unnamed: 0,Work Year,Type of Job,Number of Jobs,is_flag
0,2020,Data Science and Research,29,True
1,2020,Data Engineering,17,False
2,2020,Data Analysis,15,False
3,2020,Machine Learning and AI,10,False
4,2021,Data Science and Research,72,True


In [None]:
# Line chart of yearly job counts: one line per job category (line_group),
# coloured only by whether the category is the flagged one (is_flag).
fig3 = px.line(
    df_job,
    x='Work Year',
    y='Number of Jobs',
    color='is_flag',
    width=1100,
    height=600,
    line_group='Type of Job',
    template='plotly_white',
    color_discrete_sequence=['#ffde57', '#4584b6'],
    title='<b>Data science and research jobs pull up and away</b>',
    subtitle='Though the amount of data science and research jobs were on par with other data science jobs in 2020, by 2021<br> its numbers were increasing, and by 2023 there was a difference of 500 jobs between it and the next closest job category.',
)
fig3.update_layout(
    showlegend=False,
    # Google Fonts lists this family as 'Bebas Neue'; the previous 'Bebe Neue'
    # spelling matches no family there, so the body font could never resolve.
    font_family='Bebas Neue',
    title_font_family='Young Serif',
    title_font_size=28,
    # Extra top margin leaves room for the two-line subtitle.
    margin={'t': 200},
    yaxis_title_font_size=18,
    xaxis_title_font_size=18,
    yaxis_tickfont_size=16,
    xaxis_tickfont_size=16,
    font_color='#646464',
)
fig3.update_traces(line_width=2.5)
# Label exactly the four years present in the data.
fig3.update_xaxes(tickvals=[2020, 2021, 2022, 2023])
fig3.show()

In [93]:
# Keep only the postings whose category is 'Data Science and Research'.
df_dsr = df.loc[df['job_category'].eq('Data Science and Research')]
display(df_dsr)

Unnamed: 0,work_year,job_title,job_category,salary_currency,salary,salary_in_usd,employee_residence,experience_level,employment_type,work_setting,company_location,company_size
3,2023,Data Scientist,Data Science and Research,USD,212000,212000,United States,Senior,Full-time,In-person,United States,M
4,2023,Data Scientist,Data Science and Research,USD,93300,93300,United States,Senior,Full-time,In-person,United States,M
5,2023,Data Scientist,Data Science and Research,USD,130000,130000,United States,Senior,Full-time,Remote,United States,M
6,2023,Data Scientist,Data Science and Research,USD,100000,100000,United States,Senior,Full-time,Remote,United States,M
13,2023,Data Scientist,Data Science and Research,GBP,35000,43064,United Kingdom,Mid-level,Full-time,In-person,United Kingdom,M
...,...,...,...,...,...,...,...,...,...,...,...,...
9346,2021,Director of Data Science,Data Science and Research,USD,168000,168000,Japan,Senior,Full-time,In-person,Japan,S
9347,2021,Data Scientist,Data Science and Research,SGD,160000,119059,Singapore,Mid-level,Full-time,Remote,Israel,M
9351,2020,Data Scientist,Data Science and Research,USD,412000,412000,United States,Senior,Full-time,Remote,United States,L
9352,2021,Principal Data Scientist,Data Science and Research,USD,151000,151000,United States,Mid-level,Full-time,Remote,United States,L


In [130]:
# For 'Data Science and Research' postings only: count jobs for every
# (work year, experience level) pair and label the columns for display.
df_dsr = (
    df
    .query("job_category == 'Data Science and Research'")
    [['work_year', 'experience_level']]
    .groupby('work_year')
    .value_counts()
    .reset_index()
    .rename(columns={
        'work_year': 'Work Year',
        'experience_level': 'Experience Level',
        'count': 'Number of Jobs',
    })
)

display(df_dsr)

Unnamed: 0,Work Year,Experience Level,Number of Jobs
0,2020,Mid-level,12
1,2020,Senior,8
2,2020,Entry-level,7
3,2020,Executive,2
4,2021,Mid-level,30
5,2021,Senior,20
6,2021,Entry-level,15
7,2021,Executive,7
8,2022,Senior,366
9,2022,Mid-level,89


In [149]:
# Wide table of Data Science and Research job counts: one row per work year,
# one column per experience level.
# Derived from df_dsr instead of a hand-typed copy of its numbers, so the
# table cannot drift from the data if the source CSV changes.
# Keys are the labels found in the data; values are the column headers
# (and order) the wide table exposes.
experience_columns = {
    'Entry-level': 'Entry Level',
    'Mid-level': 'Mid-level',
    'Senior': 'Senior',
    'Executive': 'Executive',
}
df_hist = (
    df_dsr
    .pivot(index='Work Year', columns='Experience Level', values='Number of Jobs')
    .reindex(columns=list(experience_columns))  # fixed column order
    .fillna(0)                                  # a level absent in a year counts as 0 jobs
    .astype(int)
    .rename(columns=experience_columns)
    .rename_axis(columns=None)                  # plain, unnamed column index
    .reset_index()
    .astype({'Work Year': str})                 # years as strings, i.e. categorical labels
)

In [169]:
# Grouped bar chart: for each work year, one bar per experience level.
fig4 = px.bar(
    df_hist,
    x='Work Year',
    # Every column after 'Work Year' is plotted as its own bar series.
    y=df_hist.columns[1:],
    barmode='group',
    color_discrete_sequence=['#646464', '#4584b6', '#ffde57', '#306998'],
    template='plotly_white',
    title='<b>Senior-level data science and research jobs in abundance</b>',
    subtitle='Where entry, mid-level, and executive level jobs grew a modest amount between 2020 and 2023, <br>the number of senior-level positions multiplied expeditiously.',
    width=1100,
    height=600,
)
fig4.update_layout(
    # Google Fonts lists this family as 'Bebas Neue'; the previous 'Bebe Neue'
    # spelling matches no family there, so the body font could never resolve.
    font_family='Bebas Neue',
    title_font_family='Young Serif',
    title_font_size=28,
    font_color='#646464',
    font_size=16,
    yaxis_title='Number of Jobs',
    legend_title_text='Job Experience Level',
)
fig4.show()

In [185]:
# Average USD salary per work year for entry-level 'Data Science and Research'
# postings, rounded to cents.
df_dsr_sal = (
    df.loc[
        (df['job_category'] == 'Data Science and Research')
        & (df['experience_level'] == 'Entry-level'),
        ['work_year', 'salary_in_usd'],
    ]
    .groupby(by='work_year')
    .agg({'salary_in_usd': 'mean'})
    .round(2)
    .reset_index()
    # The aggregation yields no 'count' column, so the leftover
    # 'count' -> 'Number of Jobs' mapping was dead and has been dropped.
    .rename(
        columns={
            'work_year': 'Work Year',
            'salary_in_usd': 'Salary (US Dollars)',
        }
    )
)
display(df_dsr_sal)

Unnamed: 0,Work Year,Salary (US Dollars)
0,2020,53128.57
1,2021,75623.4
2,2022,78811.44
3,2023,114576.07


In [202]:
# Styled table of the average entry-level salary per year; the salary column
# is shaded from white to blue across the 50k-115k range.
(
    GT(df_dsr_sal)
    .tab_header(
        title=md(
            "__Entry-level salaries follow<br> growth pattern__"
        ),
        subtitle=md(
            "Though the increase in average salary for entry-level data science and<br> research jobs from 2020-2022 was digestible, 2023 saw a leap in salary<br> of over 35k."
        ),
    )
    .opt_align_table_header('left')
    .tab_style(
        style=style.text(font=google_font(name='Young Serif'), color='#646464'),
        locations=loc.title(),
    )
    # Google Fonts lists this family as 'Bebas Neue'; the previous 'Bebe Neue'
    # spelling matches no family there, so the table font could not be loaded.
    .opt_table_font(google_font(name='Bebas Neue'))
    .tab_options(
        heading_title_font_size='24px',
        heading_subtitle_font_size='14px',
        column_labels_font_size='14px',
        table_font_size='18px',
        # NOTE(review): the sibling size options here are CSS length strings
        # ('14px'); confirm a bare number is honoured for row padding.
        data_row_padding=0.75,
    )
    .data_color(
        domain=[50000, 115000],
        columns='Salary (US Dollars)',
        palette=['White', '#4584b6'],
    )
)

Entry-level salaries follow  growth pattern,Entry-level salaries follow  growth pattern
"Though the increase in average salary for entry-level data science and  research jobs from 2020-2022 was digestible, 2023 saw a leap in salary  of over 35k.","Though the increase in average salary for entry-level data science and  research jobs from 2020-2022 was digestible, 2023 saw a leap in salary  of over 35k."
Work Year,Salary (US Dollars)
2020,53128.57
2021,75623.4
2022,78811.44
2023,114576.07
