In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pycountry
from plotly.subplots import make_subplots
pd.set_option('display.max_rows', 500)

In [2]:
df = pd.read_csv('ds_salaries.csv')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [4]:
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [5]:
numerical_columns = df.select_dtypes(include=['int64', 'float64'])
categorical_columns = df.select_dtypes(include=['object'])

display(numerical_columns)
display(categorical_columns)

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
0,2023,80000,85847,100
1,2023,30000,30000,100
2,2023,25500,25500,100
3,2023,175000,175000,100
4,2023,120000,120000,100
...,...,...,...,...
3750,2020,412000,412000,100
3751,2021,151000,151000,100
3752,2020,105000,105000,100
3753,2020,100000,100000,100


Unnamed: 0,experience_level,employment_type,job_title,salary_currency,employee_residence,company_location,company_size
0,SE,FT,Principal Data Scientist,EUR,ES,ES,L
1,MI,CT,ML Engineer,USD,US,US,S
2,MI,CT,ML Engineer,USD,US,US,S
3,SE,FT,Data Scientist,USD,CA,CA,M
4,SE,FT,Data Scientist,USD,CA,CA,M
...,...,...,...,...,...,...,...
3750,SE,FT,Data Scientist,USD,US,US,L
3751,MI,FT,Principal Data Scientist,USD,US,US,L
3752,EN,FT,Data Scientist,USD,US,US,S
3753,EN,CT,Business Data Analyst,USD,US,US,L


In [6]:
df['work_year'] = df['work_year'].apply(str)
#df['remote_ratio'] = df['remote_ratio'].apply(str)

df['experience_level'].replace({
    'SE':'Senior Level',
    'MI':'Intermediate Level',
    'EN':'Entry Level',
    'EX':'Executive Level',
}, inplace=True)

df['employment_type'].replace({
    'FT':'Full-Time',
    'CT':'Contract',
    'PT':'Part-Time',
    'FL':'Freelance',
}, inplace=True);

df.drop(columns=['salary', 'salary_currency'], inplace=True)

# Create a dictionary mapping ISO-2 to ISO-3 country codes
iso2_to_iso3 = {country.alpha_2: country.alpha_3 for country in pycountry.countries}

# Map ISO-2 country codes to ISO-3 country codes in the DataFrame
df['company_location'] = df['company_location'].map(iso2_to_iso3)
df['employee_residence'] = df['employee_residence'].map(iso2_to_iso3)

df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,Senior Level,Full-Time,Principal Data Scientist,85847,ESP,100,ESP,L
1,2023,Intermediate Level,Contract,ML Engineer,30000,USA,100,USA,S
2,2023,Intermediate Level,Contract,ML Engineer,25500,USA,100,USA,S
3,2023,Senior Level,Full-Time,Data Scientist,175000,CAN,100,CAN,M
4,2023,Senior Level,Full-Time,Data Scientist,120000,CAN,100,CAN,M
...,...,...,...,...,...,...,...,...,...
3750,2020,Senior Level,Full-Time,Data Scientist,412000,USA,100,USA,L
3751,2021,Intermediate Level,Full-Time,Principal Data Scientist,151000,USA,100,USA,L
3752,2020,Entry Level,Full-Time,Data Scientist,105000,USA,100,USA,S
3753,2020,Entry Level,Contract,Business Data Analyst,100000,USA,100,USA,L


In [7]:
df['job_title'].describe()

count              3755
unique               93
top       Data Engineer
freq               1040
Name: job_title, dtype: object

There are 3755 job entries and 93 different job titles

In [8]:
job_titles = df['job_title'].value_counts() 
filtered_job_titles = job_titles[job_titles >30]
filtered_job_titles = filtered_job_titles.reset_index().rename(columns={'job_title': 'count', 'index': 'job_title', })

In [9]:
top_jobs_df = df[df['job_title'].isin(filtered_job_titles['job_title'])]
top_jobs_df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
1,2023,Intermediate Level,Contract,ML Engineer,30000,USA,100,USA,S
2,2023,Intermediate Level,Contract,ML Engineer,25500,USA,100,USA,S
3,2023,Senior Level,Full-Time,Data Scientist,175000,CAN,100,CAN,M
4,2023,Senior Level,Full-Time,Data Scientist,120000,CAN,100,CAN,M
5,2023,Senior Level,Full-Time,Applied Scientist,222200,USA,0,USA,L
...,...,...,...,...,...,...,...,...,...
3746,2021,Intermediate Level,Full-Time,Data Scientist,119059,SGP,100,ISR,M
3748,2021,Intermediate Level,Full-Time,Data Engineer,28369,MLT,50,MLT,L
3750,2020,Senior Level,Full-Time,Data Scientist,412000,USA,100,USA,L
3752,2020,Entry Level,Full-Time,Data Scientist,105000,USA,100,USA,S


## Lets Get Started with the Top 5 Job Titles Around the World

In [10]:
top_jobs = top_jobs_df['job_title'].value_counts().sort_values(ascending=True)
top_jobs = top_jobs[5:]

fig = go.Figure(go.Bar(
    y=top_jobs.index,
    x=top_jobs.values,
    orientation='h'
))
fig.update_layout(
    title='From 2020 to 2023 the most demanded job is Data Engineer',
    xaxis_title='Job Listings',
    margin=dict(l=200, r=400, t=50, b=50)
)

fig.show()

## Changes in Salaries over the years

In [11]:
salaries_years = top_jobs_df.groupby(['job_title', 'work_year']).agg({'salary_in_usd':'mean'}).reset_index()
salaries_years_filtered = salaries_years[salaries_years['job_title'].isin(top_jobs.index)]
salaries_years_filtered

Unnamed: 0,job_title,work_year,salary_in_usd
0,Analytics Engineer,2022,137969.807018
1,Analytics Engineer,2023,170210.652174
4,Data Analyst,2020,42705.0
5,Data Analyst,2021,75024.952381
6,Data Analyst,2022,107207.398551
7,Data Analyst,2023,114097.47557
8,Data Architect,2021,166666.666667
9,Data Architect,2022,166091.543478
10,Data Architect,2023,157555.384615
11,Data Engineer,2020,75726.933333


In [12]:
salaries_years = top_jobs_df.groupby(['job_title', 'work_year']).agg({'salary_in_usd':'mean'}).reset_index()
salaries_years_filtered = salaries_years[salaries_years['job_title'].isin(top_jobs.index)]

# Create a separate trace for each job title
traces = []
job_titles = salaries_years_filtered['job_title'].unique()

for title in job_titles:
    data = salaries_years_filtered[salaries_years_filtered['job_title'] == title]
    trace = go.Scatter(
        x=salaries_years_filtered['work_year'],
        y=salaries_years_filtered['salary_in_usd'],
        mode='lines',
        name=title
    )
    traces.append(trace)

# Create the area graph
fig = go.Figure(data=traces)

# Customize the graph layout
fig.update_layout(
    title='Trend of Average Salaries by Job Title',
    xaxis_title='Year',
    yaxis_title='Average Salary',
    showlegend=True,
    hovermode='x',
    template='plotly_dark'
)

# Show the graph
fig.show()

In [13]:
filtered = top_jobs_df.groupby(['company_location'])['salary_in_usd'].mean().reset_index()
filtered = filtered.sort_values(by='salary_in_usd', ascending=False)
fig = go.Figure(data=(go.Bar(
    x=filtered['company_location'],
    y=filtered['salary_in_usd'],
)))

fig.update_layout(
    title='Average Salary for Top Jobs',
    xaxis_title='Job Title',
    yaxis_title='Salary (USD)',
    yaxis=dict(title='Salary (USD)', tickprefix='$'),
    margin=dict(l=50, r=50, t=50, b=50)
)
fig.show()

In [14]:
grouped_data = top_jobs_df.groupby('company_location').agg({'salary_in_usd': 'mean', 'job_title': 'count'}).reset_index()

# Sort the data by average salary in descending order
sorted_data = grouped_data.sort_values('salary_in_usd', ascending=False)

fig = go.Figure()

# Add the bar trace for average salary
fig.add_trace(go.Bar(
    x=sorted_data['company_location'],
    y=sorted_data['salary_in_usd'],
    name='Average Salary',
    marker_color='green',
    yaxis='y1'
))

# Add the line trace for job title count
fig.add_trace(go.Scatter(
    x=sorted_data['company_location'],
    y=sorted_data['job_title'],
    mode='lines+markers',
    name='Job Title Count',
    marker_color='red',
    yaxis='y2'
))

fig.update_layout(
    title='Average Salary and Job Title Count by Company Location',
    xaxis_title='Company Location',
    yaxis=dict(title='Average Salary (USD)', tickprefix='$', side='left', showgrid=False),
    yaxis2=dict(title='Job Title Count', side='right', overlaying='y', showgrid=False),
    margin=dict(l=50, r=50, t=50, b=50),
    barmode='group',
    showlegend=True
)

fig.show()

In [15]:
experience_df = top_jobs_df.groupby(['experience_level','company_size']).agg({'salary_in_usd':'mean'}).reset_index()

fig = go.Figure()

category_order = ['Entry Level', 'Intermediate Level', 'Senior Level', 'Executive Level']
colors = {'S': 'gray', 'M': 'blue', 'L': 'gray'}

for company_size, color in colors.items():
    filtered_data = experience_df[experience_df['company_size'] == company_size]
    fig.add_trace(go.Bar(
        x=filtered_data['experience_level'],
        y=filtered_data['salary_in_usd'],
        name=company_size,
        marker=dict(color=color),
    ))

    fig.update_layout(
    title='Among All Experience Levels, Salaries offered by Medium Companies are on Average higher than Small and Large Companies',
    xaxis_title='Experience Level',
    yaxis_title='Salary (USD)',
    showlegend=False,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    xaxis=dict(categoryorder='array', categoryarray=category_order),
    yaxis=dict(showgrid=False),
    title_x=0.1,
    )

fig.add_annotation(
    text='Medium Size Company',
    x='Entry Level',
    y=100000,
)
fig.add_annotation(
    text='Medium Size Company',
    x='Intermediate Level',
    y=114000,
)
fig.add_annotation(
    text='Medium Size Company',
    x='Senior Level',
    y=155000,
)
fig.add_annotation(
    text='Medium Size Company',
    x='Executive Level',
    y=200000,
    
)

fig.show()


In [16]:
job_counts = df.groupby(['work_year', 'job_title']).size().reset_index(name='count')

# Sort the data by count in descending order
job_counts_sorted = job_counts.sort_values(['work_year', 'count'], ascending=[True, False])

# Get the top 5 job titles for each year
top_jobs_per_year = job_counts_sorted.groupby('work_year').head(5)

# Print the result
top_jobs_per_year


Unnamed: 0,work_year,job_title,count
9,2020,Data Scientist,21
6,2020,Data Engineer,15
5,2020,Data Analyst,8
15,2020,Machine Learning Engineer,4
2,2020,Big Data Engineer,3
45,2021,Data Scientist,44
40,2021,Data Engineer,38
36,2021,Data Analyst,21
56,2021,Machine Learning Engineer,18
66,2021,Research Scientist,10


In [17]:
tj2020 = top_jobs_per_year[top_jobs_per_year['work_year'] == '2020']
tj2021 = top_jobs_per_year[top_jobs_per_year['work_year'] == '2021']
tj2022 = top_jobs_per_year[top_jobs_per_year['work_year'] == '2022']
tj2023 = top_jobs_per_year[top_jobs_per_year['work_year'] == '2023']


fig = make_subplots(rows=2, cols=2,
                    subplot_titles=('Top 5 Jobs in 2020', 
                                    'Top 5 Jobs in 2021',
                                    'Top 5 Jobs in 2022',
                                    'Top 5 Jobs in 2023'),
                    )


fig.add_trace(go.Bar(
    x=tj2020['job_title'],
    y=tj2020['count'],
    orientation='v',
    marker=dict(color=['green', 'purple', 'gray','gray','gray'])),
    row=1, col=1,
    
)

fig.add_trace(go.Bar(
    x=tj2021['job_title'],
    y=tj2021['count'],
    orientation='v',
    marker=dict(color=['green', 'purple', 'gray','gray','gray'])),
    row=1, col=2
)

fig.add_trace(go.Bar(
    x=tj2022['job_title'],
    y=tj2022['count'],
    orientation='v',
    marker=dict(color=['purple', 'green', 'gray','gray','gray'])),
    row=2, col=1
)

fig.add_trace(go.Bar(
    x=tj2023['job_title'],
    y=tj2023['count'],
    orientation='v',
    marker=dict(color=['purple', 'green', 'gray','gray','gray'])),
    row=2, col=2,
)

fig.update_layout(showlegend=False)

In [18]:
fig = go.Figure()

fig.add_trace(go.Line(
    x=top_jobs_per_year['work_year'],
    y=top_jobs_per_year['count']
))


plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [22]:

fig = go.Figure()

category_order = ['Entry Level', 'Intermediate Level', 'Senior Level', 'Executive Level']
colors = {'S': 'gray', 'M': 'blue', 'L': 'gray'}

for company_size, color in colors.items():
    filtered_data = experience_df[experience_df['company_size'] == company_size]
    fig.add_trace(go.Bar(
        x=filtered_data['experience_level'],
        y=filtered_data['salary_in_usd'],
        name=company_size,
        marker=dict(color=color),


In [None]:


# Filter the data for the top 5 countries
top_countries = top_jobs_df['company_location'].value_counts().head(5).index
filtered_data = top_jobs_df[top_jobs_df['company_location'].isin(top_countries)]

# Calculate the global average salary
global_avg_salary = top_jobs_df['salary_in_usd'].mean()

# Create a bar chart to compare average salary per country
fig = go.Figure()

for country in top_countries:
    country_data = filtered_data[filtered_data['company_location'] == country]
    avg_salary = country_data['salary_in_usd'].mean()
    
    fig.add_trace(go.Bar(
        x=[country],
        y=[avg_salary],
        name=country,
        marker=dict(color='blue')
    ))

fig.add_trace(go.Scatter(
    x=top_countries,
    y=[global_avg_salary] * len(top_countries),
    name='Global Average',
    mode='lines',
    line=dict(color='red', dash='dash')
))

fig.update_layout(
    title='Average Salary by Country (Top 5)',
    xaxis_title='Country',
    yaxis_title='Salary (USD)'
)

fig.show()


In [None]:


# Filter the data for the top 5 countries
top_countries = top_jobs_df['job_title'].value_counts().head(5).index
filtered_data = top_jobs_df[top_jobs_df['job_title'].isin(top_countries)]

# Calculate the global average salary
global_avg_salary = top_jobs_df['salary_in_usd'].mean()

# Create a bar chart to compare average salary per country
fig = go.Figure()

for country in top_countries:
    country_data = filtered_data[filtered_data['job_title'] == country]
    avg_salary = country_data['salary_in_usd'].mean()
    
    fig.add_trace(go.Bar(
        x=[country],
        y=[avg_salary],
        name=country,
        marker=dict(color='blue')
    ))

fig.add_trace(go.Scatter(
    x=top_countries,
    y=[global_avg_salary] * len(top_countries),
    name='Global Average',
    mode='lines',
    line=dict(color='red', dash='dash')
))

fig.update_layout(
    title='Average Salary by Country (Top 5)',
    xaxis_title='Country',
    yaxis_title='Salary (USD)'
)

fig.show()


In [None]:
top_countries = top_jobs_df['company_location'].value_counts().head(5).index

# Filter the data for the top 5 countries
filtered_data = top_jobs_df[top_jobs_df['company_location'].isin(top_countries)]

# Calculate the global average salary
global_avg_salary = top_jobs_df['salary_in_usd'].mean()

# Create a bar chart to compare average salary per job title within the top 5 countries
fig = go.Figure()

for country in top_countries:
    country_data = filtered_data[filtered_data['company_location'] == country]
    
    # Calculate the average salary per job title
    avg_salary_per_title = country_data.groupby('job_title')['salary_in_usd'].mean()
    
    fig.add_trace(go.Bar(
        x=avg_salary_per_title.index,
        y=avg_salary_per_title,
        name=country
    ))

fig.add_trace(go.Scatter(
    x=top_jobs_df['job_title'].unique(),
    y=[global_avg_salary] * len(top_jobs_df['job_title'].unique()),
    name='Global Average',
    mode='lines',
    line=dict(color='red', dash='dash')
))

fig.update_layout(
    title='Average Salary by Job Title (Top 5 Countries)',
    xaxis_title='Job Title',
    yaxis_title='Salary (USD)'
)

fig.show()


In [None]:
top_jobs_df

In [None]:
import plotly.graph_objects as go
fig = go.Figure(go.Scatter(x=[0, 1], y=[10, 6], mode='lines+markers+text', 
                           text=['start', 'end'], textposition=['middle left', 'middle right']))
fig.add_shape(type='line', x0=0, x1=0, y0=0, y1=1, xref='x', yref='paper')
fig.add_shape(type='line', x0=1, x1=1, y0=0, y1=1, xref='x', yref='paper')
fig.show()

In [None]:
df_2020 = top_jobs_df[top_jobs_df['work_year'] == '2020'].nlargest(5, 'salary_in_usd')
df_2023 = top_jobs_df[top_jobs_df['work_year'] == '2023'].nlargest(5, 'salary_in_usd')


# Create the trace for 2020
trace_2020 = go.Scatter(
    y=df_2020['job_title'],
    x=df_2020['salary_in_usd'],
    name='2020',
    mode='lines+markers',
    line=dict(color='blue'),
    marker=dict(color='blue')
)

# Create the trace for 2023
trace_2023 = go.Scatter(
    y=df_2023['job_title'],
    x=df_2023['salary_in_usd'],
    name='2023',
    mode='lines+markers',
    line=dict(color='red'),
    marker=dict(color='red')
)

# Create the slope graph
fig = go.Figure(data=[trace_2020, trace_2023])

fig.update_layout(
    title='Top Jobs: Salary Comparison (2020 vs 2023)',
    xaxis_title='Job Title',
    yaxis_title='Salary (USD)',
    showlegend=True,
    legend=dict(x=0.1, y=0.9),
    plot_bgcolor='rgba(0,0,0,0)'
)

fig.show()


In [None]:
df_2023 = top_jobs_df.groupby(['job_title', 'work_year']).agg({'salary_in_usd':'mean'}).reset_index()
df_2023_filtered = df_2023[df_2023['work_year'] == '2023']
df_2020_filtered = df_2023[df_2023['work_year'] == '2020']


fig = go.Figure(go.Scatter(
    y=df_2023_filtered['salary_in_usd'],
    x=df_2023_filtered['work_year'],
    text=df_2023_filtered['job_title'],
    mode='markers'
))
fig.add_trace(go.Scatter(
    y=df_2020_filtered['salary_in_usd'],
    x=df_2020_filtered['work_year'],
    text=df_2020_filtered['job_title'],
    mode='markers'
))

fig.add_trace(go.Scatter(
    y=[df_2023_filtered['salary_in_usd'].values[-1], df_2020_filtered['salary_in_usd'].values[0]],
    x=[df_2023_filtered['work_year'].values[-1], df_2020_filtered['work_year'].values[0]],
    mode='lines',
    name='Connect',
))

fig.show()

In [None]:
df_2023