In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import pycountry
import plotly.graph_objects as go

In [2]:
df = pd.read_csv('ds_salaries.csv')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [4]:
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [5]:
numerical_columns = df.select_dtypes(include=['int64', 'float64'])
categorical_columns = df.select_dtypes(include=['object'])

display(numerical_columns)
display(categorical_columns)

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
0,2023,80000,85847,100
1,2023,30000,30000,100
2,2023,25500,25500,100
3,2023,175000,175000,100
4,2023,120000,120000,100
...,...,...,...,...
3750,2020,412000,412000,100
3751,2021,151000,151000,100
3752,2020,105000,105000,100
3753,2020,100000,100000,100


Unnamed: 0,experience_level,employment_type,job_title,salary_currency,employee_residence,company_location,company_size
0,SE,FT,Principal Data Scientist,EUR,ES,ES,L
1,MI,CT,ML Engineer,USD,US,US,S
2,MI,CT,ML Engineer,USD,US,US,S
3,SE,FT,Data Scientist,USD,CA,CA,M
4,SE,FT,Data Scientist,USD,CA,CA,M
...,...,...,...,...,...,...,...
3750,SE,FT,Data Scientist,USD,US,US,L
3751,MI,FT,Principal Data Scientist,USD,US,US,L
3752,EN,FT,Data Scientist,USD,US,US,S
3753,EN,CT,Business Data Analyst,USD,US,US,L


In [6]:
df['work_year'] = df['work_year'].apply(str)
#df['remote_ratio'] = df['remote_ratio'].apply(str)

df['experience_level'].replace({
    'SE':'Senior Level',
    'MI':'Intermediate Level',
    'EN':'Entry Level',
    'EX':'Executive Level',
}, inplace=True)

df['employment_type'].replace({
    'FT':'Full-Time',
    'CT':'Contract',
    'PT':'Part-Time',
    'FL':'Freelance',
}, inplace=True);

df.drop(columns=['salary', 'salary_currency'], inplace=True)

# Create a dictionary mapping ISO-2 to ISO-3 country codes
iso2_to_iso3 = {country.alpha_2: country.alpha_3 for country in pycountry.countries}

# Map ISO-2 country codes to ISO-3 country codes in the DataFrame
df['company_location'] = df['company_location'].map(iso2_to_iso3)
df['employee_residence'] = df['employee_residence'].map(iso2_to_iso3)

df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,Senior Level,Full-Time,Principal Data Scientist,85847,ESP,100,ESP,L
1,2023,Intermediate Level,Contract,ML Engineer,30000,USA,100,USA,S
2,2023,Intermediate Level,Contract,ML Engineer,25500,USA,100,USA,S
3,2023,Senior Level,Full-Time,Data Scientist,175000,CAN,100,CAN,M
4,2023,Senior Level,Full-Time,Data Scientist,120000,CAN,100,CAN,M
...,...,...,...,...,...,...,...,...,...
3750,2020,Senior Level,Full-Time,Data Scientist,412000,USA,100,USA,L
3751,2021,Intermediate Level,Full-Time,Principal Data Scientist,151000,USA,100,USA,L
3752,2020,Entry Level,Full-Time,Data Scientist,105000,USA,100,USA,S
3753,2020,Entry Level,Contract,Business Data Analyst,100000,USA,100,USA,L


In [7]:
df['job_title'].describe()

count              3755
unique               93
top       Data Engineer
freq               1040
Name: job_title, dtype: object

There are 3755 job entries and 93 different job titles

In [8]:
job_titles = df['job_title'].value_counts() 
filtered_job_titles = job_titles[job_titles >30]
filtered_job_titles = filtered_job_titles.reset_index().rename(columns={'job_title': 'count', 'index': 'job_title', })

In [9]:
top_jobs_df = df[df['job_title'].isin(filtered_job_titles['job_title'])]
top_jobs_df

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
1,2023,Intermediate Level,Contract,ML Engineer,30000,USA,100,USA,S
2,2023,Intermediate Level,Contract,ML Engineer,25500,USA,100,USA,S
3,2023,Senior Level,Full-Time,Data Scientist,175000,CAN,100,CAN,M
4,2023,Senior Level,Full-Time,Data Scientist,120000,CAN,100,CAN,M
5,2023,Senior Level,Full-Time,Applied Scientist,222200,USA,0,USA,L
...,...,...,...,...,...,...,...,...,...
3746,2021,Intermediate Level,Full-Time,Data Scientist,119059,SGP,100,ISR,M
3748,2021,Intermediate Level,Full-Time,Data Engineer,28369,MLT,50,MLT,L
3750,2020,Senior Level,Full-Time,Data Scientist,412000,USA,100,USA,L
3752,2020,Entry Level,Full-Time,Data Scientist,105000,USA,100,USA,S


## Lets Get Started with the Top 5 Job Titles Around the World

In [10]:
top_jobs = top_jobs_df['job_title'].value_counts().sort_values(ascending=True)
top_jobs = top_jobs[5:]

fig = go.Figure(go.Bar(
    y=top_jobs.index,
    x=top_jobs.values,
    orientation='h'
))
fig.update_layout(
    title='From 2020 to 2023 the most demanded job is Data Engineer',
    xaxis_title='Job Listings',
    margin=dict(l=200, r=400, t=50, b=50)
)

fig.show()

## Changes in Salaries over the years

In [11]:
salaries_years = top_jobs_df.groupby(['job_title', 'work_year']).agg({'salary_in_usd':'mean'}).reset_index()
salaries_years_filtered = salaries_years[salaries_years['job_title'].isin(top_jobs.index)]
salaries_years_filtered

Unnamed: 0,job_title,work_year,salary_in_usd
0,Analytics Engineer,2022,137969.807018
1,Analytics Engineer,2023,170210.652174
4,Data Analyst,2020,42705.0
5,Data Analyst,2021,75024.952381
6,Data Analyst,2022,107207.398551
7,Data Analyst,2023,114097.47557
8,Data Architect,2021,166666.666667
9,Data Architect,2022,166091.543478
10,Data Architect,2023,157555.384615
11,Data Engineer,2020,75726.933333


In [12]:
salaries_years = top_jobs_df.groupby(['job_title', 'work_year']).agg({'salary_in_usd':'mean'}).reset_index()
salaries_years_filtered = salaries_years[salaries_years['job_title'].isin(top_jobs.index)]

# Create a separate trace for each job title
traces = []
job_titles = salaries_years_filtered['job_title'].unique()

for title in job_titles:
    data = salaries_years_filtered[salaries_years_filtered['job_title'] == title]
    trace = go.Scatter(
        x=salaries_years_filtered['work_year'],
        y=salaries_years_filtered['salary_in_usd'],
        mode='lines',
        name=title
    )
    traces.append(trace)

# Create the area graph
fig = go.Figure(data=traces)

# Customize the graph layout
fig.update_layout(
    title='Trend of Average Salaries by Job Title',
    xaxis_title='Year',
    yaxis_title='Average Salary',
    showlegend=True,
    hovermode='x',
    template='plotly_dark'
)

# Show the graph
fig.show()

In [13]:
avg_salary_by_location = top_jobs_df.groupby('company_location')['salary_in_usd'].mean().reset_index()

color_scale = ['#003300', '#115e23', '#218c45', '#32b768', '#44df8b','#5affae', '#7cfecb', '#9ffce9', '#c1fff5', '#e3ffff']
# Create a choropleth map
fig = go.Figure(data=go.Choropleth(
    locations=avg_salary_by_location['company_location'],
    z=avg_salary_by_location['salary_in_usd'],
    locationmode='ISO-3',
    colorscale=color_scale,
    colorbar_title='Average Salary'
))

# Customize the map layout
fig.update_layout(
    title='Average Salaries by Location',
    geo=dict(showframe=False, showcoastlines=False)
)

# Show the map
fig.show()