In [60]:
import numpy as np
import pandas as pd
import plotly_express as px
from plotly import graph_objects as go
from scipy.stats import ttest_ind
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st

## 1. Clean Data

In [61]:
data = pd.read_csv("data_scientist.csv")
data.job_title.unique()
data.experience_level.unique()
data.company_location.unique()

array(['ES', 'US', 'CA', 'DE', 'GB', 'NG', 'IN', 'HK', 'NL', 'CH', 'CF',
       'FR', 'FI', 'UA', 'IE', 'IL', 'GH', 'CO', 'SG', 'AU', 'SE', 'SI',
       'MX', 'BR', 'PT', 'RU', 'TH', 'HR', 'VN', 'EE', 'AM', 'BA', 'KE',
       'GR', 'MK', 'LV', 'RO', 'PK', 'IT', 'MA', 'PL', 'AL', 'AR', 'LT',
       'AS', 'CR', 'IR', 'BS', 'HU', 'AT', 'SK', 'CZ', 'TR', 'PR', 'DK',
       'BO', 'PH', 'BE', 'ID', 'EG', 'AE', 'LU', 'MY', 'HN', 'JP', 'DZ',
       'IQ', 'CN', 'NZ', 'CL', 'MD', 'MT'], dtype=object)

In [62]:
# Will be used to map the jobs to their category
job_dict = {
    'Data Scientist': [
        'Data Scientist', 'Principal Data Scientist', 'Applied Scientist', 'AI Developer',
        'Research Scientist', 'Head of Data', 'Data Science Manager', 'AI Programmer',  
        'Director of Data Science', 'Machine Learning Scientist', 'Applied Machine Learning Scientist',
        'Lead Data Scientist', 'Deep Learning Researcher', 'Data Science Consultant', 
        'Machine Learning Developer', '3D Computer Vision Researcher', 'Machine Learning Researcher',
        'Data Science Tech Lead', 'Data Scientist Lead', 'Product Data Scientist', 'Data Science Lead',
        'Machine Learning Manager', 'AI Scientist', 'Head of Data Science', 'Applied Data Scientist',
        'Head of Machine Learning'
    ],
    'Data Analyst': [
        'Data Analyst', 'Data Quality Analyst', 'Compliance Data Analyst', 'Business Data Analyst',
        'Lead Data Analyst', 'Marketing Data Analyst', 'Data Analytics Specialist', 'Insight Analyst',
        'Product Data Analyst', 'BI Data Analyst', 'Data Operations Analyst', 'Data Analytics Lead',
        'Principal Data Analyst', 'Financial Data Analyst', 'BI Analyst', 'Data Analytics Manager',
        'Data Analytics Consultant', 'Data Manager', 'Manager Data Management'
    ],
    'Data Engineer': [
        'Data Engineer', 'Data Modeler', 'Analytics Engineer', 'Business Intelligence Engineer',
        'Data Strategist', 'Data DevOps Engineer', 'Big Data Engineer', 'Data Specialist',
        'BI Data Engineer', 'Data Infrastructure Engineer', 'Cloud Database Engineer', 'ETL Engineer',
        'Data Operations Engineer', 'BI Developer', 'Azure Data Engineer', 'Computer Vision Engineer',
        'Machine Learning Infrastructure Engineer', 'Cloud Data Engineer', 'ETL Developer',
        'Data Architect', 'Big Data Architect', 'Autonomous Vehicle Technician', 'ML Engineer',
        'Machine Learning Software Engineer', 'Data Analytics Engineer', 'Research Engineer',
        'Computer Vision Software Engineer', 'Data Lead', 'Data Management Specialist',
        'Applied Machine Learning Engineer', 'MLOps Engineer', 'Machine Learning Research Engineer',
        'Deep Learning Engineer', 'Machine Learning Engineer', 'Data Science Engineer',
        'Lead Machine Learning Engineer', 'NLP Engineer', 'Principal Machine Learning Engineer',
        'Software Data Engineer', 'Principal Data Architect', 'Lead Data Engineer'
    ]
}

def map_job(job):
    for k, v_list in job_dict.items():
        if job in v_list:
            return k
    return job

In [63]:
def process(df):
    df = df.copy()
    
    # Filter year and employment type (FT - Full time)
    df = df[df.work_year.isin([2022, 2023])]
    df = df[df.employment_type == 'FT']
    
    # Map job title to job category
    df['job_category'] = df.job_title.map(map_job)
    
    # Filter company location
    country_counts = df.company_location.value_counts()
    idx = (country_counts > 10).values
    countries = country_counts[idx].index
    df = df[df.company_location.isin(countries)]
    df = df.reset_index(drop=True)
    
    # Rename experience levels
    entry_lvl_map = {'EN': 'Entry', 'MI': 'Middle',
                     'SE': 'Senior', 'EX': 'Expert'}
    df.experience_level.replace(entry_lvl_map, inplace=True)
    return df

df = process(data)
df.head()
df['company_location'].unique()

array(['ES', 'CA', 'US', 'DE', 'GB', 'IN', 'FR', 'AU', 'BR', 'PT', 'GR'],
      dtype=object)

In [64]:
# Calculate medians
job_cats = list(job_dict.keys())
salary_meds = {}
for job_cat in job_cats:
    salary_meds[job_cat] = df[df.job_category == job_cat].salary_in_usd.median()

# Color map for job categories
job_cat_cmap = {'Data Scientist':'#9cd3f7', 'Data Analyst':'#fc88a7', 'Data Engineer':'#80c779'}

# Create the histograms
fig = px.histogram(df, x='salary_in_usd', nbins=50, color='job_category', marginal='rug',
                   color_discrete_map=job_cat_cmap,
                   labels = {'salary_in_usd': 'Yearly salary in USD', 'job_category': 'Job category'}, opacity=0.5)
fig.update_layout(bargap=0.1, title = "Yearly salary distribution in US dollars<br><sup>2022/2023</sup>")

# Plot the median vertical bars
for k, v in salary_meds.items():
    fig.add_vline(x=v, line_width=6, line_color='black')
    fig.add_vline(x=v, line_width=3, line_color=job_cat_cmap[k])

fig.show()


In [65]:
# Function to create a Plotly figure
def create_plotly_figure():
    x = np.linspace(0, 10, 100)
    y = np.sin(x)
    fig = px.line(x=x, y=y, labels={'x':'X-axis', 'y':'Y-axis'}, title='Sine Wave')
    return fig

In [66]:
fig = go.Figure()

# Get the df for a single job category each time
for job in job_cats:
    
    # Prepare the temporary dataframe (filter and sort)
    temp_df = df[df.job_category == job]
    dfs = []
    sort_order = ['Entry', 'Middle', 'Senior', 'Expert']
    for x in sort_order:
        dfs.append(temp_df[temp_df.experience_level == x])
    temp_df = pd.concat(dfs)
    
    # Add box plots
    fig.add_trace(go.Box(
        y=temp_df.salary_in_usd,
        x=temp_df.experience_level,
        name=job, marker_color=job_cat_cmap[job]))

fig.update_layout(
    title="Annual salary distribution in US dollars by experience level and job category<br><sup>2022/2023</sup>",
    xaxis_title="Experience level",
    yaxis_title="Annual salary (USD)",
    boxmode='group')

fig.show()

In [67]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,job_category
0,2023,Senior,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L,Data Scientist
1,2023,Senior,FT,Data Scientist,175000,USD,175000,CA,100,CA,M,Data Scientist
2,2023,Senior,FT,Data Scientist,120000,USD,120000,CA,100,CA,M,Data Scientist
3,2023,Senior,FT,Applied Scientist,222200,USD,222200,US,0,US,L,Data Scientist
4,2023,Senior,FT,Applied Scientist,136000,USD,136000,US,0,US,L,Data Scientist


In [68]:
df =  df[["company_location", "experience_level", "job_category", "salary_in_usd"]]
df.head()

Unnamed: 0,company_location,experience_level,job_category,salary_in_usd
0,ES,Senior,Data Scientist,85847
1,CA,Senior,Data Scientist,175000
2,CA,Senior,Data Scientist,120000
3,US,Senior,Data Scientist,222200
4,US,Senior,Data Scientist,136000


In [69]:
# Check is there a lot of null
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3322 entries, 0 to 3321
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   company_location  3322 non-null   object
 1   experience_level  3322 non-null   object
 2   job_category      3322 non-null   object
 3   salary_in_usd     3322 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 103.9+ KB


In [70]:
df['company_location'].unique()

array(['ES', 'CA', 'US', 'DE', 'GB', 'IN', 'FR', 'AU', 'BR', 'PT', 'GR'],
      dtype=object)

In [71]:
# Change string to binary: using Label Encoder
from sklearn.preprocessing import LabelEncoder
le_company_location = LabelEncoder()
df['company_location'] = le_company_location.fit_transform(df['company_location'])
df['company_location'].unique()

array([ 4,  2, 10,  3,  6,  8,  5,  0,  1,  9,  7])

In [72]:
le_experience_level = LabelEncoder()
df['experience_level'] = le_experience_level.fit_transform(df['experience_level'])
df["experience_level"].unique()

array([3, 0, 2, 1])

In [73]:
le_job_category = LabelEncoder()
df["job_category"] = le_job_category.fit_transform(df['job_category'])
df["job_category"].unique()

array([2, 0, 1])

In [74]:
X = df.drop("salary_in_usd", axis=1)
y = df["salary_in_usd"]

## B. Try different model

In [75]:
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()
linear_reg.fit(X, y.values)
y_pred = linear_reg.predict(X)
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
error = np.sqrt(mean_squared_error(y, y_pred))
error

53108.01063923262

In [76]:
from sklearn.tree import DecisionTreeRegressor
dec_tree_reg = DecisionTreeRegressor(random_state=0)
dec_tree_reg.fit(X, y.values)
y_pred = dec_tree_reg.predict(X)
error = np.sqrt(mean_squared_error(y, y_pred))
print("${:,.02f}".format(error))

$48,153.49


In [77]:
X

Unnamed: 0,company_location,experience_level,job_category
0,4,3,2
1,2,3,2
2,2,3,2
3,10,3,2
4,10,3,2
...,...,...,...
3317,10,0,0
3318,10,0,1
3319,3,3,1
3320,10,0,1


In [78]:
y.values

array([ 85847, 175000, 120000, ...,  84053, 125000,  21013])

In [79]:
from sklearn.ensemble import RandomForestRegressor
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X, y.values)
y_pred = random_forest_reg.predict(X)
error = np.sqrt(mean_squared_error(y, y_pred))
print("${:,.02f}".format(error))


$48,172.02


In [80]:
from sklearn.model_selection import GridSearchCV

max_depth = [None, 2,4,6,8,10,12]
parameters = {"max_depth": max_depth}

regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(regressor, parameters, scoring='neg_mean_squared_error')
gs.fit(X, y.values)
regressor = gs.best_estimator_

regressor.fit(X, y.values)
y_pred = regressor.predict(X)
error = np.sqrt(mean_squared_error(y, y_pred))
print("${:,.02f}".format(error))

$48,310.84


In [81]:
# company_location, experience_level, job_category
X = np.array([["CA", "Senior", "Data Scientist"]])
X

array([['CA', 'Senior', 'Data Scientist']], dtype='<U14')

In [82]:
X[:, 0] = le_company_location.transform(X[:,0])
X[:, 1] = le_experience_level.transform(X[:,1])
X[:, 2] = le_job_category.transform(X[:,2])
X = X.astype(float)
X

array([[2., 3., 2.]])

In [83]:
y_pred = regressor.predict(X)
y_pred


X does not have valid feature names, but DecisionTreeRegressor was fitted with feature names



array([172118.3])

## 3. Export Model

In [84]:
import pickle

In [85]:
data = {"model": regressor, "le_company_location": le_company_location, "le_experience_level": le_experience_level, "le_job_category": le_job_category}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [86]:
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

regressor_loaded = data["model"]
le_company_location = data["le_company_location"]
le_experience_level = data["le_experience_level"]
le_job_category = data["le_job_category"]

In [87]:
y_pred = regressor_loaded.predict(X)
y_pred


X does not have valid feature names, but DecisionTreeRegressor was fitted with feature names



array([172118.3])