In [352]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn import datasets

In [353]:
# Load the salaries dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='ds_salaries.csv')

In [354]:
# Number of distinct values in each column
df.nunique(axis=0)

work_year                4
experience_level         4
employment_type          4
job_title               93
salary                 815
salary_currency         20
salary_in_usd         1035
employee_residence      78
remote_ratio             3
company_location        72
company_size             3
dtype: int64

In [355]:
# Count of missing values per column
df.isna().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [356]:
# Count duplicated columns: after transposing, every original column becomes a
# row, so duplicated() flags columns whose values repeat an earlier column.
df_transposed = df.T
is_repeated = df_transposed.duplicated()
duplicates = df_transposed.index[is_repeated].nunique()
duplicates

0

In [357]:
# Preview the first five rows
df.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [358]:
# Yearly inflation rates keyed by calendar year: one table for salaries paid
# in USD, one for every other currency.
us_inflation_rates = {
    2019: 0.0181,
    2020: 0.0123,
    2021: 0.0470,
    2022: 0.065,
}
global_inflation_rates = {
    2019: 0.0219,
    2020: 0.0192,
    2021: 0.0350,
    2022: 0.088,
}


def adjust_salary(row):
    """Compound a record's USD salary from its work year up to 2023.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'work_year', 'salary_in_usd' and 'salary_currency'.

    Returns
    -------
    The salary multiplied by (1 + rate) for every year from 'work_year'
    through 2022. Rates come from ``us_inflation_rates`` when the currency
    is 'USD' and from ``global_inflation_rates`` otherwise. A 2023 salary
    is returned unchanged.

    Raises
    ------
    KeyError
        If a year in the compounding range has no entry in the rate table
        (e.g. a work year before 2019).
    """
    work_year = row['work_year']
    usd_salary = row['salary_in_usd']
    paid_in = row['salary_currency']

    # 2023 is the reference year, so nothing needs to be compounded
    if work_year == 2023:
        return usd_salary

    # The currency is fixed per row, so the rate table can be chosen once
    rate_table = us_inflation_rates if paid_in == 'USD' else global_inflation_rates

    compounded = usd_salary
    for yr in range(work_year, 2023):
        compounded = compounded * (1 + rate_table[yr])
    return compounded

# Compute the inflation-adjusted salary row by row and store it as a new column
df['adjusted_salary'] = df.apply(adjust_salary, axis='columns')

#------------
#credit : @rrrrrrita
#------------

In [359]:
# Bin edges sit at the 0/7, 1/7, ..., 7/7 quantiles of the adjusted salary
quantiles = [k / 7 for k in range(8)]
bin_edges = list(map(df['adjusted_salary'].quantile, quantiles))

# Turn the continuous salary into 7 labelled, quantile-based bands;
# include_lowest=True keeps the minimum salary inside the first band
salary_labels = ['low', 'low-mid', 'mid', 'mid-high', 'high', 'very-high', 'Top']
df['salary_range'] = pd.cut(
    df['adjusted_salary'],
    bins=bin_edges,
    labels=salary_labels,
    include_lowest=True,
)

In [360]:
# Check the two new columns (adjusted_salary, salary_range) on the first rows
df.head(n=5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,adjusted_salary,salary_range
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L,85847.0,low-mid
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S,30000.0,low
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S,25500.0,low
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M,175000.0,very-high
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M,120000.0,mid


In [361]:
# Remove the salary columns that salary_range was derived from, so the target
# cannot leak into the features. Rebinding df (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: a second run no longer
# raises KeyError for columns that are already gone.
df = df.drop(
    columns=['salary_currency', 'salary_in_usd', 'adjusted_salary', 'salary'],
    errors='ignore',
)

In [362]:
# Confirm the salary columns are gone and only features + salary_range remain
df.head(5)

Unnamed: 0,work_year,experience_level,employment_type,job_title,employee_residence,remote_ratio,company_location,company_size,salary_range
0,2023,SE,FT,Principal Data Scientist,ES,100,ES,L,low-mid
1,2023,MI,CT,ML Engineer,US,100,US,S,low
2,2023,MI,CT,ML Engineer,US,100,US,S,low
3,2023,SE,FT,Data Scientist,CA,100,CA,M,very-high
4,2023,SE,FT,Data Scientist,CA,100,CA,M,mid


In [363]:
# Define the X and y
# Select by column name rather than by position, so adding or removing a
# column upstream cannot silently shift which columns end up as features.
X = df.drop(columns='salary_range')
y = df['salary_range']

In [364]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [365]:
# Numeric preprocessing steps: median imputation (with a missing-value
# indicator) followed by standardisation
scaler = StandardScaler()
Imp_median = SimpleImputer(add_indicator=True, strategy='median')

In [381]:
# Non-numeric preprocessing steps: constant-value imputation, then one-hot
# encoding that ignores categories unseen during fitting
ohe = OneHotEncoder(handle_unknown='ignore')
Imp_constant = SimpleImputer(strategy='constant')

In [382]:
# Select columns by data type: numeric dtypes vs. everything else
con_cols = make_column_selector(dtype_exclude='number')
num_cols = make_column_selector(dtype_include='number')

In [383]:
# Combine all preprocessing: each column group is routed through its own pipeline
numeric_pipeline = make_pipeline(Imp_median, scaler)
categorical_pipeline = make_pipeline(Imp_constant, ohe)
preprocessor = make_column_transformer(
    (numeric_pipeline, num_cols),
    (categorical_pipeline, con_cols),
)

In [384]:
# Keep 80% of the rows for training; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=99,
)

In [391]:
# Define the models
# The tree ensembles are seeded (same seed as the train/test split) so that
# repeated runs of this cell report the same scores.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Random Forest', RandomForestClassifier(random_state=99)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=99))
]

# Model training, evaluation, and selection
best_model = None
best_score = -np.inf
best_cross_validation = None  # CV score belonging to best_model, not to the last model tried

print("Model performance:")
for name, model in models:
    # Full pipeline: shared preprocessing followed by the candidate classifier
    pipe = make_pipeline(preprocessor, model)
    pipe.fit(X_train, y_train)

    # Hold-out accuracy on the test split
    y_pred = pipe.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Cross validate the pipeline on the full data set (mean score over the folds)
    crosss_validation = cross_val_score(pipe, X, y).mean()

    print(f"{name} - Accuracy: {accuracy:.2f} - Cross Validation: {crosss_validation}")

    # The winner is chosen on hold-out accuracy; remember its CV score as well so
    # the summary line reports the matching value
    if accuracy > best_score:
        best_score = accuracy
        best_model = name
        best_cross_validation = crosss_validation

print(f"Best model: {best_model} with accuracy: {best_score:.2f}, and Cross Validation: {best_cross_validation}")

Model performance:
Logistic Regression - Accuracy: 0.33 - Cross Validation: 0.31957390146471376
Random Forest - Accuracy: 0.32 - Cross Validation: 0.2817576564580559
Gradient Boosting - Accuracy: 0.30 - Cross Validation: 0.30972037283621834
Best model: Logistic Regression with accuracy: 0.33
