### 🖋 **Notebook Contents**

0. Initial Setup
1. Modelling
2. Conclusion
3. Recommendation

****

## `Initial Setup`

In [8]:
# Data Manipulation
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import missingno as msno

import warnings
warnings.filterwarnings("ignore")

# Model Algorithm (modeling)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn import linear_model
import statsmodels.api as sm

# Data Preparation
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV, StratifiedKFold, KFold
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
import category_encoders as ce
from sklearn.compose import TransformedTargetRegressor

# Evaluation metrics
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, mean_squared_log_error, r2_score

# Statistics helpers (standard library)
import statistics

In [2]:
# Load the cleaned salaries dataset from the project's processed-data folder.
# Forward slashes are used so the relative path resolves on Windows, macOS and
# Linux alike; the previous backslash form relied on invalid escape sequences
# ("\d", "\p", "\s") and only worked on Windows.
data = pd.read_csv('../data/processed/salaries_clean.csv')
data

Unnamed: 0,work_year,experience_level,employment_type,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,job_position,job_scope
0,2023,SE,FT,132000,US,100,US,M,STAFF,DATA ENGINEER
1,2023,MI,FT,81206,GB,0,GB,M,STAFF,ML/AI ENGINEER
2,2023,EX,FT,330000,US,0,US,M,HEAD,ML/AI ENGINEER
3,2023,EX,FT,188000,US,0,US,M,HEAD,ML/AI ENGINEER
4,2023,MI,FT,140000,US,0,US,M,STAFF,BUSINESS INTELLIGENCE
...,...,...,...,...,...,...,...,...,...,...
4245,2020,SE,FT,412000,US,100,US,L,STAFF,DATA SCIENTIST
4246,2021,MI,FT,151000,US,100,US,L,STAFF,DATA SCIENTIST
4247,2020,EN,FT,105000,US,100,US,S,STAFF,DATA SCIENTIST
4248,2020,EN,CT,100000,US,100,US,L,STAFF,DATA ANALYST


## `Modelling`

### Splitting

In [5]:
# Name the prediction target; every other column of `data` is a feature.
target = 'salary_in_usd'
feature = data.columns.drop(target)

display(target, feature)

'salary_in_usd'

Index(['work_year', 'experience_level', 'employment_type',
       'employee_residence', 'remote_ratio', 'company_location',
       'company_size', 'job_position', 'job_scope'],
      dtype='object')

In [6]:
# Data splitting: hold out a fraction of 0.20 of the rows as the test set.
# The fixed random_state keeps the split reproducible between runs.
train, test = train_test_split(data, test_size=0.20, random_state=7)

# Report (rows, columns) of each partition.
print(f"{train.shape} {test.shape}")

(3400, 10) (850, 10)


### Encoding

In [7]:
# Names of the object-dtype (categorical) columns.
cat = data.select_dtypes(include=object).columns
cat_frame = data[cat]

# One summary row per categorical column: dtype, number of distinct values,
# smallest/largest value, and the sorted distinct values.
pd.DataFrame({
    'column': cat_frame.columns.values,
    'type': cat_frame.dtypes.values,
    'n_unique': cat_frame.nunique().values,
    'min': cat_frame.min().values,
    'max': cat_frame.max().values,
    'sample_unique': [
        cat_frame[name].sort_values().unique() for name in cat_frame.columns
    ],
})

Unnamed: 0,column,type,n_unique,min,max,sample_unique
0,experience_level,object,4,EN,SE,"[EN, EX, MI, SE]"
1,employment_type,object,4,CT,PT,"[CT, FL, FT, PT]"
2,employee_residence,object,84,AD,ZA,"[AD, AE, AM, AR, AS, AT, AU, BA, BE, BG, BO, B..."
3,company_location,object,72,AD,ZA,"[AD, AE, AM, AR, AS, AT, AU, BA, BE, BR, BS, C..."
4,company_size,object,3,L,S,"[L, M, S]"
5,job_position,object,5,DIRECTOR,STAFF,"[DIRECTOR, HEAD, LEAD, MANAGER, STAFF]"
6,job_scope,object,9,ANALYTICS ENGINEER,RESEARCH/APPLIED SCIENTIST,"[ANALYTICS ENGINEER, BUSINESS INTELLIGENCE, DA..."


In [15]:
# Group the categorical columns by the encoder each group is routed to
# in the ColumnTransformer built further down.

# One-hot encoded columns.
ohe = [
    'employment_type',
    'job_scope',
]

# Ordinal encoded columns (explicit rank mapping is defined separately).
oren = [
    'experience_level',
    'company_size',
    'job_position',
]

# Target encoded columns.
tgen = [
    'employee_residence',
    'company_location',
]

display(ohe, oren, tgen)

['employment_type', 'job_scope']

['experience_level', 'company_size', 'job_position']

['employee_residence', 'company_location']

In [21]:
# Levels of each ordinal column, listed from lowest to highest rank.
# A level's position in its list becomes its integer code.
ordered_levels = {
    'experience_level': ['EN', 'MI', 'SE', 'EX'],
    'company_size': ['S', 'M', 'L'],
    'job_position': ['STAFF', 'LEAD', 'MANAGER', 'HEAD', 'DIRECTOR'],
}

# One {'col': ..., 'mapping': {level: rank}} entry per ordinal column.
orenMap = [
    {'col': column, 'mapping': {level: rank for rank, level in enumerate(levels)}}
    for column, levels in ordered_levels.items()
]

pd.DataFrame(orenMap)

Unnamed: 0,col,mapping
0,experience_level,"{'EN': 0, 'MI': 1, 'SE': 2, 'EX': 3}"
1,company_size,"{'S': 0, 'M': 1, 'L': 2}"
2,job_position,"{'STAFF': 0, 'LEAD': 1, 'MANAGER': 2, 'HEAD': ..."


In [22]:
# Create the encoder: each (name, encoder, columns) triple applies one encoder
# to its own group of categorical columns.
ct_steps = [
    ('One Hot Encoder', OneHotEncoder(drop='first'), ohe),
    ('Ordinal Encoder', ce.OrdinalEncoder(cols=oren, mapping=orenMap), oren),
    ('Target Encoder', ce.TargetEncoder(), tgen),
]

# Columns not named in any group are passed through unchanged.
ct = ColumnTransformer(transformers=ct_steps, remainder='passthrough')