# Técnicas que serão utilizadas
* RandomForest
* Divisão de dados em treino e teste
* Feature selection
* Ánalise de overfiting ou underfiting
* Pipeline
* Otimização de hyperparametros 
* Transformação de dados categoricos em númericos

# Entendendo os dados

In [17]:
# Importando as bibliotecas que serão utilizadas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [4]:
# Carregado base de dados
df = pd.read_csv('ds_salaries.csv')

In [6]:
# Entendendo a base de dados
df.shape

(3755, 11)

In [7]:
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [10]:
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [11]:
df.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

In [12]:
df.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0,3755.0
mean,2022.373635,190695.6,137570.38988,46.271638
std,0.691448,671676.5,63055.625278,48.58905
min,2020.0,6000.0,5132.0,0.0
25%,2022.0,100000.0,95000.0,0.0
50%,2022.0,138000.0,135000.0,0.0
75%,2023.0,180000.0,175000.0,100.0
max,2023.0,30400000.0,450000.0,100.0


# Organizando os dados

Separando algumas possibilidades de features e o target.

In [13]:
# Divisão de dados em features e target
feat1 = df[['experience_level','employment_type','company_size','remote_ratio']]
feat2 = df[['experience_level','employment_type','company_size','remote_ratio','job_title']]
feat3 = df[['experience_level','employment_type','company_size','remote_ratio','job_title','company_location']]
target = df['salary_in_usd']

Transformando os dados de categoricos para númericos

In [14]:
#Transformação de dados categoricos para númericos
encoder = LabelEncoder()
categorical_features = list(feat3.columns)
for each in categorical_features:
    feat3[each] = encoder.fit_transform(feat3[each])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  feat3[each] = encoder.fit_transform(feat3[each])


In [15]:
# Divisão de dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(feat3,target,test_size = 0.2,random_state = 2)

In [18]:
# Utilizando u algoritmo Random Forest para caso de regressão
# Utilizando random_state 1 para manter o padrão em casos futuros

modelo = RandomForestRegressor(random_state=1)
modelo.fit(X_train,y_train)

RandomForestRegressor(random_state=1)

In [22]:
# Fazendo predição dos dados de teste
y_pred = modelo.predict(X_test)

In [26]:
# Avaliando desempenho do modelo
print(r2_score(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))

0.39008302762134006
36840.275736301286


In [25]:
# Verificação da influencia de cada feature
modelo.feature_importances_

array([0.16667647, 0.01191437, 0.05332094, 0.0560385 , 0.28584028,
       0.42620945])