# 📜 Projeto Final - Capacitação IA (Ciclo 2)
# 🎓 Aluno: Filipe da Silva Rodrigues

## 💻 Bibliotecas Necessárias

In [9]:
# Instalação de bibliotecas necessárias para execução do código
%pip install numpy pandas scikit-learn mlflow xgboost lightgbm --quiet

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Tratamento de Dataset e Métricas
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error

# Modelos de Treinamento
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
# Multi-layer Perceptron (MLP)
from sklearn.neural_network import MLPRegressor
# Support Vector Machine
from sklearn.svm import SVR
# Random Forest, Bagging e Gradient Boosting
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor
# XGBoost
from xgboost import  XGBRegressor
# LightGBM
from lightgbm import LGBMRegressor

# Armazenamento e Análise de Modelos
import mlflow
import mlflow.sklearn

# Terminal
import warnings
from IPython.display import clear_output
warnings.filterwarnings("ignore")


---

👾 **Dataset de Regressão - Hugging Face: Einstellung/demo-salaries**

Esse dataframe é um conjunto de dados que contém informações sobre salários e características de diferentes cargos na área de ciência de dados. As variáveis são:

- `work_year`: o ano em que o salário foi reportado (ex: 2023).
- `experience_level`: o nível de experiência do funcionário (EN = Júnior, MI = Pleno, SE = Sênior, EX = Executivo).
- `employment_type`: o tipo de emprego (PT = Meio período, FT = Tempo integral, CT = Contrato, FL = Freelance).
- `job_title`: o título do cargo do funcionário (ex: Data Scientist, Data Engineer).
- `salary`: o salário anual bruto reportado.
- `salary_currency`: a moeda na qual o salário foi pago (ex: USD, EUR).
- `salary_in_usd`: o salário anual bruto convertido para USD.
- `employee_residence`: o país de residência do funcionário (ex: US, CA, GB).
- `remote_ratio`: a proporção de trabalho remoto (0 = Presencial, 50 = Híbrido, 100 = Totalmente remoto).
- `company_location`: o país onde a empresa está localizada.
- `company_size`: o tamanho da empresa (S = Pequena, M = Média, L = Grande).

✅ **Objetivo:** Prever qual salário anual em USD de um funcionário de acordo com as características coletadas.

---


In [2]:
# Carregar o dataset
url = 'https://huggingface.co/datasets/Einstellung/demo-salaries/resolve/main/ds_salaries.csv'
dataset = pd.read_csv(url)

# Analisar o dataset
print('\nInformações do Dataset:\n')
display(dataset.info())

print('\nVerificar Valores Nulos:\n')
display(dataset.isnull().sum())

print('\nVerificar Valores Únicos em Features Categóricas:\n')
for col in dataset.select_dtypes(include=['object']).columns:
    print(f'{col}: {dataset[col].nunique()} unique values')

# Exibir o dataset original
print('\nDataset Original:\n')
display(dataset)

# Criar uma cópia do dataset para efetuar os devidos tratamentos
df = dataset.copy()

# Normalizando os dados das features na escala (0..1)
columns_to_normalize = ['salary', 'remote_ratio', 'work_year']
df[columns_to_normalize] = MinMaxScaler().fit_transform(df[columns_to_normalize])

# Separar os dados para o tratamento de features categóricas
target = df['salary_in_usd'].copy()
features = df.drop('salary_in_usd', axis=1).copy()

# Convertendo features categóricas para números com OneHotEncoder
categorical_columns = ['experience_level', 'employment_type', 'job_title', 'salary_currency',
                       'employee_residence', 'company_location', 'company_size']

column_transform = make_column_transformer(
    (OneHotEncoder(drop='first'), categorical_columns), remainder='passthrough')

# Transformando os dados
features_transformed = column_transform.fit_transform(features)
columns_names = column_transform.get_feature_names_out()

# Transformando o resultado em um DataFrame
features_transformed_df = pd.DataFrame(
    data=features_transformed.toarray(), columns=columns_names)

# Dicionário para mapear as colunas a serem renomeadas
rename_mapping = {col: col.replace('onehotencoder__', '').replace('remainder__', '') 
                  for col in features_transformed_df.columns}

# Renomeando as colunas
features_transformed_df.rename(columns=rename_mapping, inplace=True)

# Combinando as features transformadas com o target
df = pd.concat(
    [features_transformed_df, target.reset_index(drop=True)], axis=1)

# Exibindo o DataFrame tratado com as colunas renomeadas
print('\nDataset Tratado para Treinamento:\n')
display(df)

# Análise de correlação entre as features
print('\nMatriz de Correlação:\n')
correlation_matrix = df.corr()
display(correlation_matrix)

# Separando os dados 
y = df['salary_in_usd']  # Coluna 'salary_in_usd'
x = df.drop('salary_in_usd', axis=1)  # Todas as outras colunas

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=None)

# Aplicar Recursive Filter Elimination (RFE) para Seleção de Features
num_features = 50  # Número de features a serem selecionadas
estimator = DecisionTreeRegressor()
# estimator = LogisticRegression(max_iter=1000)
# estimator = Lasso(alpha=0.01, max_iter=1000)
selector = RFE(estimator, n_features_to_select=num_features)

# Fit e Transform no Conjunto de Treinamento
x_train_selected = selector.fit_transform(x_train, y_train)

# Transform no Conjunto de Teste
x_test_selected = selector.transform(x_test)

display(x_train_selected.shape, x_test_selected.shape)


Informações do Dataset:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


None


Verificar Valores Nulos:



work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64


Verificar Valores Únicos em Features Categóricas:

experience_level: 4 unique values
employment_type: 4 unique values
job_title: 93 unique values
salary_currency: 20 unique values
employee_residence: 78 unique values
company_location: 72 unique values
company_size: 3 unique values

Dataset Original:



Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M
...,...,...,...,...,...,...,...,...,...,...,...
3750,2020,SE,FT,Data Scientist,412000,USD,412000,US,100,US,L
3751,2021,MI,FT,Principal Data Scientist,151000,USD,151000,US,100,US,L
3752,2020,EN,FT,Data Scientist,105000,USD,105000,US,100,US,S
3753,2020,EN,CT,Business Data Analyst,100000,USD,100000,US,100,US,L



Dataset Tratado para Treinamento:



Unnamed: 0,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_FL,employment_type_FT,employment_type_PT,job_title_AI Developer,job_title_AI Programmer,job_title_AI Scientist,job_title_Analytics Engineer,...,company_location_TR,company_location_UA,company_location_US,company_location_VN,company_size_M,company_size_S,work_year,salary,remote_ratio,salary_in_usd
0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.000000,0.002435,1.0,85847
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.000000,0.000790,1.0,30000
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.000000,0.000642,1.0,25500
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.000000,0.005560,1.0,175000
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.000000,0.003751,1.0,120000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3750,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.013358,1.0,412000
3751,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.333333,0.004771,1.0,151000
3752,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.000000,0.003257,1.0,105000
3753,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.003093,1.0,100000



Matriz de Correlação:



Unnamed: 0,experience_level_EX,experience_level_MI,experience_level_SE,employment_type_FL,employment_type_FT,employment_type_PT,job_title_AI Developer,job_title_AI Programmer,job_title_AI Scientist,job_title_Analytics Engineer,...,company_location_TR,company_location_UA,company_location_US,company_location_VN,company_size_M,company_size_S,work_year,salary,remote_ratio,salary_in_usd
experience_level_EX,1.000000,-0.092433,-0.252152,-0.009144,0.001938,-0.011933,-0.009591,-0.004085,0.012254,0.046308,...,-0.006461,-0.005778,0.022562,-0.002888,-0.003061,0.012020,0.003156,0.014783,0.007190,0.160986
experience_level_MI,-0.092433,1.000000,-0.744400,0.035964,-0.033295,-0.006230,-0.004301,-0.012059,0.015640,-0.036080,...,0.052106,-0.017059,-0.255712,-0.008526,-0.097174,0.060936,-0.128381,0.044729,-0.000650,-0.273791
experience_level_SE,-0.252152,-0.744400,1.000000,-0.040667,0.113486,-0.096100,-0.045802,-0.032896,-0.067133,0.041563,...,-0.036503,0.005553,0.324686,-0.023258,0.236746,-0.163489,0.194923,-0.043809,-0.035201,0.349900
employment_type_FL,-0.009144,0.035964,-0.040667,1.000000,-0.517998,-0.003485,-0.002801,-0.001193,-0.003380,-0.008678,...,-0.001887,0.156722,-0.053906,-0.000843,-0.047840,0.095761,-0.050350,0.007545,0.025238,-0.070292
employment_type_FT,0.001938,-0.033295,0.113486,-0.517998,1.000000,-0.676019,0.005407,0.002303,-0.076258,0.000246,...,0.003643,-0.079394,0.082093,0.001628,0.125424,-0.173783,0.116310,0.006731,-0.068702,0.118263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
company_size_S,0.012020,0.060936,-0.163489,0.095761,-0.173783,0.108664,0.064994,-0.004676,0.091809,-0.025639,...,-0.007397,0.035342,-0.229439,-0.003306,-0.463577,1.000000,-0.257948,0.027367,0.108512,-0.190663
work_year,0.003156,-0.128381,0.194923,-0.050350,0.116310,-0.093825,0.027726,0.004219,-0.064922,0.017725,...,-0.051424,0.005969,0.267002,0.014787,0.421975,-0.257948,1.000000,-0.094724,-0.236430,0.228290
salary,0.014783,0.044729,-0.043809,0.007545,0.006731,-0.011125,-0.004319,-0.004664,0.008242,-0.009838,...,-0.004137,-0.006460,-0.101413,-0.004343,-0.136249,0.027367,-0.094724,1.000000,0.028731,-0.023676
remote_ratio,0.007190,-0.000650,-0.035201,0.025238,-0.068702,0.041919,-0.016126,0.001772,0.034475,0.027991,...,-0.004714,-0.005896,-0.077706,-0.015545,-0.154550,0.108512,-0.236430,0.028731,1.000000,-0.064171


(3004, 50)

(751, 50)

## 🧪 Experimentos no MLFLOW

In [3]:
# Definir os modelos e suas variações de parâmetros
models = {
    "DecisionTree": [
        {"criterion": "squared_error", "max_depth": 10, "min_samples_split": 4},
        {"criterion": "squared_error", "max_depth": 20, "min_samples_split": 10},
        {"criterion": "friedman_mse", "max_depth": 15, "min_samples_split": 5},
    ],
    "SVR": [
        {"C": 1.0, "kernel": "linear", "epsilon": 0.1},
        {"C": 10.0, "kernel": "rbf", "epsilon": 0.01},
        {"C": 100.0, "kernel": "poly", "degree": 3, "epsilon": 0.001},
    ],
    "MLPRegressor": [
        {"hidden_layer_sizes": (100, 50), "activation": "relu",
         "solver": "adam", "max_iter": 1000},
        {"hidden_layer_sizes": (50, 50, 50), "activation": "tanh",
         "solver": "adam", "max_iter": 1000},
        {"hidden_layer_sizes": (100, 50), "activation": "relu",
         "solver": "lbfgs", "max_iter": 1000},
    ],
    "Bagging": [
        {},
    ],
    "RandomForest": [
        {},
    ],
    "GradientBoosting": [
        {},
    ],
    "XGBoost": [
        {},
    ],
    "LightGBM": [
        {},
    ]
}


# Mapeamento de nomes de modelos para classes
model_classes = {
    "DecisionTree": DecisionTreeRegressor,
    "SVR": SVR,
    "MLPRegressor": MLPRegressor,
    "Bagging": BaggingRegressor,
    "RandomForest": RandomForestRegressor,
    "GradientBoosting": GradientBoostingRegressor,
    "XGBoost": XGBRegressor,
    "LightGBM": LGBMRegressor,
}

In [4]:
# Preparar o ambiente do MLFlow e início do experimento

# lista para armazenar os resultados
results = []

# Iniciar o experimento
mlflow.set_experiment("exp_projeto_ciclo_2")

# Contador para evitar conflitos de nomes
counter = 0

# Run principal
with mlflow.start_run(run_name="Projeto Final Ciclo 2") as main_run: 
    for model_name, param_variations in models.items():
        for params in param_variations:
            counter += 1
            # Run aninhada
            with mlflow.start_run(run_name=f"{counter}. {model_name}", nested=True):
                # Instanciar o modelo usando o dicionário de classes
                model = model_classes[model_name](**params)

                # Realizar validação cruzada para 10 folds e calcular as previsões
                predictions = cross_val_predict(
                    model, x_train_selected, y_train, cv=10)

                # Calcular as métricas
                rmse = np.sqrt(mean_squared_error(y_train, predictions))
                mae = mean_absolute_error(y_train, predictions)
                mape = np.mean(np.abs((y_train - predictions) / y_train)) * 100

                # Registrar os parâmetros
                mlflow.log_param("model_name", model_name)

                # Registrar parâmetros individualmente
                for key, value in params.items():
                    mlflow.log_param(key, str(value))

                mlflow.log_metric("RMSE", rmse)
                mlflow.log_metric("MAE", mae)
                mlflow.log_metric("MAPE", mape)
                mlflow.sklearn.log_model(model, f"{model_name}")

                # Armazenar resultados
                results.append({"model": model_name, "params": params,
                               "RMSE": rmse, "MAE": mae, "MAPE": mape})

    # Selecionar os 3 melhores modelos com base na métrica MAPE
    best_models = sorted(results, key=lambda x: x["MAPE"])[:3]

    # Limpar a saída do terminal
    clear_output(wait=True)

    # Exibir os melhores modelos e suas métricas
    print("Melhores Modelos:")
    for model_info in best_models:
        print(model_info)

Melhores Modelos:
{'model': 'XGBoost', 'params': {}, 'RMSE': 9693.681797900808, 'MAE': 1536.957633962009, 'MAPE': 3.3519985108268604}
{'model': 'DecisionTree', 'params': {'criterion': 'squared_error', 'max_depth': 20, 'min_samples_split': 10}, 'RMSE': 11133.620117851986, 'MAE': 1557.6362824435148, 'MAPE': 4.2063453437292875}
{'model': 'RandomForest', 'params': {}, 'RMSE': 9961.10780497899, 'MAE': 1354.4603362183755, 'MAPE': 4.21842383334923}


## 💾 Modelos Registrados no MLFLOW

In [9]:
import subprocess

# Definir o tracking URI do MLfloww
mlflow_tracking_uri = 'http://localhost:5000'
mlflow.set_tracking_uri(mlflow_tracking_uri)

# Iniciar o MLflow UI em um subprocesso separado
mlflow_process = subprocess.Popen(["mlflow", "ui"])

# Exibir a URL do MLflow UI
print(f"MLflow UI está rodando em {mlflow_tracking_uri}")

MLflow UI está rodando em http://localhost:5000


In [10]:
# Parar o subprocesso do MLflow UI
mlflow_process.terminate()

# Confirmar que o MLflow UI foi parado
print("MLflow UI foi parado")

MLflow UI foi parado
