In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:

# Load the dataset from a CSV file referenced by a relative path
file_path = 'emprego.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows; as the last expression of the cell it is rendered as the output
data.head(n=5)


In [None]:

# Correlation analysis restricted to the float64/int64 columns of the raw frame
num_cols = data.select_dtypes(include=['float64', 'int64']).columns
corr = data.loc[:, num_cols].corr()

# Draw the annotated correlation matrix on an explicitly created 12x8 figure
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
plt.show()


In [None]:

# Separate the features from the target. `sl_no` and `status` are dropped from the
# features together with the target column `salary`.
X = data.drop(columns=['sl_no', 'salary', 'status'])
salary_raw = data['salary']

# Identify numeric and categorical feature columns.
# Selecting by kind ("number" vs. everything else) instead of by exact dtype keeps
# columns that are not exactly float64/int64/object from being silently discarded by
# the ColumnTransformer below, whose default `remainder` is 'drop'.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

# Transformer for numeric columns: mean imputation followed by standardization
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Transformer for categorical columns: most-frequent imputation followed by one-hot
# encoding (categories unseen during fit are encoded as all zeros)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessor that routes each group of columns to its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])

# Split BEFORE computing any statistic on the target, so that nothing derived from the
# test rows (imputation median, scaling mean/std) leaks into training.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X, salary_raw, test_size=0.2, random_state=42
)

# Fill missing salaries with the median of the training split only.
# `fillna` returns a new Series, so the `data` frame shown in earlier cells is not mutated.
# NOTE(review): rows without a salary presumably correspond to candidates that were not
# placed (see the dropped `status` column) - confirm that median imputation, rather than
# dropping those rows, is the intended treatment.
salary_median = y_train_raw.median()
y_train_raw = y_train_raw.fillna(salary_median)
y_test_raw = y_test_raw.fillna(salary_median)

# Standardize the target with a scaler fitted on the training split only
scaler_y = StandardScaler()
y_train = scaler_y.fit_transform(y_train_raw.to_numpy().reshape(-1, 1)).ravel()
y_test = scaler_y.transform(y_test_raw.to_numpy().reshape(-1, 1)).ravel()

# Keep `y` available as the full standardized target (same imputation value and scaler)
y = scaler_y.transform(
    salary_raw.fillna(salary_median).to_numpy().reshape(-1, 1)
).ravel()

# Fit the feature preprocessing on the training split and apply it to both splits
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Fail loudly if the target still contains NaN instead of silently replacing it with 0
assert not (np.isnan(y_train).any() or np.isnan(y_test).any()), \
    "target still contains NaN after imputation"

# Inspect the transformed training features
print("Primeiros 5 registros de X_train após normalização:")
print(X_train[:5])
print("Estatísticas descritivas de X_train:")
print(pd.DataFrame(X_train).describe())

# Look for extreme outliers in the original (untransformed) numeric feature columns
plt.figure(figsize=(12, 8))
sns.boxplot(data=data[num_cols])
plt.show()


In [None]:

# All three models are trained on, and scored against, the standardized target
# (`y_train` / `y_test` were transformed with `scaler_y`), so every MSE below is
# expressed in standardized units and the three models are directly comparable.

# Linear regression baseline
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Random forest with hyperparameter tuning: exhaustive grid search, 3-fold
# cross-validation, selecting the combination with the lowest MSE
rf_model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring='neg_mean_squared_error'
)
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Neural network (single hidden layer with 5 units) trained on the preprocessed features.
# NOTE: `validation_fraction` is only used by MLPRegressor when `early_stopping=True`;
# early stopping is left at its default here, so no validation split is held out.
nn_model = MLPRegressor(
    hidden_layer_sizes=(5,),
    activation='relu',
    solver='adam',
    learning_rate='constant',
    learning_rate_init=0.0001,
    alpha=0.1,
    max_iter=1000,
    shuffle=True,
    validation_fraction=0.2,
    random_state=20,
    verbose=True
)
nn_model.fit(X_train, y_train)
y_pred_nn_scaled = nn_model.predict(X_test)

# Score BEFORE undoing the normalization: `y_test` is standardized, so comparing it with
# predictions converted back to salary units would mix two different scales and produce
# meaningless MSE / R^2 values.
mse_nn = mean_squared_error(y_test, y_pred_nn_scaled)
r2_nn = r2_score(y_test, y_pred_nn_scaled)

# Predictions converted back to the original salary units, kept for reporting
y_pred_nn = scaler_y.inverse_transform(y_pred_nn_scaled.reshape(-1, 1)).flatten()

# Show the results
print("Regressão Linear - MSE:", mse_lr, "R^2:", r2_lr)
print("Random Forest - MSE:", mse_rf, "R^2:", r2_rf)
print("Rede Neural - MSE:", mse_nn, "R^2:", r2_nn)
