# Paquetes y Funciones

In [1]:
import nbimporter
from funciones import *

# Warnings: silence every warning for the rest of the kernel session.
# NOTE(review): this hides all warning categories, including deprecation notices
# raised by the libraries used in the cells below.
import warnings
warnings.filterwarnings("ignore")

# 1. Carga datos:

### Rutas

In [2]:
# Every path is derived from the directory the kernel is running in.
ruta_actual = os.getcwd()

# Processed data: the "Modelo_predictivo" substring of the working directory
# is swapped for "Datos/Procesados/".
ruta_datos_procesados = ruta_actual.replace("Modelo_predictivo", "Datos/Procesados/")

# Artefact folders located directly under the working directory.
ruta_listas, ruta_encoders, ruta_modelos = (
    f"{ruta_actual}/{carpeta}/" for carpeta in ("listas", "encoders", "trained_models")
)

In [3]:
# Display the resolved encoders directory as a sanity check of the path setup.
ruta_encoders

'/home/evelazco/bootcamp/IT_Job_Spain_Project/Modelo_predictivo/encoders/'

### Datos empleos

In [4]:
# Load datos_jobs_finales.csv from the processed-data directory into df_empleos.
df_empleos = pd.read_csv(ruta_datos_procesados + 'datos_jobs_finales.csv')

In [5]:
# Relabel the "machine learning" category as "machine learning engineer";
# every other value in the column is left untouched.
df_empleos["categoria_empleo"] = df_empleos["categoria_empleo"].apply(
    lambda etiqueta: etiqueta if etiqueta != "machine learning" else "machine learning engineer"
)

## -----------------------------------------------------------------------------------------------------------------------------

# 2. Preprocesamiento de datos

In [6]:
# Inputs for data_preparator: the lists/encoders directories and one fresh,
# unfitted instance of each encoder class.
# NOTE(review): which encoder is applied to which column is decided inside
# data_preparator (defined in `funciones`) — not visible here.
rutas = [ruta_listas, ruta_encoders]
encoders = [OneHotEncoder(), MultiLabelBinarizer()]

In [7]:
# data_preparator returns two objects, unpacked as the minimum-salary and
# maximum-salary modelling frames (each is later split on its own target column).
df_salario_min, df_salario_max = data_preparator(df_empleos, rutas, encoders)

## -----------------------------------------------------------------------------------------------------------------------------

# 3. Modelos de regresión

## 3.1. Preparación de datos

- ### Salario mínimo

In [8]:
# Target: the "salario_min" column, kept as a single-column DataFrame.
y_min = df_salario_min[["salario_min"]]
# Features: every remaining column.
X_min = df_salario_min.drop(columns=["salario_min"])

- ### Salario máximo

In [9]:
# Target: the "salario_max" column, kept as a single-column DataFrame.
y_max = df_salario_max[["salario_max"]]
# Features: every remaining column.
X_max = df_salario_max.drop(columns=["salario_max"])

## -----------------------------------------------------------------------------------------------------------------------------

## 3.2. Testeo de modelos

In [10]:
# Candidate regressors handed to model_tester for comparison.
# Every estimator that draws random numbers is given the same seed used by the
# train/test splits in this notebook (42), so re-running the comparison yields
# the same scores. Deterministic estimators keep their defaults.
modelos = [
    LinearRegression(),
    RandomForestRegressor(random_state=42),
    AdaBoostRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    SVR(),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=42),
    Ridge(),
    Lasso(),
    ElasticNet(),
    XGBRegressor(random_state=42),
    LinearSVR(random_state=42),
]

- ### Salario mínimo

In [11]:
# Evaluate every candidate model on the minimum-salary data (no PCA).
# NOTE(review): the saved output of this cell is a KeyboardInterrupt and the
# following cells have no execution count, i.e. the last recorded run was
# stopped here before finishing.
MIN_SIN_PCA = model_tester(modelos, X_min, y_min)

KeyboardInterrupt: 

In [None]:
# Show only the rows whose mean_r2 exceeds 0.3, highest first.
MIN_SIN_PCA.loc[MIN_SIN_PCA["mean_r2"] > 0.3].sort_values(by="mean_r2", ascending=False)

- ### Salario máximo

In [None]:
# Evaluate every candidate model on the maximum-salary data (no PCA).
MAX_SIN_PCA = model_tester(modelos, X_max, y_max)

In [None]:
# Show only the rows whose mean_r2 exceeds 0.3, highest first.
MAX_SIN_PCA.loc[MAX_SIN_PCA["mean_r2"] > 0.3].sort_values(by="mean_r2", ascending=False)

## -----------------------------------------------------------------------------------------------------------------------------

### 3.2.1. TUNING SVR y GBR

In [None]:
# Hyper-parameter grids passed to `tunning`, one per estimator.
#
# SVR grid. `cache_size` only sets the size of the kernel cache (a speed/memory
# knob) and does not change the fitted model, so it is pinned to a single value
# (200, the value used by the later SVR grids in this notebook) instead of being
# searched: searching over it only multiplies the number of combinations.
# NOTE(review): 'degree' is only used by the 'poly' kernel and 'coef0' only by
# 'poly'/'sigmoid', so part of this grid is still redundant for the other kernels.
params_SVR = {'kernel'      : ['linear', 'poly', 'rbf', 'sigmoid'],
              'degree'      : [3, 4],
              'gamma'       : ['scale', 'auto'],
              'coef0'       : [-0.5, -0.4, -0.3],
              'C'           : [0.1, 0.2, 0.3],
              'epsilon'     : [0.01, 0.02],
              'shrinking'   : [True],
              'tol'         : [1e-6, 1e-5, 1e-7],
              'cache_size'  : [200],
              'verbose'     : [False],
              'max_iter'    : [550, 600, 650],
             }

# Gradient boosting grid.
params_GBR = {'loss'             : ["squared_error", "absolute_error", "huber", "quantile"],
              'n_estimators'     : [50, 100, 150],
              'learning_rate'    : [0.001, 0.01, 0.1],
              'max_depth'        : [3, 5],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf' : [1, 2, 4],
              'subsample'        : [0.5, 0.7, 1.0],
              'max_features'     : [None, 'sqrt', 'log2']
              }

# NOTE: this rebinds `modelos`, replacing the longer candidate list defined in
# the model-testing section. The two lists below are zipped pairwise, so their
# order must match.
# GradientBoostingRegressor is seeded because `subsample` < 1 and `max_features`
# make its fit stochastic; without a seed the tuning results are not repeatable.
modelos = [SVR(), GradientBoostingRegressor(random_state=42)]
parametros = [params_SVR, params_GBR]

In [None]:
# Hold out 20 % of the minimum-salary data with a fixed seed.
X_train_min, X_test_min, y_train_min, y_test_min = train_test_split(X_min, y_min, test_size = 0.2, random_state=42)

# Tune each estimator against its own grid; `tunning` returns one results
# frame per model.
resultados_parciales_min = []

for modelo, params in zip(modelos, parametros):
    resultado = tunning(modelo, params, X_train_min, X_test_min, y_train_min, y_test_min, "min")
    resultados_parciales_min.append(resultado)

# Concatenate ALL partial results (not just the first two) so that adding a
# model to `modelos`/`parametros` cannot silently drop its row, and keep the
# list and the final frame under different names.
resultados_min = pd.concat(resultados_parciales_min, axis= 0)
resultados_min.to_csv("resultados_tuning_min.csv", index= False, sep= ",")

In [None]:
# Display the tuning results for the minimum-salary models.
resultados_min

In [None]:
# Parameters recorded for the SVR row(s) of the minimum-salary tuning results.
resultados_min.loc[resultados_min["Nombre"] == "SVR", "Parametros"].values

In [None]:
# Hold out 20 % of the maximum-salary data with a fixed seed.
X_train_max, X_test_max, y_train_max, y_test_max = train_test_split(X_max, y_max, test_size = 0.2, random_state=42)

# Tune each estimator against its own grid; `tunning` returns one results
# frame per model.
resultados_parciales_max = []

for modelo, params in zip(modelos, parametros):
    resultado = tunning(modelo, params, X_train_max, X_test_max, y_train_max, y_test_max, "max")
    resultados_parciales_max.append(resultado)

# Concatenate ALL partial results (not just the first two) so that adding a
# model to `modelos`/`parametros` cannot silently drop its row, and keep the
# list and the final frame under different names.
resultados_max = pd.concat(resultados_parciales_max, axis= 0)
resultados_max.to_csv("resultados_tuning_max.csv", index= False, sep= ",")

In [None]:
# Display the tuning results for the maximum-salary models.
resultados_max

In [None]:
# Parameters recorded for the SVR row(s) of the maximum-salary tuning results.
resultados_max.loc[resultados_max["Nombre"] == "SVR", "Parametros"].values

## -----------------------------------------------------------------------------------------------------------------------------

### 3.2.2. Best SVR models

In [None]:
modelo_SVR = SVR()

# Narrow grid: kernel, gamma, coef0, C and tol are fixed to one value each;
# only epsilon (0.01 .. 0.09) and max_iter (500 .. 700 in steps of 10) vary.
params_SVR_min = dict(
    kernel=['rbf'],
    degree=[3],
    gamma=['scale'],
    coef0=[-0.5],
    C=[0.2],
    epsilon=[centesima / 100 for centesima in range(1, 10)],
    shrinking=[True],
    tol=[1e-6],
    cache_size=[200],
    verbose=[False],
    max_iter=list(range(500, 710, 10)),
)

# The maximum-salary grid has the same content; it is built as a separate
# dict with its own list objects.
params_SVR_max = {clave: list(valores) for clave, valores in params_SVR_min.items()}

# 80/20 splits of the non-PCA feature matrices, same seed for both targets.
X_train_min, X_test_min, y_train_min, y_test_min = train_test_split(
    X_min, y_min, test_size=0.2, random_state=42
)
X_train_max, X_test_max, y_train_max, y_test_max = train_test_split(
    X_max, y_max, test_size=0.2, random_state=42
)

In [None]:
# Run the narrow SVR search for each target with save=True.
# NOTE(review): presumably save=True makes `tunning` persist the best model to
# disk (model files are loaded in later cells) — confirm the file name and
# location in `funciones`.
df_resultados_tuning_min = tunning(modelo_SVR, params_SVR_min, X_train_min, X_test_min, y_train_min, y_test_min, "min", save= True)
df_resultados_tuning_max = tunning(modelo_SVR, params_SVR_max, X_train_max, X_test_max, y_train_max, y_test_max, "max", save= True)

In [None]:
# Display the narrow-grid tuning results for the minimum salary.
df_resultados_tuning_min

In [None]:
# Show the raw contents of the "Parametros" column (minimum salary).
df_resultados_tuning_min["Parametros"].values

In [None]:
# Display the narrow-grid tuning results for the maximum salary.
df_resultados_tuning_max

In [None]:
# Show the raw contents of the "Parametros" column (maximum salary).
df_resultados_tuning_max["Parametros"].values

## -----------------------------------------------------------------------------------------------------------------------------

## 3.3. PCA

Abrir los mejores modelos guardados y probar con PCA

In [None]:
# Load the previously saved min/max models.
# NOTE(review): these paths are relative to the current working directory,
# whereas the inference section further down loads the same file names from
# `ruta_modelos`. Confirm where `tunning(..., save=True)` writes the files and
# make both loads agree.
modelo_svr_min = load('min_model.pkl')
modelo_svr_max = load('max_model.pkl')

In [None]:
# Parallel lists consumed by pca_tester: position 0 is the minimum-salary
# case, position 1 the maximum-salary case.
# NOTE: `X` and `y` are generic names that are rebound to other objects in
# later cells.
salarios = ["min", "max"]
modelos_svr = [modelo_svr_min, modelo_svr_max]
X = [X_min, X_max]
y = [y_min, y_max]

In [None]:
# Evaluate the loaded models with PCA; the result is filtered by its "salario"
# and "R2" columns in the next cells.
df_resultados_PCA = pca_tester(salarios, modelos_svr, X, y)

In [None]:
# Row with the highest R2 among the minimum-salary PCA results.
df_resultados_PCA[df_resultados_PCA["salario"] == "min"].sort_values(by= "R2", ascending= False).head(1)

In [None]:
# Row with the highest R2 among the maximum-salary PCA results.
df_resultados_PCA[df_resultados_PCA["salario"] == "max"].sort_values(by= "R2", ascending= False).head(1)

### 3.3.1. TUNING SVR con PCA

In [None]:
# One 128-component PCA per target, both with the same seed.
# NOTE(review): 128 is presumably the component count chosen from the
# pca_tester results — confirm.
# NOTE(review): each PCA is fitted on the full feature matrix, and the
# train/test split is made afterwards on the projected data, so test rows
# contribute to the components.
pca_min = PCA(n_components=128, random_state=42)
pca_max = PCA(n_components=128, random_state=42)

X_min_pca = pca_min.fit_transform(X_min)
X_max_pca = pca_max.fit_transform(X_max)

# Persisting the fitted PCA objects is currently disabled:
#with open("pca_min.pickle", 'wb') as archivo:
    #pickle.dump(pca_min, archivo)
#with open("pca_max.pickle", 'wb') as archivo:
    #pickle.dump(pca_max, archivo)

In [None]:
modelo_SVR = SVR()

# Wide SVR grid for the PCA-projected features: all four kernels, with
# epsilon swept from 0.01 to 0.09.
params_SVR_min = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=[3, 4],
    gamma=['scale', 'auto'],
    coef0=[-0.5, -0.4, -0.3],
    C=[0.1, 0.2, 0.3],
    epsilon=[centesima / 100 for centesima in range(1, 10)],
    shrinking=[True],
    tol=[1e-6, 1e-5, 1e-7],
    cache_size=[200],
    verbose=[False],
    max_iter=[550, 600, 650],
)

# The maximum-salary grid has the same content; it is built as a separate
# dict with its own list objects.
params_SVR_max = {clave: list(valores) for clave, valores in params_SVR_min.items()}

# 80/20 splits of the PCA-projected matrices. These rebind the X_train_* /
# X_test_* names that earlier cells filled with the non-PCA splits.
X_train_min, X_test_min, y_train_min, y_test_min = train_test_split(
    X_min_pca, y_min, test_size=0.2, random_state=42
)
X_train_max, X_test_max, y_train_max, y_test_max = train_test_split(
    X_max_pca, y_max, test_size=0.2, random_state=42
)

In [None]:
# Run the wide SVR search on the PCA-projected splits; `save` is not passed
# here, so the default of `tunning` applies.
df_resultados_tuning_min = tunning(modelo_SVR, params_SVR_min, X_train_min, X_test_min, y_train_min, y_test_min, "min")
df_resultados_tuning_max = tunning(modelo_SVR, params_SVR_max, X_train_max, X_test_max, y_train_max, y_test_max, "max")

In [None]:
# Display the PCA tuning results for the minimum salary.
df_resultados_tuning_min

In [None]:
# Show the raw contents of the "Parametros" column (minimum salary, PCA).
df_resultados_tuning_min["Parametros"].values

In [None]:
# Display the PCA tuning results for the maximum salary.
df_resultados_tuning_max

In [None]:
# Show the raw contents of the "Parametros" column (maximum salary, PCA).
df_resultados_tuning_max["Parametros"].values

## -----------------------------------------------------------------------------------------------------------------------------

In [None]:
modelo_SVR = SVR()

# Narrow grid: kernel, gamma, coef0, C and tol are fixed to one value each;
# only epsilon (0.01 .. 0.09) and max_iter (500 .. 700 in steps of 10) vary.
params_SVR_min = dict(
    kernel=['rbf'],
    degree=[3],
    gamma=['scale'],
    coef0=[-0.5],
    C=[0.2],
    epsilon=[centesima / 100 for centesima in range(1, 10)],
    shrinking=[True],
    tol=[1e-6],
    cache_size=[200],
    verbose=[False],
    max_iter=list(range(500, 710, 10)),
)

# The maximum-salary grid has the same content; it is built as a separate
# dict with its own list objects.
params_SVR_max = {clave: list(valores) for clave, valores in params_SVR_min.items()}

# Uses whatever X_train_* / X_test_* splits are currently in the namespace
# (the PCA-projected ones when the notebook is run top to bottom).
df_resultados_tuning_min = tunning(
    modelo_SVR, params_SVR_min, X_train_min, X_test_min, y_train_min, y_test_min, "min", save=True
)
df_resultados_tuning_max = tunning(
    modelo_SVR, params_SVR_max, X_train_max, X_test_max, y_train_max, y_test_max, "max", save=True
)

# Stack both result frames, minimum salary first.
df_resultados = pd.concat([df_resultados_tuning_min, df_resultados_tuning_max], axis=0)

In [None]:
# Display the combined min/max tuning results.
df_resultados

## -----------------------------------------------------------------------------------------------------------------------------

# 4. Testing modelos con nuevos datos de entrada:

## Pipeline procesamiento de datos de entrada

    - Hago encoding columnas
    - Encoding herramientas
    - Transformación log
    - Limpio columnas   

- ### Datos de entrada:

In [None]:
# Example job offer used to exercise the prediction pipeline.
herramientas = ["python", "sql", "ia", "machine learning"]
jornada = "jornada completa"
experiencia = 1
tipo_contrato = "indefinido"
beneficios = False
comunidad = "Galicia"
categoria_empleo = "data driven"

# Each value is wrapped in a one-element list so the frame has exactly one
# row; the tools list therefore ends up in a single cell.
X_datos = dict(
    herramientas=[herramientas],
    jornada=[jornada],
    experiencia=[experiencia],
    tipo_contrato=[tipo_contrato],
    beneficios=[beneficios],
    comunidad=[comunidad],
    categoria_empleo=[categoria_empleo],
)

X = pd.DataFrame(data=X_datos)

In [None]:
# Display the one-row input frame.
X

- ### Transformo los datos de entrada:

In [None]:
# Transform the raw input row into the two feature sets passed to the
# min-salary and max-salary models below. data_transformer receives the
# encoders and models directories.
X_testeo_min, X_testeo_max = data_transformer(X, ruta_encoders, ruta_modelos)

In [None]:
# Load models: the saved min/max models are read from the trained-models directory.
modelo_svr_min = load(ruta_modelos + 'min_model.pkl')
modelo_svr_max = load(ruta_modelos + 'max_model.pkl')

## - Predicción salario:

In [None]:
# np.exp maps the predictions back to the salary scale.
# NOTE(review): this assumes the targets were log-transformed during data
# preparation (the pipeline notes above mention a log transformation) — confirm.
salario_minimo_predicho = np.exp(modelo_svr_min.predict(X_testeo_min))
salario_maximo_predicho = np.exp(modelo_svr_max.predict(X_testeo_max))

# predict() returns an array even for a single input row. Calling int() on an
# array with ndim > 0 is deprecated in recent NumPy releases, so the single
# element is extracted explicitly with .item() (which still raises if the array
# does not contain exactly one value) before rounding to the nearest hundred.
salario_minimo_redondeado = round(int(salario_minimo_predicho.item()), -2)
salario_maximo_redondeado = round(int(salario_maximo_predicho.item()), -2)
print(f"El rango salarial con estas características es de {salario_minimo_redondeado} a {salario_maximo_redondeado} € brutos anuales.")

## -----------------------------------------------------------------------------------------------------------------------------

## -----------------------------------------------------------------------------------------------------------------------------

# 5. Redes Neuronales

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import RNN, Dense, LSTM, Embedding, Input
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import BatchNormalization
from keras.layers import Dense, Flatten, Dropout, Activation
from keras.callbacks import ReduceLROnPlateau

In [None]:
# Scaling: standardise the minimum-salary features.
# NOTE(review): X_min_scaled is not referenced by any later cell visible in
# this notebook — the network below is trained on the unscaled X_min. The
# scaler is also fitted on the full matrix, before any train/test split.
scaler = StandardScaler()
X_min_scaled = scaler.fit_transform(X_min)

In [None]:
# Convert the feature matrix to an integer NumPy array for the network below.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# is a no-op instead of failing with AttributeError (an ndarray has no .values).
# NOTE(review): astype(int) truncates any fractional feature values — confirm
# that every feature is integer-coded.
X_min = np.asarray(X_min).astype(int)

In [None]:
# Hold out 20 % of the minimum-salary data as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X_min, y_min, test_size = 0.2, random_state=42)

In [None]:
# Carve a validation set (20 % of the remaining training data) out of the
# training split. NOTE: this rebinds X_train / y_train, so running the cell a
# second time without re-running the previous one shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state=42)

In [None]:
# Model definition
# Local import: the notebook-level `import tensorflow as tf` only appears in a
# later cell, so it is not available here on a top-to-bottom run.
import tensorflow as tf

# Seeding NumPy alone does not make Keras weight initialisation or dropout
# reproducible; set_random_seed seeds Python's `random`, NumPy and TensorFlow.
np.random.seed(42)
tf.keras.utils.set_random_seed(42)
model = Sequential()

# Three hidden blocks (200 -> 128 -> 64 units), each a Dense layer followed by
# ReLU and 20 % dropout.
model.add(Dense(units = 200, input_shape= (X_train.shape[1],), activation="linear"))  # Adjust input shape
model.add(Activation("relu"))
model.add(Dropout(0.2))

model.add(Dense(units = 128))
model.add(Activation("relu"))
model.add(Dropout(0.2))

model.add(Dense(units = 64))
model.add(Activation("relu"))
model.add(Dropout(0.2))

# Hidden layers (optional, experiment with number and neurons)
##model.add(Dense(128, activation="relu"))
#model.add(Dense(64, activation="relu"))
#model.add(Dense(32, activation="relu"))

model.add(Dense(1, activation="linear"))  # Output layer for regression

# Model compilation
model.compile(loss="mse", optimizer=Adam(learning_rate=0.001))  # Adjust loss and optimizer as needed

#model.summary()

In [None]:
# Early stopping to prevent overfitting: stop after 5 epochs without
# improvement in val_loss. restore_best_weights=True rolls the model back to
# the best epoch; without it the model keeps the weights of the last epoch,
# i.e. 5 epochs past the best validation loss, and those are what gets evaluated.
early_stopping = EarlyStopping(monitor="val_loss", patience= 5, restore_best_weights=True)

# Model training
history = model.fit(X_train,
                    y_train.values,
                    validation_data=(X_val, y_val.values),
                    epochs= 100,
                    #batch_size= 100,
                    callbacks=[early_stopping],
                    verbose = 1
)

# Model evaluation on test set
model.evaluate(X_test, y_test.values)

In [None]:
# Loss curves. The validation curve is drawn whenever the history contains it,
# because val_loss is the quantity early stopping monitors and the training
# loss alone cannot show overfitting.
plt.plot(history.history["loss"], label = "loss")
if "val_loss" in history.history:
    plt.plot(history.history["val_loss"], label = "val_loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()

In [None]:
# Predict on the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Test-set metrics for the network: R², mean squared error, mean absolute error.
r2_score_results = r2_score(y_test, y_pred)
mean_squared_error_results = mean_squared_error(y_test, y_pred)
mean_absolute_error_results = mean_absolute_error(y_test, y_pred)

In [None]:
# Print R², MSE and MAE, one value per line.
print(r2_score_results, mean_squared_error_results, mean_absolute_error_results, sep="\n")

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential


In [None]:
# Features / target for the maximum-salary network.
# NOTE: this rebinds the generic names X, y, X_train, X_test, y_train and
# y_test used by earlier cells.
X = df_salario_max.drop(["salario_max"], axis= 1)
y = df_salario_max[["salario_max"]]

# Fixed seed, as in every other split in this notebook: without it the split
# (and therefore every metric computed below) changes on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [None]:
def create_regression_model(input_shape):
    """Build and compile a small fully connected regression network.

    Args:
        input_shape: Number of input features; used as the only dimension of
            the first layer's input shape.

    Returns:
        A compiled ``tf.keras.Sequential`` model: two 64-unit ReLU Dense
        layers followed by a single-unit Dense output, using the "adam"
        optimizer and mean-squared-error loss.
    """
    capas = tf.keras.layers

    red = tf.keras.Sequential()
    red.add(capas.Dense(64, activation='relu', input_shape=(input_shape,)))
    red.add(capas.Dense(64, activation='relu'))
    red.add(capas.Dense(1))

    red.compile(optimizer="adam", loss='mean_squared_error')
    return red

# Number of feature columns in X ("imput" is a misspelling of "input"; the
# name is kept as is).
imput_shape = X.shape[1]

# Build the network for that input width.
regression_model = create_regression_model(imput_shape)

# Print the layer-by-layer summary.
regression_model.summary()

In [None]:
# Train for a fixed 100 epochs on integer-cast features. No validation data or
# callbacks are passed, so the history only records the training loss.
history = regression_model.fit(X_train.values.astype(int), y_train.values, epochs= 100)

In [None]:
# Training-loss curve of the last fit, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(history.history["loss"], label="loss")
ax.legend()
plt.show()

In [None]:
# Predict on the test features, cast to int the same way as the training input.
y_pred = regression_model.predict(X_test.values.astype(int))

In [None]:
# Test-set metrics for the maximum-salary network: R², MSE and MAE.
# NOTE: these reuse the variable names of the earlier metrics cell and
# overwrite its values.
r2_score_results = r2_score(y_test, y_pred)
mean_squared_error_results = mean_squared_error(y_test, y_pred)
mean_absolute_error_results = mean_absolute_error(y_test, y_pred)

In [None]:
# Print R², MSE and MAE, one value per line.
print(r2_score_results, mean_squared_error_results, mean_absolute_error_results, sep="\n")

In [None]:
# For plotting: fit a default GradientBoostingRegressor on the current
# maximum-salary split and predict the test set.

modelo_gbr = GradientBoostingRegressor()
df_salario_max.columns = df_salario_max.columns.astype(str)

# X_train / X_test were split off before the rename above, so converting the
# labels of df_salario_max does not reach them; their own column labels are
# converted here.
# NOTE(review): the string conversion suggests the column labels are not all
# strings, which scikit-learn rejects for DataFrame input — confirm.
X_train_gbr = X_train.rename(columns=str)
X_test_gbr = X_test.rename(columns=str)

# y_train is a single-column DataFrame; scikit-learn expects a 1-D target.
modelo_gbr.fit(X_train_gbr, y_train.values.ravel())
y_pred = modelo_gbr.predict(X_test_gbr)

In [None]:
# True maximum salaries of the test set as a flat 1-D array.
test_y = y_test["salario_max"].values.flatten()

In [None]:
# Predictions flattened to 1-D so they line up with test_y.
pred_y = y_pred.flatten()

In [None]:
# Scatter of true (x axis) versus predicted (y axis) values for the test set.
sns.scatterplot(x= test_y, y= pred_y)