In [119]:
import pandas as pd
import numpy as np
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection

In [120]:
data = pd.read_csv('clean_data.csv', sep=',')
df = data.copy()
df = df.drop(['Unnamed: 0', 'Date de publication', 'lieu'], axis=1)
df['Type de contrat'].fillna('cdi', inplace=True)
df = df.dropna(subset='salaire_minimum')
df.shape

(43, 6)

In [121]:
# Target
y_max, y_min = df['salaire_maximum'], df['salaire_minimum']

In [122]:
# Features
X_cat = df.select_dtypes(include=[object])
X_cat.columns

Index(['Intitulé du poste', 'competences', 'Nom de la société',
       'Type de contrat'],
      dtype='object')

In [123]:
# Pipeline

model = RandomForestRegressor()
params = {
    "random_state" : [n for n in range(0, 100)],
    "criterion" : ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
}

pipe_cat = Pipeline(
    steps=[
        ('pipe_imp', SimpleImputer(strategy='most_frequent')),
        ('pipe_enc', OneHotEncoder(sparse=False))
    ]
)
tf_cat = ColumnTransformer(
    transformers=[
        ('tf_cat', pipe_cat, ['Intitulé du poste', 'Nom de la société', 'Type de contrat']),
        ('tf_comp', CountVectorizer(), 'competences')
    ]
)

final_pipe = Pipeline(
    steps=[
        ('transformation', tf_cat),
        ('model', GridSearchCV(model, params, cv=2, refit=True))
    ]
)


In [124]:
sorted(model.get_params().keys())

['bootstrap',
 'ccp_alpha',
 'criterion',
 'max_depth',
 'max_features',
 'max_leaf_nodes',
 'max_samples',
 'min_impurity_decrease',
 'min_samples_leaf',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'n_estimators',
 'n_jobs',
 'oob_score',
 'random_state',
 'verbose',
 'warm_start']

In [125]:
y_max, y_min = df['salaire_maximum'], df['salaire_minimum']
X = X_cat
X_train, X_test, y_train, y_test = train_test_split(X, y_max, test_size=0.25, random_state = 10)

In [126]:
set_config(display='diagram')
final_pipe

In [127]:
# Evaluation du score de prédiction pour le salaire maximum
final_pipe.fit(X_train, y_train)
y_max_pred = final_pipe.predict(X_test)
print("RFR:", round(r2_score(y_test, y_max_pred), 5))
print("best params : ", final_pipe['model'].best_params_)

RFR: 0.72169
best params :  {'criterion': 'absolute_error', 'random_state': 37}


In [128]:
# Evaluation du score de prédiction pour le salaire minimum
final_pipe.fit(X_train, y_train)
y_min_pred = final_pipe.predict(X_test)
print("RFR:", round(r2_score(y_test, y_max_pred), 5))
print("best params : ", final_pipe['model'].best_params_)

RFR: 0.72169
best params :  {'criterion': 'absolute_error', 'random_state': 37}
