In [192]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import model_selection

In [193]:
# Load the cleaned dataset; `data` stays untouched as the raw reference frame.
data = pd.read_csv('clean_data.csv', sep=',')
# Working copy without the saved-index column, the publication date and the location.
df = data.copy().drop(columns=['Unnamed: 0', 'Date de publication', 'lieu'])
df.shape

(43, 6)

In [194]:
# Prediction targets: the maximum and minimum salary columns.
y_max, y_min = df['salaire_maximum'], df['salaire_minimum']

In [195]:
# Features: every object-dtype column of the working frame.
X_cat = df.select_dtypes(include='object')
X_cat.columns

Index(['Intitulé du poste', 'competences', 'Nom de la société',
       'Type de contrat'],
      dtype='object')

In [196]:
# Pipeline: preprocessing + random-forest regressors for the two salary targets.

# Bare estimator and search space, kept for hyper-parameter search experiments.
RFR = RandomForestRegressor()

# NOTE(review): `random_state` is listed as a searchable parameter. Picking the
# seed that scores best fits the noise of one particular split rather than the
# model itself; consider fixing the seed and searching real hyper-parameters only.
params_RFR = {
    "criterion" : ['absolute_error', 'friedman_mse'],
    "random_state" : list(range(39, 67)),
    "n_estimators" : list(range(8, 27))
}

# Columns routed to each preprocessing branch.
CATEGORICAL_COLUMNS = ['Intitulé du poste', 'Nom de la société', 'Type de contrat']
TEXT_COLUMN = 'competences'


def make_one_hot_encoder():
    """Return a dense-output OneHotEncoder that ignores unseen categories.

    `handle_unknown='ignore'` encodes a category absent from the training
    split as an all-zero row instead of raising at predict time.

    The dense-output keyword was renamed from `sparse` to `sparse_output`
    in scikit-learn 1.2 and the old name was removed in 1.4, so the new
    spelling is tried first and the old one is used only as a fallback.
    """
    try:
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn: `sparse_output` is not a known keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore')


def make_categorical_pipeline():
    """Return a fresh impute-then-one-hot pipeline for the categorical columns."""
    return Pipeline(
        steps=[
            ('pipe_imp', SimpleImputer(strategy='most_frequent')),
            ('pipe_enc', make_one_hot_encoder())
        ]
    )


def make_preprocessor():
    """Return a fresh, unfitted ColumnTransformer.

    Categorical columns go through imputation + one-hot encoding; the
    `competences` text column goes through a bag-of-words CountVectorizer.
    A new instance is built on every call because Pipeline does not clone
    its steps: sharing one transformer object between two pipelines means
    fitting either pipeline silently refits the other's preprocessing.
    """
    return ColumnTransformer(
        transformers=[
            ('tf_cat', make_categorical_pipeline(), list(CATEGORICAL_COLUMNS)),
            ('tf_comp', CountVectorizer(), TEXT_COLUMN)
        ]
    )


# Stand-alone unfitted instances, kept under their original names.
pipe_cat = make_categorical_pipeline()
tf_cat = make_preprocessor()

# Model for the maximum salary; owns its own preprocessing instance.
RFR_pipe_max = Pipeline(
    steps=[
        ('transformation', make_preprocessor()),
        ('model', RandomForestRegressor(n_estimators=26, random_state=66, criterion='absolute_error'))
    ]
)

# Model for the minimum salary; owns its own preprocessing instance.
RFR_pipe_min = Pipeline(
    steps=[
        ('transformation', make_preprocessor()),
        ('model', RandomForestRegressor(n_estimators=9, random_state=40, criterion='friedman_mse'))
    ]
)



In [197]:
# List the tunable parameter names of a default random-forest regressor.
sorted(RandomForestRegressor().get_params())

['bootstrap',
 'ccp_alpha',
 'criterion',
 'max_depth',
 'max_features',
 'max_leaf_nodes',
 'max_samples',
 'min_impurity_decrease',
 'min_samples_leaf',
 'min_samples_split',
 'min_weight_fraction_leaf',
 'n_estimators',
 'n_jobs',
 'oob_score',
 'random_state',
 'verbose',
 'warm_start']

In [198]:
# Feature matrix, then a 75/25 hold-out split with the maximum salary as target.
X = X_cat
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_max,
    test_size=0.25,
    random_state=10,  # fixed seed so the split is reproducible
)

In [199]:
# Switch scikit-learn's estimator display to diagrams, then show the max-salary pipeline.
set_config(display="diagram")
RFR_pipe_max

In [200]:
# Evaluate the prediction score (R²) for the maximum salary.
# `fit` returns the pipeline itself, so training and prediction can be chained.
y_max_pred = RFR_pipe_max.fit(X_train, y_train).predict(X_test)
r2_max = round(r2_score(y_test, y_max_pred), 5)
print("RFR:", r2_max)

RFR: 0.86616


In [201]:
# Put the held-out targets and the predictions side by side, both on a fresh 0..n-1 index.
actual_max = pd.DataFrame(y_test).reset_index(drop=True)
predicted_max = pd.DataFrame(y_max_pred)  # single column, labelled 0
df_px = pd.concat([actual_max, predicted_max], axis=1).rename(
    columns={0: "salaire_max_pred"}
)

# Scatter plot of the actual and predicted columns.
fig = px.scatter(
    df_px,
    title="Ecart entre les valeurs testées et les valeurs prédites",
    labels={"value": "salaire"},
)
fig.update_traces(marker={'size': 15})
fig.show()

In [202]:
# Re-split with the minimum salary as target; this rebinds X_train/X_test/y_train/y_test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_min,
    test_size=0.25,
    random_state=10,  # fixed seed so the split is reproducible
)

In [203]:
# Evaluate the prediction score (R²) for the minimum salary.
# `fit` returns the pipeline itself, so training and prediction can be chained.
y_min_pred = RFR_pipe_min.fit(X_train, y_train).predict(X_test)
r2_min = round(r2_score(y_test, y_min_pred), 5)
print("RFR:", r2_min)

RFR: 0.83544


In [204]:
# Put the held-out targets and the predictions side by side, both on a fresh 0..n-1 index.
actual_min = pd.DataFrame(y_test).reset_index(drop=True)
predicted_min = pd.DataFrame(y_min_pred)  # single column, labelled 0
df_px = pd.concat([actual_min, predicted_min], axis=1).rename(
    columns={0: "salaire_min_pred"}
)

# Scatter plot of the actual and predicted columns.
fig = px.scatter(
    df_px,
    title="Ecart entre les valeurs testées et les valeurs prédites",
    labels={"value": "salaire"},
)
fig.update_traces(marker={'size': 15})
fig.show()