In [244]:
import pandas as pd
import numpy as np
from sklearn import set_config
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [245]:
data = pd.read_csv('clean_data.csv', sep=',')
df = data.copy()
df = df.drop(['Unnamed: 0', 'Date de publication', 'lieu'], axis=1)
df['Type de contrat'].fillna('cdi', inplace=True)
df = df.dropna(subset='salaire_minimum')
df.shape

(43, 6)

In [246]:
# Target
y_max, y_min = df['salaire_maximum'], df['salaire_minimum']

In [247]:
# Features
X_cat = df.select_dtypes(include=[object])
X_cat.columns

Index(['Intitulé du poste', 'competences', 'Nom de la société',
       'Type de contrat'],
      dtype='object')

In [248]:
# Pipeline
pipe_cat = Pipeline(
    steps=[
        ('pipe_imp', SimpleImputer(strategy='most_frequent')),
        ('pipe_enc', OneHotEncoder(sparse=False))
    ]
)
tf_cat = ColumnTransformer(
    transformers=[
        ('tf_cat', pipe_cat, ['Intitulé du poste', 'Nom de la société', 'Type de contrat']),
        ('tf_comp', CountVectorizer(), 'competences')
    ]
)

final_pipe = Pipeline(
    steps=[
        ('transformation', tf_cat),
        ('model', RandomForestRegressor())
    ]
)

In [249]:
y_max, y_min = df['salaire_maximum'], df['salaire_minimum']
X = X_cat
X_train, X_test, y_train, y_test = train_test_split(X, y_max, test_size=0.2, random_state=10)

In [250]:
set_config(display='diagram')
final_pipe

In [251]:
# Evaluation du score de prédiction pour le salaire maximum
final_pipe.fit(X_train, y_train)
y_max_pred = final_pipe.predict(X_test)
print("RFR:", round(r2_score(y_test, y_max_pred), 5))

RFR: 0.8282


In [252]:
# Evaluation du score de prédiction pour le salaire minimum
final_pipe.fit(X_train, y_train)
y_min_pred = final_pipe.predict(X_test)
print("RFR:", round(r2_score(y_test, y_max_pred), 5))

RFR: 0.8282
