# Salary prediction using job description

## Package imports

In [1]:
# Packages importation
import pandas as pd
import numpy as np
import os
import time
from tqdm import tqdm
import spacy

import warnings
warnings.filterwarnings('ignore')

2023-09-17 18:17:27.643369: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Packages needed for the keras model
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Flatten, BatchNormalization, Dense

In [3]:
# Packages needed for the BERT layer
import tensorflow_hub as hub
import tensorflow_text as text

In [4]:
# List the GPUs TensorFlow can see; an empty list means none was detected
tf.config.list_physical_devices(device_type='GPU')

[]

## Data import

In [5]:
# Data importation: either Pole Emploi data or Jocas data
# Only the selected source is read. Previously both Pole Emploi files were loaded and
# concatenated, then immediately replaced by the Jocas frame, so that work was thrown away.
DATA_SOURCE = "jocas"  # "jocas" or "pole_emploi"

if DATA_SOURCE == "pole_emploi":
    # Pole Emploi data: two CSV files stacked into one frame with a fresh 0..n-1 index;
    # the column at position 2 is parsed as dates
    pe_paths = ["~/work/data_2019a_full.csv", "~/work/data_2019b_full.csv"]
    data = pd.concat(
        [
            pd.read_csv(path, parse_dates=[2], infer_datetime_format=True, low_memory=False)
            for path in pe_paths
        ],
        ignore_index=True,
    )
elif DATA_SOURCE == "jocas":
    #JOCAS data
    data = pd.read_csv("~/skills/jocas_2022.csv", low_memory=False)
else:
    raise ValueError(f"Unknown DATA_SOURCE: {DATA_SOURCE!r} (expected 'jocas' or 'pole_emploi')")

data.head(5)

Unnamed: 0.1,Unnamed: 0,url,date_first_seen_day,date_scraping,site_name,site_child,scraping_failure_status,id_jocas,date_first_disappeared_day,date_last_seen_day,...,partner_name,partner_status,teleworking_accepted,teleworking_type,teleworking_mentioned,experience_min,experience_max,education_level,education_field,rome_loc_firm
0,apec.2,https://www.apec.fr/cms/webservices/offre/publ...,2022-01-01,Sat Jan 1 21:21:59 2022,apec,,False,APEC_2022-01-01_2,,,...,,False,,,False,5.0,,,,True
1,apec.3,https://www.apec.fr/cms/webservices/offre/publ...,2022-01-01,Sat Jan 1 21:22:00 2022,apec,,False,APEC_2022-01-01_3,,,...,,False,,,False,5.0,,,,True
2,apec.4,https://www.apec.fr/cms/webservices/offre/publ...,2022-01-01,Sat Jan 1 21:22:01 2022,apec,,False,APEC_2022-01-01_4,,,...,,False,,,False,10.0,,,,True
3,apec.5,https://www.apec.fr/cms/webservices/offre/publ...,2022-01-01,Sat Jan 1 21:22:02 2022,apec,,False,APEC_2022-01-01_5,,,...,,False,,,False,3.0,,,,True
4,apec.6,https://www.apec.fr/cms/webservices/offre/publ...,2022-01-01,Sat Jan 1 21:22:04 2022,apec,,False,APEC_2022-01-01_6,,,...,,False,,,False,5.0,,,,True


In [73]:
# For Pole Emploi data only
# Keep the offers whose salary column `salmin_etp` is filled in, then show the
# (rows, columns) shape before and after the filter
has_salary = data['salmin_etp'].notna()
df_sal = data[has_salary]
print(data.shape, df_sal.shape, sep="\n")

(420887, 51)
(319859, 51)


In [None]:
# For Jocas data only
# Keep the offers whose salary column `salary_min` is filled in, then show the
# (rows, columns) shape before and after the filter
has_salary = data['salary_min'].notna()
df_sal = data[has_salary]
print(data.shape, df_sal.shape, sep="\n")

In [75]:
# For Pole Emploi data
# Collapse every skill-requirement column into a single text column, one "sentence" per skill
# Missing values become empty strings so that str.join only ever sees strings
df_sal = df_sal.fillna('')
# All columns whose name contains "specificites"
skill_columns = df_sal.filter(regex="specificites", axis=1)
# Transpose so each offer is a column, then join that offer's skills with '.'
# NOTE(review): after this cell has run, the new `specificites` column also matches the regex,
# so running the cell a second time on the same frame folds it back into the join
df_sal['specificites'] = skill_columns.T.agg('.'.join)

In [74]:
# Random sample of at most 100,000 observations
tqdm.pandas()  # registers tqdm's progress_* helpers on pandas objects
N_SAMPLES = 100000
# DataFrame.sample (without replacement) raises ValueError when n exceeds the number of rows,
# so the request is capped at the population size; random_state=0 makes the draw repeatable
df = df_sal.sample(n=min(N_SAMPLES, len(df_sal)), random_state=0).copy()

## BERT Embedding

In [7]:
# Fetch the two XLM-RoBERTa modules from TensorFlow Hub:
# the text preprocessor first, then the L-12 / H-768 / A-12 encoder
bert_preprocess, bert_embedding = (
    hub.load(handle)
    for handle in (
        "https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_preprocess/1",
        "https://tfhub.dev/jeongukjae/xlm_roberta_multi_cased_L-12_H-768_A-12/1",
    )
)

2023-09-17 18:21:57.736147: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
2023-09-17 18:21:57.869226: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'trimmer_trim_roundrobintrimmer_generate_mask_while_tile_multiples_trimmer_trim_roundrobintrimmer_generate_mask_strided_slice_0' with dtype int32
	 [[{{node trimmer_trim_roundrobintrimmer_generate_mask_while_tile_multiples_trimmer_trim_roundrobintrimmer_generate_mask_strided_slice_0}}]]
2023-09-17 18:21:57.869379: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for plac

In [8]:
# Wrap both hub modules as Keras layers; trainable=False keeps their weights frozen
french_preprocessor = hub.KerasLayer(bert_preprocess, trainable=False)
# NOTE(review): input_shape=[] / dtype=tf.string describe a raw-string input, while the model
# feeds this layer the preprocessor's output — confirm these two arguments are needed
french_vectorizer = hub.KerasLayer(
    bert_embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=False,
)

## Model architecture

In [9]:
# Model architecture: the hub preprocessing and embedding layers are used unmodified,
# followed by a small convolutional regression head

# One scalar string (the offer text) per example
input1 = Input(shape=[], dtype='string')
preprocessor_layer = french_preprocessor(input1)
vectorizer_layer = french_vectorizer(preprocessor_layer)
# Use the per-token output so the 1-D convolution can slide along the token axis
bert_layer = vectorizer_layer['sequence_output']
conv1 = Conv1D(filters=64, kernel_size=4, activation='relu')(bert_layer)
# Keep the maximum activation of each filter over the whole sequence
pool1 = GlobalMaxPooling1D()(conv1)
# NOTE(review): the pooled tensor should already be 2-D, which would make this Flatten a no-op — confirm before removing
flat = Flatten()(pool1)
norm = BatchNormalization()(flat)
dense1 = Dense(64, activation='relu', kernel_initializer='RandomNormal')(norm)
# NOTE(review): a ReLU output unit cannot predict below 0 and passes no gradient while inactive;
# a linear output is the usual choice for a regression target — confirm this is intended
dense2 = Dense(1, activation='relu', kernel_initializer='RandomNormal')(dense1)
french_model = Model(inputs=input1, outputs=dense2)

# summary() prints the table itself and returns None, so wrapping it in print() added a stray "None"
french_model.summary()

2023-09-17 18:22:52.981526: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string and shape [?]
	 [[{{node inputs}}]]
2023-09-17 18:22:53.014282: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype string and shape [?]
	 [[{{node Placeholder}}]]
2023-09-17 18:22:53.123233: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_1' with dtype int32 and shape [?,128]
	 [[{{node inputs_1}}]]
2023-09-

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_mask': (Non  0           ['input_1[0][0]']                
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

2023-09-17 18:22:53.195671: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]]
2023-09-17 18:22:53.195763: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_2' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_2}}]]


## Training the model

In [None]:
from sklearn.model_selection import train_test_split

In [76]:
# Build the modelling table (offer id, text, log salary) for whichever source `df` comes from.
# The two sources use different column names, so selecting both sets unconditionally raised a
# KeyError on one of them; the source is now detected from its salary column.
if 'salmin_etp' in df.columns:
    # For Pole Emploi data
    # text column: specificites (skills) or dc_descriptifoffre (job description)
    dataset = df[['kc_offre', 'specificites', 'salmin_etp']].copy()
    dataset['salmin_etp'] = np.log(dataset['salmin_etp'])
else:
    # For Jocas data
    dataset = df[['id_jocas', 'description_full', 'salary_min']].copy()
    dataset['salary_min'] = np.log(dataset['salary_min'])
# .copy() makes `dataset` an independent frame, so the log transform is not a chained
# assignment on a slice of `df`

In [77]:
# Create the X train and test samples
test_size=0.2
# Text feature: the Pole Emploi skills column when present, otherwise the Jocas description.
# The column name used to be hard-coded to 'specificites', which raised KeyError on the Jocas table.
text_column = 'specificites' if 'specificites' in dataset.columns else 'description_full'
X = dataset[text_column]

# random_state=0 must stay identical to the one used for the y split so that rows stay aligned
x_train, x_test = train_test_split(X, test_size=test_size, random_state=0)

In [78]:
# Create the y train and test samples
test_size=0.2
# Target: the Pole Emploi salary column when present, otherwise the Jocas one.
# The column name used to be hard-coded to 'salmin_etp', which raised KeyError on the Jocas table.
target_column = 'salmin_etp' if 'salmin_etp' in dataset.columns else 'salary_min'
y = dataset[target_column]

# random_state=0 must stay identical to the one used for the X split so that rows stay aligned
y_train, y_test = train_test_split(y, test_size=test_size, random_state=0)

In [79]:
# Training the model

# Write the model to "french_model" each time the validation loss reaches a new minimum
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "french_model",
    monitor="val_loss",
    mode='min',
    save_best_only=True,
)
# Adam minimises the mean squared error; the RMSE is reported as an extra metric
rmse_metric = tf.keras.metrics.RootMeanSquaredError()
french_model.compile(loss='mean_squared_error', optimizer='adam', metrics=[rmse_metric])
# NOTE(review): the test split is also the validation data, so the checkpoint is selected on the
# same rows that are later used for the out-of-sample scores
french_model.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=4,
    validation_data=(x_test, y_test),
    callbacks=[checkpoint],
)

Epoch 1/4


2023-07-13 14:13:08.056330: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: model/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/roberta_pack_inputs/StatefulPartitionedCall/Trimmer/Trim/RoundRobinTrimmer/generate_mask/RaggedConcat/assert_equal_1/Assert/AssertGuard/branch_executed/_8591




2023-07-13 15:58:30.715364: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: model/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/roberta_pack_inputs/StatefulPartitionedCall/Trimmer/Trim/RoundRobinTrimmer/generate_mask/RaggedConcat/assert_equal_1/Assert/AssertGuard/branch_executed/_1055
2023-07-13 16:23:53.679033: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'input_2' with dtype string and shape [?]
	 [[{{node input_2}}]]
2023-07-13 16:23:53.919383: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' wit

INFO:tensorflow:Assets written to: french_model/assets


INFO:tensorflow:Assets written to: french_model/assets


Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7faa25c63050>

## Out-of-sample performance metrics for the Jocas data

In [27]:
# Out-of-sample prediction to evaluate performance
y_preds = french_model.predict(x_test)
# Put each held-out target next to its prediction; the rows keep y_test's index and the
# single-column prediction array is flattened to one value per row
y_result = pd.DataFrame({'ln_salary': y_test, 'predictions': y_preds.ravel()})
y_result.head(10)

2023-09-18 07:51:16.932022: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: model/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/roberta_pack_inputs/StatefulPartitionedCall/Trimmer/Trim/RoundRobinTrimmer/generate_mask/RaggedConcat/assert_equal_1/Assert/AssertGuard/branch_executed/_1505




Unnamed: 0,ln_salary,predictions
1135513,10,9.893615
1108081,10,9.96934
674126,10,9.967471
429061,10,9.838574
629934,10,9.839849
355145,10,9.857074
1307817,11,10.186847
282433,10,9.881597
533923,10,9.776075
1346234,10,9.977407


In [119]:
#Out-of-sample loss (MSE) and RMSE
score_jocas = french_model.evaluate(x_test, y_test)
# evaluate() returns the loss first, then the compiled metric
test_loss, test_rmse = score_jocas[0], score_jocas[1]
print("Test loss:", test_loss)
print("Test RMSE:", test_rmse)

Test loss: 0.00038792070699855685
Test RMSE: 0.0196957029402256


In [120]:
#Out-of-sample coefficient of determination (R^2)
from sklearn.metrics import r2_score
r2_jocas = r2_score(y_true=y_result['ln_salary'], y_pred=y_result['predictions'])
print(r2_jocas)

0.5870742064067624


## Out-of-sample performance metrics for the Pole Emploi skills data

In [38]:
# Training the model with the Pole Emploi skills data
# No cell above this one creates `pe_french_model`, so on a fresh kernel the compile call raised
# NameError. When the name is missing, the model is restored from the checkpoint directory;
# a model already present in the kernel is used as before.
# NOTE(review): assumes a "pe_french_model" checkpoint from the earlier epochs exists on disk — confirm
if 'pe_french_model' not in globals():
    pe_french_model = tf.keras.models.load_model("pe_french_model")

# Write the model to "pe_french_model" each time the validation loss reaches a new minimum
checkpoint = tf.keras.callbacks.ModelCheckpoint("pe_french_model", monitor="val_loss", mode='min', save_best_only=True)
pe_french_model.compile(loss='mean_squared_error', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError()])
# epochs=4 with initial_epoch=3 runs a single additional epoch, logged as "Epoch 4/4"
pe_french_model.fit(x_train, y_train, batch_size=16, epochs=4, validation_data=(x_test, y_test), callbacks=[checkpoint], initial_epoch=3)

Epoch 4/4


2023-09-20 07:52:41.425180: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: model_1/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/roberta_pack_inputs/StatefulPartitionedCall/Trimmer/Trim/RoundRobinTrimmer/generate_mask/RaggedConcat/assert_equal_1/Assert/AssertGuard/branch_executed/_8591




2023-09-20 09:36:27.024456: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: model_1/keras_layer/StatefulPartitionedCall/StatefulPartitionedCall/StatefulPartitionedCall/roberta_pack_inputs/StatefulPartitionedCall/Trimmer/Trim/RoundRobinTrimmer/generate_mask/RaggedConcat/assert_equal_1/Assert/AssertGuard/branch_executed/_1055
2023-09-20 10:00:59.671012: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'input_2' with dtype string and shape [?]
	 [[{{node input_2}}]]
2023-09-20 10:00:59.904737: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' w

INFO:tensorflow:Assets written to: pe_french_model_v2/assets


INFO:tensorflow:Assets written to: pe_french_model_v2/assets




<keras.callbacks.History at 0x7f99c45c1d90>

In [43]:
#Out-of-sample predictions
# Reload the checkpointed model from disk before predicting
pe_french_model = tf.keras.models.load_model('pe_french_model')
y_preds_pe = pe_french_model.predict(x_test)
# Put each held-out target next to its prediction; the rows keep y_test's index and the
# single-column prediction array is flattened to one value per row
y_result = pd.DataFrame({'ln_salary': y_test, 'predictions': y_preds_pe.ravel()})
y_result.head(10)

2023-09-20 23:12:12.891364: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'trimmer_trim_roundrobintrimmer_generate_mask_while_tile_multiples_trimmer_trim_roundrobintrimmer_generate_mask_strided_slice_0' with dtype int32
	 [[{{node trimmer_trim_roundrobintrimmer_generate_mask_while_tile_multiples_trimmer_trim_roundrobintrimmer_generate_mask_strided_slice_0}}]]
2023-09-20 23:12:12.891469: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'trimmer_trim_roundrobintrimmer_generate_mask_while_tile_multiples_trimmer_trim_roundrobintrimmer_generate_mask_strided_slice_0' with dtype int32
	 [[{{node trimmer_trim_roundrobintrimmer_



Unnamed: 0,ln_salary,predictions
86809,9.907915,9.947592
115890,9.977667,9.870551
41511,9.894185,9.870551
298568,10.181119,9.950929
87130,9.86804,9.878034
133646,9.934946,9.945965
41985,9.980449,9.86636
340769,9.92329,9.947592
362156,9.952278,9.948
79635,9.907915,9.947592


In [44]:
#Out-of-sample coefficient of determination (R^2)
from sklearn.metrics import r2_score
# Store the score under its own name: assigning it to `r2_score` replaced the imported
# function with a float, so any later call to r2_score failed until it was re-imported
r2_pe = r2_score(y_result['ln_salary'], y_result['predictions'])
print(r2_pe)

0.08050442166834648


## Out-of-sample performance metrics with the job description of the Pole Emploi data

In [44]:
#Out-of-sample coefficient of determination (R^2)
from sklearn.metrics import r2_score
# Store the score under its own name: assigning it to `r2_score` replaced the imported
# function with a float, so any later call to r2_score failed until it was re-imported
# NOTE(review): the recorded output equals the skills-data score, so `y_result` was presumably
# not rebuilt from a model trained on the job descriptions before this cell ran — confirm
r2_pe_description = r2_score(y_result['ln_salary'], y_result['predictions'])
print(r2_pe_description)

0.08050442166834648
