# Load the dataset

In [None]:
%pip install tf2onnx --quiet

In [None]:
%pip install "keras-tuner"

In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras_tuner as kt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import json
import tf2onnx
import onnx

# Preprocess the dataset

In [12]:
# Read the job-title master table from disk and print a summary of its columns
DATASET_CSV = '/home/darshan/Documents/Master_Title_Department.csv'
data = pd.read_csv(DATASET_CSV)

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3064321 entries, 0 to 3064320
Data columns (total 4 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   jobTitle       object
 1   jobLevel       object
 2   jobDepartment  object
 3   jobFunction    object
dtypes: object(4)
memory usage: 93.5+ MB


In [13]:
# Build the cleaned frame as a NEW object instead of overwriting `data`.
# Overwriting the label columns in place made this cell non-idempotent: a
# second run fitted the encoders on integer codes, so `classes_` no longer
# held the original label strings that are later written to JSON.
data_clean = (
    data
    .dropna(subset=['jobTitle'])  # drop rows with a missing jobTitle
    .assign(jobTitle=lambda frame: frame['jobTitle'].astype(str))  # ensure string titles
)

# Encode the two target columns as integer class ids
label_encoder_level = LabelEncoder()
label_encoder_department = LabelEncoder()

data_clean['jobLevel'] = label_encoder_level.fit_transform(data_clean['jobLevel'])
data_clean['jobDepartment'] = label_encoder_department.fit_transform(data_clean['jobDepartment'])

# Split the data; titles and both label columns are split with the same row assignment
x_train, x_test, y_train_level, y_test_level, y_train_department, y_test_department = train_test_split(
    data_clean['jobTitle'], data_clean['jobLevel'], data_clean['jobDepartment'],
    test_size=0.01, random_state=42
)

# Tokenize the job titles (vocabulary is learned from the training split only)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

# Pad the sequences to the length of the longest training title
max_length = max(len(seq) for seq in x_train_seq)
x_train_padded = tf.keras.preprocessing.sequence.pad_sequences(x_train_seq, maxlen=max_length)
x_test_padded = tf.keras.preprocessing.sequence.pad_sequences(x_test_seq, maxlen=max_length)

# Define Models

In [14]:
# Each model input is one padded sequence of max_length token ids
input_shape = (max_length,)

In [None]:
# Define the model for jobLevel: embed token ids, average the embeddings over
# the sequence, then classify with a small dense head.
level_inputs = tf.keras.layers.Input(shape=input_shape)
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and
# redundant here because the Input layer already fixes the sequence length.
level_hidden = tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=56)(level_inputs)
level_hidden = tf.keras.layers.GlobalAveragePooling1D()(level_hidden)
level_hidden = tf.keras.layers.Dense(56, activation='relu')(level_hidden)
# One softmax output per encoded jobLevel class
level_outputs = tf.keras.layers.Dense(len(label_encoder_level.classes_), activation='softmax')(level_hidden)

# Create the model
model_level = tf.keras.Model(inputs=level_inputs, outputs=level_outputs)

# compile & fit (labels are integer class ids, hence the sparse loss)
model_level.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_level.fit(x_train_padded, y_train_level, epochs=2, validation_data=(x_test_padded, y_test_level))

2024-12-12 16:23:35.707515: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Epoch 1/2


2024-12-12 16:23:36.068218: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 266963576 exceeds 10% of free system memory.


[1m 8171/94803[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m35:42[0m 25ms/step - accuracy: 0.8826 - loss: 0.3673

In [None]:
# Define the model for jobDepartment: embed token ids, average the embeddings
# over the sequence, then classify with a small dense head.
department_inputs = tf.keras.layers.Input(shape=input_shape)
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and
# redundant here because the Input layer already fixes the sequence length.
department_hidden = tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=56)(department_inputs)
department_hidden = tf.keras.layers.GlobalAveragePooling1D()(department_hidden)
department_hidden = tf.keras.layers.Dense(56, activation='relu')(department_hidden)
# One softmax output per encoded jobDepartment class
department_outputs = tf.keras.layers.Dense(len(label_encoder_department.classes_), activation='softmax')(department_hidden)

# Create the model
model_department = tf.keras.Model(inputs=department_inputs, outputs=department_outputs)

# compile & fit (labels are integer class ids, hence the sparse loss)
model_department.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_department.fit(x_train_padded, y_train_department, epochs=1, validation_data=(x_test_padded, y_test_department))

# Benchmark the model

In [None]:
# Score the jobLevel classifier on the held-out split:
# class probabilities -> arg-max class id -> accuracy against the true ids
y_pred_prob_level = model_level.predict(x_test_padded)
y_pred_level = np.argmax(y_pred_prob_level, axis=1)
accuracy_level = accuracy_score(y_test_level, y_pred_level)

# Same procedure for the jobDepartment classifier
y_pred_prob_department = model_department.predict(x_test_padded)
y_pred_department = np.argmax(y_pred_prob_department, axis=1)
accuracy_department = accuracy_score(y_test_department, y_pred_department)

# The overall figure is the plain average of the two per-task accuracies
overall_accuracy = (accuracy_level + accuracy_department) / 2

print(f'Job Level Accuracy: {accuracy_level}')
print(f'Job Department Accuracy: {accuracy_department}')
print(f'Overall Accuracy: {overall_accuracy}')


In [None]:
# Import from `keras_tuner`, the module name of the installed `keras-tuner`
# package (and the one imported at the top of the notebook); `kerastuner` is
# the legacy name.
from keras_tuner import RandomSearch
from tensorflow.keras.callbacks import EarlyStopping


# Define a function to build the model with tunable parameters
def build_model(hp):
    """Build and compile a jobLevel classifier for one tuner trial.

    Args:
        hp: hyperparameter container supplied by Keras Tuner. Two integer
            hyperparameters are registered on it: 'embedding_dim' and
            'units', each ranging from 8 to 64 in steps of 8.

    Returns:
        A compiled tf.keras.Model (Embedding -> GlobalAveragePooling1D ->
        Dense(relu) -> Dense(softmax)) using the adam optimizer and sparse
        categorical cross-entropy loss, tracking accuracy.
    """
    level_inputs_2 = tf.keras.layers.Input(shape=input_shape)
    # `input_length` is not passed: it is deprecated in Keras 3 and the Input
    # layer already fixes the sequence length.
    level_hidden_2 = tf.keras.layers.Embedding(
        input_dim=len(tokenizer.word_index) + 1,
        output_dim=hp.Int('embedding_dim', min_value=8, max_value=64, step=8))(level_inputs_2)
    level_hidden_2 = tf.keras.layers.GlobalAveragePooling1D()(level_hidden_2)
    level_hidden_2 = tf.keras.layers.Dense(
        hp.Int('units', min_value=8, max_value=64, step=8),
        activation='relu')(level_hidden_2)
    level_outputs_2 = tf.keras.layers.Dense(len(label_encoder_level.classes_), activation='softmax')(level_hidden_2)

    model = tf.keras.Model(inputs=level_inputs_2, outputs=level_outputs_2)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])
    return model


# Set up Keras Tuner to search for best hyperparameters
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,  # Number of different combinations to try
    executions_per_trial=1,
    directory='hyperparam_tuning',
    project_name='job_level_model')

# Run the hyperparameter search
tuner.search(x_train_padded, y_train_level, epochs=2, validation_data=(x_test_padded, y_test_level))

# Rebuild a fresh model from the best hyperparameters and train it
best_hp = tuner.get_best_hyperparameters()[0]
model_level_2 = tuner.hypermodel.build(best_hp)
early_stopping_2 = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
# build_model registers only 'embedding_dim' and 'units', so the previous
# best_hp.get('batch_size') lookup raised KeyError. No batch size was tuned;
# fit with the Keras default, which is also what the search itself used.
model_level_2.fit(x_train_padded, y_train_level, epochs=2,
                  validation_data=(x_test_padded, y_test_level), callbacks=[early_stopping_2])


In [None]:
# Evaluate the tuned jobLevel model (model_level_2) on the test set.
# This cell previously re-scored the baseline `model_level`, which only
# repeated the benchmark above and never measured the tuned model. Results go
# into *_2 names so the baseline metrics stay available for comparison.
y_pred_prob_level_2 = model_level_2.predict(x_test_padded)

# Convert the probabilities to class labels
y_pred_level_2 = np.argmax(y_pred_prob_level_2, axis=1)

# Calculate the accuracy of the tuned model
accuracy_level_2 = accuracy_score(y_test_level, y_pred_level_2)

print(f'Job Level Accuracy: {accuracy_level_2}')

# Save Models and config

In [None]:
# Save the tokenizer and label encoders

# Full tokenizer configuration. NOTE: to_json() already returns a JSON string,
# so json.dump stores it as a JSON-encoded string; readers must json.load the
# file first and then parse the resulting string.
with open('/kaggle/working/tokenizer.json', 'w') as f:
    json.dump(tokenizer.to_json(), f)

# Minimal tokenizer export: only the vocabulary and the filter characters.
# Previously the full tokenizer.to_json() was dumped here as well, leaving
# `tokenizer_min` unused and this file a duplicate of tokenizer.json.
with open('/kaggle/working/tokenizer_min.json', 'w') as f:
    tokenizer_min = {"word_index" : tokenizer.word_index, "filters" : tokenizer.filters}
    json.dump(tokenizer_min, f)

# Class names in encoder order: list position i is the label for class id i
with open('/kaggle/working/label_encoder_level.json', 'w') as f:
    json.dump(label_encoder_level.classes_.tolist(), f)

with open('/kaggle/working/label_encoder_department.json', 'w') as f:
    json.dump(label_encoder_department.classes_.tolist(), f)

In [None]:
# Persist both trained classifiers as HDF5 files (department first, then level)
for _model, _h5_path in (
    (model_department, '/kaggle/working/department.h5'),
    (model_level, '/kaggle/working/level.h5'),
):
    _model.save(_h5_path)

In [None]:
# Export both classifiers to ONNX.
# The two models share the same input signature, so it is read once from model_level.
level_input = model_level.inputs[0]
input_signature = [tf.TensorSpec(level_input.shape, level_input.dtype)]

# Department model -> department.onnx
onnx_model_department, _ = tf2onnx.convert.from_keras(
    model_department, input_signature=input_signature, opset=13
)
onnx.save(onnx_model_department, 'department.onnx')

# Level model -> level.onnx
onnx_mode_level, _ = tf2onnx.convert.from_keras(
    model_level, input_signature=input_signature, opset=13
)
onnx.save(onnx_mode_level, 'level.onnx')

# Testing

In [None]:
# Function to preprocess input job title
def preprocess_input(job_title, tokenizer, max_length, verbose=True):
    """Convert one job title into a padded sequence of token ids.

    Args:
        job_title: the title text to encode.
        tokenizer: object exposing texts_to_sequences(), e.g. the fitted
            Keras Tokenizer used for training.
        max_length: length every sequence is padded (or truncated) to.
        verbose: when True (the default, matching the original behaviour),
            print the raw and the padded sequence for inspection. Pass False
            to use the helper without console output.

    Returns:
        The result of pad_sequences for the single title, i.e. one padded
        row of token ids.
    """
    # The title is wrapped in a list because texts_to_sequences works on a batch
    seq = tokenizer.texts_to_sequences([job_title])
    if verbose:
        print(seq)
    # Pad the sequence
    padded_seq = tf.keras.preprocessing.sequence.pad_sequences(seq, maxlen=max_length)
    if verbose:
        print(padded_seq)
    return padded_seq

# Run the preprocessing helper on a sample title and display the padded ids
job_title = "CFO"

input_seq = preprocess_input(job_title, tokenizer=tokenizer, max_length=max_length)
input_seq