# Forecasting AI and ML Job Trends with SARIMA

At this stage, we perform Text Classification using the **DistilBERT**

## Dependencies

In [1]:
import os
import pandas as pd

import tensorflow as tf
from datasets import Dataset

from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


## Data Loading

In [2]:
filename = "data/b_job_postings_with_labels.parquet"
job_postings = pd.read_parquet(filename)

In [3]:
print(f"{len(job_postings):,} job postings loaded from {filename}")
job_postings.sample(5)

1,296,381 job postings loaded from data/b_job_postings_with_labels.parquet


Unnamed: 0,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills,skills_count,job_description,keyword_count,keyword_likelihood,label
796981,2024-01-20 03:26:50.076097+00,t,t,f,senior land development engineer,stantec,"london, ontario, canada",2024-01-14,london,canada,civil engineering technician,mid senior,onsite,"[land development, site grading, site servicin...",35,senior land development engineer civil enginee...,0,0,0
1233254,2024-01-21 04:31:36.376668+00,t,t,f,administrator,cld recruitment (leeds) ltd,"leeds, england, united kingdom",2024-01-14,maidstone,united kingdom,historic-site administrator,mid senior,onsite,"[mail processing, mail distribution, stationer...",15,administrator historic-site administrator cld ...,0,0,0
524186,2024-01-19 09:45:09.215838+00,t,t,f,acute care rn - intensive care unit *night shi...,health ecareers,"edmonds, wa",2024-01-12,everett,united states,anesthesiologist,mid senior,onsite,"[nursing, patient care, national provider bls,...",25,acute care rn - intensive care unit *night shi...,0,0,0
676916,2024-01-19 09:45:09.215838+00,t,t,f,yard associate (skillbridge),armor initiative,"jacksonville, fl",2024-01-13,jacksonville,united states,orderly,associate,onsite,"[yard associate, heavy equipment cleaning, yar...",18,yard associate (skillbridge) orderly armor ini...,0,0,0
71753,2024-01-19 15:38:31.076459+00,t,t,f,surgical technologist certified / non certifie...,mclaren health care,"mount clemens, mi",2024-01-16,macomb,united states,polysomnographic technician,mid senior,onsite,"[surgical technologist, surgical technology, b...",21,surgical technologist certified / non certifie...,0,0,0


## Job prediction using BERT

In [4]:
# Tokenizer for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# DistilBERT model for classification
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [5]:
def save_tf_datasets(train_ds, test_ds, save_dir="./data/tf_datasets"):
    os.makedirs(save_dir, exist_ok=True)

    # Save datasets
    tf.data.Dataset.save(train_ds, os.path.join(save_dir, "train"))
    tf.data.Dataset.save(test_ds, os.path.join(save_dir, "test"))

    print(f"Datasets saved to {save_dir}")

In [6]:
def create_tf_datasets(job_postings, tokenizer):
    # Tokenize the data
    def tokenize_function(train_data):
        return tokenizer(
            train_data["text"],
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="tf",  # Ensure TF tensors
        )

    print("Tokenizing data and creating datasets...")
    # Convert data into a Dataset object
    data = job_postings[["job_description", "label"]]
    data = data.rename(columns={"job_description": "text"})
    dataset = Dataset.from_dict(data)

    # Split the dataset into training and testing sets
    train_test_split = dataset.train_test_split(test_size=0.2)
    train_data = train_test_split["train"]
    test_data = train_test_split["test"]

    # Prepare training and test datasets
    train_encoded = train_data.map(tokenize_function, batched=True)
    test_encoded = test_data.map(tokenize_function, batched=True)

    print(f"Training data: {len(train_encoded):,} samples")
    print(f"Testing data: {len(test_encoded):,} samples")

    print("Creating TensorFlow datasets...")
    # Convert labels to tensors
    train_labels = tf.convert_to_tensor(train_data["label"], dtype=tf.int32)
    test_labels = tf.convert_to_tensor(test_data["label"], dtype=tf.int32)

    # Create TensorFlow datasets
    train_dataset = (
        tf.data.Dataset.from_tensor_slices(
            (
                {
                    "input_ids": train_encoded["input_ids"],
                    "attention_mask": train_encoded["attention_mask"],
                },
                train_labels,
            )
        )
        .shuffle(100)
        .batch(8)
    )

    test_dataset = tf.data.Dataset.from_tensor_slices(
        (
            {
                "input_ids": test_encoded["input_ids"],
                "attention_mask": test_encoded["attention_mask"],
            },
            test_labels,
        )
    ).batch(16)

    save_tf_datasets(train_dataset, test_dataset)

    return train_dataset, test_dataset

In [7]:
def load_tf_datasets(job_postings=None, tokenizer=None, load_dir="./data/tf_datasets3"):
    try:
        if not os.path.exists(load_dir):
            raise FileNotFoundError
            
        train_dataset = tf.data.Dataset.load(os.path.join(load_dir, "train"))
        test_dataset = tf.data.Dataset.load(os.path.join(load_dir, "test"))

        print("Datasets loaded from disk")

    except (FileNotFoundError, NameError):
        if job_postings is None or tokenizer is None:
            raise ValueError(
                "job_postings and tokenizer required when datasets not found"
            )

        print("Creating new datasets...")
        train_dataset, test_dataset = create_tf_datasets(job_postings, tokenizer)

    # Reapply dataset operations
    train_dataset = train_dataset.shuffle(100).batch(8)
    test_dataset = test_dataset.batch(16)

    return train_dataset, test_dataset

In [8]:
train_dataset, test_dataset = load_tf_datasets(job_postings[:10], tokenizer)

len(train_dataset), len(test_dataset)

Creating new datasets...
Tokenizing data and creating datasets...


Map: 100%|██████████| 8/8 [00:00<00:00, 533.91 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 437.43 examples/s]

Training data: 8 samples
Testing data: 2 samples
Creating TensorFlow datasets...
Datasets saved to ./data/tf_datasets





(1, 1)

In [9]:
# Model compilation
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        "./results/tf_checkpoints/model.keras",
        save_best_only=True,
        monitor="val_accuracy",
        save_format="keras",
    ),
    tf.keras.callbacks.EarlyStopping(patience=2, monitor="val_loss"),
    tf.keras.callbacks.TensorBoard(log_dir="./logs", update_freq="batch"),
]

In [10]:
# Model training
history = model.fit(
    train_dataset,
    epochs=3,
    validation_data=test_dataset,
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/3


ValueError: in user code:

    File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/tf_keras/src/engine/training.py", line 1381, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/tf_keras/src/engine/training.py", line 1370, in run_step  **
        outputs = model.train_step(data)
    File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 1672, in train_step
        y_pred = self(x, training=True)
    File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/tf_keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/var/folders/kd/9h3yrkps2vj0t9zs17vt349c0000gn/T/__autograph_generated_file3htvhr93.py", line 40, in tf__run_call_with_unpacked_inputs
        raise
    File "/var/folders/kd/9h3yrkps2vj0t9zs17vt349c0000gn/T/__autograph_generated_filecfunk608.py", line 17, in tf__call
        distilbert_output = ag__.converted_call(ag__.ld(self).distilbert, (), dict(input_ids=ag__.ld(input_ids), attention_mask=ag__.ld(attention_mask), head_mask=ag__.ld(head_mask), inputs_embeds=ag__.ld(inputs_embeds), output_attentions=ag__.ld(output_attentions), output_hidden_states=ag__.ld(output_hidden_states), return_dict=ag__.ld(return_dict), training=ag__.ld(training)), fscope)
    File "/var/folders/kd/9h3yrkps2vj0t9zs17vt349c0000gn/T/__autograph_generated_file3htvhr93.py", line 40, in tf__run_call_with_unpacked_inputs
        raise
    File "/var/folders/kd/9h3yrkps2vj0t9zs17vt349c0000gn/T/__autograph_generated_filemkqqrgwc.py", line 93, in tf__call
        embedding_output = ag__.converted_call(ag__.ld(self).embeddings, (ag__.ld(input_ids),), dict(inputs_embeds=ag__.ld(inputs_embeds)), fscope)
    File "/var/folders/kd/9h3yrkps2vj0t9zs17vt349c0000gn/T/__autograph_generated_file6wdrdefs.py", line 54, in tf__call
        final_embeddings = ag__.converted_call(ag__.ld(self).LayerNorm, (), dict(inputs=ag__.ld(final_embeddings)), fscope)

    ValueError: Exception encountered when calling layer 'tf_distil_bert_for_sequence_classification' (type TFDistilBertForSequenceClassification).
    
    in user code:
    
        File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 801, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 809, in call  *
            distilbert_output = self.distilbert(
        File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/tf_keras/src/utils/traceback_utils.py", line 70, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
        File "/var/folders/kd/9h3yrkps2vj0t9zs17vt349c0000gn/T/__autograph_generated_file3htvhr93.py", line 40, in tf__run_call_with_unpacked_inputs
            raise
        File "/var/folders/kd/9h3yrkps2vj0t9zs17vt349c0000gn/T/__autograph_generated_filemkqqrgwc.py", line 93, in tf__call
            embedding_output = ag__.converted_call(ag__.ld(self).embeddings, (ag__.ld(input_ids),), dict(inputs_embeds=ag__.ld(inputs_embeds)), fscope)
        File "/var/folders/kd/9h3yrkps2vj0t9zs17vt349c0000gn/T/__autograph_generated_file6wdrdefs.py", line 54, in tf__call
            final_embeddings = ag__.converted_call(ag__.ld(self).LayerNorm, (), dict(inputs=ag__.ld(final_embeddings)), fscope)
    
        ValueError: Exception encountered when calling layer 'distilbert' (type TFDistilBertMainLayer).
        
        in user code:
        
            File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/transformers/modeling_tf_utils.py", line 801, in run_call_with_unpacked_inputs  *
                return func(self, **unpacked_inputs)
            File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 454, in call  *
                embedding_output = self.embeddings(input_ids, inputs_embeds=inputs_embeds)  # (bs, seq_length, dim)
            File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/tf_keras/src/utils/traceback_utils.py", line 70, in error_handler  **
                raise e.with_traceback(filtered_tb) from None
            File "/var/folders/kd/9h3yrkps2vj0t9zs17vt349c0000gn/T/__autograph_generated_file6wdrdefs.py", line 54, in tf__call
                final_embeddings = ag__.converted_call(ag__.ld(self).LayerNorm, (), dict(inputs=ag__.ld(final_embeddings)), fscope)
        
            ValueError: Exception encountered when calling layer 'embeddings' (type TFEmbeddings).
            
            in user code:
            
                File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/transformers/models/distilbert/modeling_tf_distilbert.py", line 119, in call  *
                    final_embeddings = self.LayerNorm(inputs=final_embeddings)
                File "/Users/mzitoh/Desktop/USD-Source/data_mining/ai-ml-job-trends/.venv/lib/python3.11/site-packages/tf_keras/src/utils/traceback_utils.py", line 70, in error_handler  **
                    raise e.with_traceback(filtered_tb) from None
            
                ValueError: Exception encountered when calling layer 'LayerNorm' (type LayerNormalization).
                
                Cannot reshape a tensor with 768 elements to shape [1,1,128,1] (128 elements) for '{{node tf_distil_bert_for_sequence_classification/distilbert/embeddings/LayerNorm/Reshape}} = Reshape[T=DT_FLOAT, Tshape=DT_INT32](tf_distil_bert_for_sequence_classification/distilbert/embeddings/LayerNorm/Reshape/ReadVariableOp, tf_distil_bert_for_sequence_classification/distilbert/embeddings/LayerNorm/Reshape/shape)' with input shapes: [768], [4] and with input tensors computed as partial shapes: input[1] = [1,1,128,1].
                
                Call arguments received by layer 'LayerNorm' (type LayerNormalization):
                  • inputs=tf.Tensor(shape=(None, None, 128, 768), dtype=float32)
            
            
            Call arguments received by layer 'embeddings' (type TFEmbeddings):
              • input_ids=tf.Tensor(shape=(None, None, 128), dtype=int32)
              • position_ids=None
              • inputs_embeds=None
              • training=True
        
        
        Call arguments received by layer 'distilbert' (type TFDistilBertMainLayer):
          • input_ids=tf.Tensor(shape=(None, None, 128), dtype=int32)
          • attention_mask=tf.Tensor(shape=(None, None, 128), dtype=int32)
          • head_mask=None
          • inputs_embeds=None
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
    
    
    Call arguments received by layer 'tf_distil_bert_for_sequence_classification' (type TFDistilBertForSequenceClassification):
      • input_ids={'input_ids': 'tf.Tensor(shape=(None, None, 128), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(None, None, 128), dtype=int32)'}
      • attention_mask=None
      • head_mask=None
      • inputs_embeds=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • labels=None
      • training=True


In [None]:
# Evaluate the model
results = model.evaluate(test_dataset)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

In [None]:
# Predict on a sample job description
sample_job = "Looking for a data scientist skilled in machine learning and data analysis."
encoded_input = tokenizer(sample_job, return_tensors="tf", truncation=True, padding="max_length", max_length=128)
output = model(encoded_input)
prediction = tf.argmax(output.logits, axis=-1).numpy()[0]

print(f"Predicted label: {'AI skills required' if prediction == 1 else 'No AI skills required'}")