# Forecasting AI and ML Job Trends with SARIMA

At this stage, we perform text classification with **DistilBERT**, fine-tuning it to predict whether a job posting requires AI skills.

## Dependencies

In [None]:
import os
import pandas as pd

import tensorflow as tf
from datasets import Dataset

from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

## Data Loading

In [61]:
# Parquet file holding the job postings together with their `label` column.
filename = "data/b_job_postings_with_labels.parquet"
job_postings = pd.read_parquet(path=filename)

In [62]:
# Report how many rows were read, then show a few random rows as a sanity check.
print(f"{len(job_postings):,} job postings loaded from {filename}")
job_postings.sample(n=5)

1,296,381 job postings loaded from data/b_job_postings_with_labels.parquet


Unnamed: 0,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills,skills_count,job_description,keyword_count,keyword_likelihood,label
401008,2024-01-19 09:45:09.215838+00,t,t,f,summer camp instructor,phillip and patricia frost museum of science,"miami, fl",2024-01-13,highland park,united states,genealogist,mid senior,onsite,"[science education, teaching, astronomy, aeron...",32,summer camp instructor genealogist phillip and...,1,0,0
520854,2024-01-19 09:45:09.215838+00,t,t,f,housekeeper (part time),touchpoint support services,"newburgh, in",2024-01-12,indiana,united states,cleaner,mid senior,onsite,"[cleaning, housekeeping, sweeping, scrubbing, ...",23,housekeeper (part time) cleaner touchpoint sup...,0,0,0
924911,2024-01-21 15:49:51.844651+00,t,t,f,food & beverage manager - casual dining,"full house resorts, inc","waukegan, il",2024-01-16,illinois,united states,food-and-beverage controller,mid senior,onsite,"[guest service, customer service, complaint re...",19,food & beverage manager - casual dining food-a...,0,0,0
100050,2024-01-19 18:02:51.789794+00,t,t,f,radiologic technologist,doylestown health,"doylestown, pa",2024-01-14,northampton,united states,radiologic technologist,mid senior,onsite,"[xray imaging, dexa imaging, digital radiograp...",12,radiologic technologist radiologic technologis...,0,0,0
353724,2024-01-19 09:45:09.215838+00,t,t,f,"family practice-with ob physician - $275,000/y...",doccafe,"rochester, ny",2024-01-13,williamson,united states,family practitioner,mid senior,onsite,"[physician, family practice, obstetrics, healt...",13,"family practice-with ob physician - $275,000/y...",0,0,0


## Job prediction using DistilBERT

In [63]:
# The tokenizer and the classifier are both loaded from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = "distilbert-base-uncased"

# Tokenizer for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

# DistilBERT with a 2-label classification head. The head weights are newly
# initialised (see the loading message below), so the model must be fine-tuned
# before its predictions are meaningful.
model = TFDistilBertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=2,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [None]:
def create_tf_datasets(postings=None, text_tokenizer=None):
    """Tokenize the job postings and build batched train/test ``tf.data`` datasets.

    Args:
        postings: DataFrame with ``job_description`` and ``label`` columns.
            Defaults to the notebook-level ``job_postings``.
        text_tokenizer: Tokenizer used to encode the descriptions.
            Defaults to the notebook-level ``tokenizer``.

    Returns:
        ``(train_dataset, test_dataset)``. The training dataset is shuffled
        with a 100-element buffer and batched by 8; the test dataset is
        batched by 16. Each element is
        ``({"input_ids": ..., "attention_mask": ...}, label)``.

    Side effects:
        Writes both datasets to ``./data/tf_datasets/train`` and
        ``./data/tf_datasets/test``.
    """
    # Fall back to the notebook globals so the zero-argument call keeps
    # working; callers may also pass both inputs explicitly.
    if postings is None:
        postings = job_postings
    if text_tokenizer is None:
        text_tokenizer = tokenizer

    def tokenize_function(examples):
        # Pad/truncate every description to 128 tokens so rows stack into batches.
        return text_tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="tf",
        )

    def to_tf_dataset(encoded_split):
        # Pair the model inputs with the integer labels of the same split.
        features = {
            "input_ids": encoded_split["input_ids"],
            "attention_mask": encoded_split["attention_mask"],
        }
        labels = tf.convert_to_tensor(encoded_split["label"], dtype=tf.int32)
        return tf.data.Dataset.from_tensor_slices((features, labels))

    print("Tokenizing data and creating datasets...")
    data = postings[["job_description", "label"]].rename(
        columns={"job_description": "text"}
    )
    # from_pandas is the DataFrame constructor; the DataFrame index carries no
    # information the model needs, so it is not kept as a column.
    dataset = Dataset.from_pandas(data, preserve_index=False)

    # Hold out 20% of the postings for testing.
    splits = dataset.train_test_split(test_size=0.2)
    train_encoded = splits["train"].map(tokenize_function, batched=True)
    test_encoded = splits["test"].map(tokenize_function, batched=True)

    print(f"Training data: {len(train_encoded):,} samples")
    print(f"Testing data: {len(test_encoded):,} samples")

    print("Creating TensorFlow datasets...")
    train_dataset = to_tf_dataset(train_encoded).shuffle(100).batch(8)
    test_dataset = to_tf_dataset(test_encoded).batch(16)

    # Persist the (already batched) datasets so the slow tokenization step
    # does not have to be repeated.
    save_dir = "./data/tf_datasets"
    os.makedirs(save_dir, exist_ok=True)
    tf.data.experimental.save(train_dataset, os.path.join(save_dir, "train"))
    tf.data.experimental.save(test_dataset, os.path.join(save_dir, "test"))
    print(f"Datasets saved to {save_dir}")

    return train_dataset, test_dataset

Map: 100%|██████████| 1037104/1037104 [13:47<00:00, 1252.94 examples/s]
Map: 100%|██████████| 259277/259277 [03:27<00:00, 1251.48 examples/s]


Training data: 1,037,104 samples
Testing data: 259,277 samples


In [None]:
# Slow step: tokenizes every posting (the two Map passes took roughly 17 minutes
# in the recorded run) and also writes the datasets under ./data/tf_datasets.
train_dataset, test_dataset = create_tf_datasets()

In [None]:
def save_tf_datasets(train_ds, test_ds, save_dir='./data/tf_datasets'):
    """Write the train and test datasets into ``save_dir``.

    Args:
        train_ds: Dataset stored in the ``train`` sub-directory.
        test_ds: Dataset stored in the ``test`` sub-directory.
        save_dir: Target directory; created if it does not exist.
    """
    os.makedirs(save_dir, exist_ok=True)

    # One sub-directory per split, written train first and then test.
    for split_name, split_ds in (("train", train_ds), ("test", test_ds)):
        tf.data.experimental.save(split_ds, os.path.join(save_dir, split_name))

    print(f"Datasets saved to {save_dir}")


save_tf_datasets(train_dataset, test_dataset)

In [None]:
def load_tf_datasets(job_postings=None, tokenizer=None, load_dir="./data/tf_datasets"):
    """Load the saved TensorFlow datasets, creating and saving them if absent.

    Args:
        job_postings: Job postings DataFrame. Only needed when nothing is saved
            under ``load_dir`` and the datasets must be created.
        tokenizer: Tokenizer. Only needed when the datasets must be created.
        load_dir: Directory holding the ``train`` and ``test`` dataset folders.

    Returns:
        ``(train_dataset, test_dataset)``, both already batched.

    Raises:
        ValueError: If no saved datasets exist under ``load_dir`` and
            ``job_postings`` or ``tokenizer`` is missing.

    Note:
        Creation is delegated to ``create_tf_datasets()``, which reads the
        notebook-level ``job_postings`` and ``tokenizer``; the arguments here
        confirm that those inputs are available before the slow step starts.
    """
    train_path = os.path.join(load_dir, "train")
    test_path = os.path.join(load_dir, "test")

    # Check for the saved folders explicitly instead of relying on an
    # exception from the loader to detect a missing dataset.
    if os.path.isdir(train_path) and os.path.isdir(test_path):
        # element_spec is omitted so the structure is taken from what was saved.
        train_dataset = tf.data.experimental.load(train_path)
        test_dataset = tf.data.experimental.load(test_path)

        # The saved elements are already batches (create_tf_datasets batches
        # before saving), so batching again would nest them. Only the batch
        # order is reshuffled for training.
        train_dataset = train_dataset.shuffle(100)
        return train_dataset, test_dataset

    # Nothing saved yet: build the datasets from scratch.
    if job_postings is None or tokenizer is None:
        raise ValueError(
            "job_postings and tokenizer required when datasets not found"
        )

    print("Creating new datasets...")
    train_dataset, test_dataset = create_tf_datasets()

    # Save under load_dir so the next call can take the fast path above.
    os.makedirs(load_dir, exist_ok=True)
    tf.data.experimental.save(train_dataset, train_path)
    tf.data.experimental.save(test_dataset, test_path)
    print(f"Datasets saved to {load_dir}")

    return train_dataset, test_dataset

In [None]:
# Model compilation
# The loss is computed from logits (from_logits=True) against integer labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5)

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Keep only the checkpoint with the best validation accuracy.
# NOTE(review): confirm the installed Keras version accepts `save_format` here.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    "./results/tf_checkpoints/model.keras",
    save_best_only=True,
    monitor="val_accuracy",
    save_format="keras",
)
# Stop once validation loss has not improved for two consecutive epochs.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(patience=2, monitor="val_loss")
# Per-batch logging for TensorBoard.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="./logs", update_freq="batch")

callbacks = [checkpoint_callback, early_stopping_callback, tensorboard_callback]

# Model training
# NOTE(review): the test split is also used as validation data, so checkpoint
# selection and early stopping see the same data that is evaluated afterwards.
history = model.fit(
    train_dataset,
    epochs=3,
    validation_data=test_dataset,
    verbose=1,
    callbacks=callbacks,
)

In [None]:
# Evaluate on the test dataset; `results` holds the loss followed by the
# compiled metrics, i.e. [loss, accuracy].
results = model.evaluate(x=test_dataset)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

In [None]:
# Predict on a sample job description
sample_job = "Looking for a data scientist skilled in machine learning and data analysis."

# Encode with the same truncation/padding settings used when building the datasets.
encoded_input = tokenizer(
    sample_job,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="tf",
)
output = model(encoded_input)

# A single description was passed in, so take the class id of the first row.
predicted_ids = tf.argmax(output.logits, axis=-1)
prediction = predicted_ids.numpy()[0]

print(f"Predicted label: {'AI skills required' if prediction == 1 else 'No AI skills required'}")