# Forecasting AI and ML Job Trends with SARIMA

At this stage, we perform Sentiment and Context Analysis using NLP Techniques

- ***Contextual Skill Analysis***: Uses Named Entity Recognition (NER) to understand how AI skills are described in job postings.
- ***Sentiment Analysis***: Determines employer sentiment around AI skills (e.g., "essential," "preferred") to assess demand urgency.

Model used: **BERT**

## Dependencies

In [34]:
import re
from collections import Counter
import time

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import tensorflow as tf
from datasets import Dataset

from transformers import TFBertForSequenceClassification, BertTokenizer
from transformers import logging

In [None]:
logging.set_verbosity_info()

## Data Loading

In [None]:
filename = "data/b_job_postings_with_labels.parquet"
job_postings = pd.read_parquet(filename)

In [None]:
print(f"{len(job_postings):,} job postings loaded from {filename}")
job_postings.sample(5)

## Job prediction using BERT

In [35]:
# Load the BERT tokenizer and model for sequence classification (binary classification)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

loading file vocab.txt from cache at /Users/mzitoh/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /Users/mzitoh/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/tokenizer_config.json
loading file tokenizer.json from cache at /Users/mzitoh/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/tokenizer.json
loading file chat_template.jinja from cache at None
loading configuration file config.json from cache at /Users/mzitoh/.cache/huggingface/hub/models--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob"

In [36]:
# Tokenize the job descriptions
def tokenize_function(data):
    tokens = tokenizer(
        [item['text'] for item in data],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="tf"
    )
    labels = [item['label'] for item in data]
    return tokens, tf.convert_to_tensor(labels)


# Convert data into a Dataset object
data = job_postings[:10][["job_description", "label"]]
data = data.rename(columns={"job_description": "text"})

dataset = Dataset.from_dict(data)

# Split the dataset into training and testing sets
train_test_split = dataset.train_test_split(test_size=0.2)
train_data = train_test_split['train']
test_data = train_test_split['test']

print(f"Training data: {len(train_data):,} samples")
print(f"Testing data: {len(test_data):,} samples")

# Prepare training and test datasets
train_tokens, train_labels = tokenize_function(train_data)
test_tokens, test_labels = tokenize_function(test_data)

# Prepare TensorFlow Dataset objects
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_tokens), train_labels
)).batch(8)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_tokens), test_labels
)).batch(16)

Training data: 8 samples
Testing data: 2 samples


In [40]:
# Compile the model
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=["accuracy"])

In [None]:
# Custom training loop
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

@tf.function
def train_step(inputs):
	tokens, labels = inputs
	with tf.GradientTape() as tape:
		logits = model(tokens, training=True).logits
		loss = loss_fn(labels, logits)
	gradients = tape.gradient(loss, model.trainable_variables)
	optimizer.apply_gradients(zip(gradients, model.trainable_variables))
	train_acc_metric.update_state(labels, logits)
	return loss

@tf.function
def val_step(inputs):
	tokens, labels = inputs
	val_logits = model(tokens, training=False).logits
	val_acc_metric.update_state(labels, val_logits)

# Training loop
epochs = 3
for epoch in range(epochs):
	print(f"\nStart of epoch {epoch + 1}")
	
	# Training
	for step, batch in enumerate(train_dataset):
		loss = train_step(batch)
		if step % 50 == 0:
			print(f"Training loss at step {step}: {loss:.4f}")
	
	# Validation
	for batch in test_dataset:
		val_step(batch)
	
	train_acc = train_acc_metric.result()
	val_acc = val_acc_metric.result()
	print(f"Training accuracy: {train_acc:.4f}")
	print(f"Validation accuracy: {val_acc:.4f}")
	
	train_acc_metric.reset_states()
	val_acc_metric.reset_states() Custom training loop
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

@tf.function
def train_step(inputs):
	tokens, labels = inputs
	with tf.GradientTape() as tape:
		logits = model(tokens, training=True).logits
		loss = loss_fn(labels, logits)
	gradients = tape.gradient(loss, model.trainable_variables)
	optimizer.apply_gradients(zip(gradients, model.trainable_variables))
	train_acc_metric.update_state(labels, logits)
	return loss

@tf.function
def val_step(inputs):
	tokens, labels = inputs
	val_logits = model(tokens, training=False).logits
	val_acc_metric.update_state(labels, val_logits)

# Training loop
epochs = 3
for epoch in range(epochs):
	print(f"\nStart of epoch {epoch + 1}")
	
	# Training
	for step, batch in enumerate(train_dataset):
		loss = train_step(batch)
		if step % 50 == 0:
			print(f"Training loss at step {step}: {loss:.4f}")
	
	# Validation
	for batch in test_dataset:
		val_step(batch)
	
	train_acc = train_acc_metric.result()
	val_acc = val_acc_metric.result()
	print(f"Training accuracy: {train_acc:.4f}")
	print(f"Validation accuracy: {val_acc:.4f}")
	
	train_acc_metric.reset_states()
	val_acc_metric.reset_states() Custom training loop
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=3e-5)
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
val_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()

@tf.function
def train_step(inputs):
	tokens, labels = inputs
	with tf.GradientTape() as tape:
		logits = model(tokens, training=True).logits
		loss = loss_fn(labels, logits)
	gradients = tape.gradient(loss, model.trainable_variables)
	optimizer.apply_gradients(zip(gradients, model.trainable_variables))
	train_acc_metric.update_state(labels, logits)
	return loss

@tf.function
def val_step(inputs):
	tokens, labels = inputs
	val_logits = model(tokens, training=False).logits
	val_acc_metric.update_state(labels, val_logits)

# Training loop
epochs = 3
for epoch in range(epochs):
	print(f"\nStart of epoch {epoch + 1}")
	
	# Training
	for step, batch in enumerate(train_dataset):
		loss = train_step(batch)
		if step % 50 == 0:
			print(f"Training loss at step {step}: {loss:.4f}")
	
	# Validation
	for batch in test_dataset:
		val_step(batch)
	
	train_acc = train_acc_metric.result()
	val_acc = val_acc_metric.result()
	print(f"Training accuracy: {train_acc:.4f}")
	print(f"Validation accuracy: {val_acc:.4f}")
	
	train_acc_metric.reset_states()
	val_acc_metric.reset_states()

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 45)

In [None]:
# Evaluate the model
results = model.evaluate(test_dataset)
print(f"Evaluation Results: {results}")
