# Forecasting AI and ML Job Trends with SARIMA

At this stage, we perform sentiment and context analysis using NLP techniques:

- ***Contextual Skill Analysis***: Uses Named Entity Recognition (NER) to understand how AI skills are described in job postings.
- ***Sentiment Analysis***: Determines employer sentiment around AI skills (e.g., "essential," "preferred") to assess demand urgency.

Model used: **BERT**

## Dependencies

In [30]:
import re
from collections import Counter
import time

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset

## Data Loading

In [39]:
# Location of the parquet file with the job postings used in this notebook.
filename = "data/b_job_postings_ai_ml_ds.parquet"
# Read the full file into a DataFrame.
ai_ml_jobs = pd.read_parquet(path=filename)

In [49]:
# Report how many rows were read, then preview five randomly drawn postings.
n_postings = len(ai_ml_jobs)
print(f"{n_postings:,} job postings loaded from {filename}")
ai_ml_jobs.sample(n=5)

62,621 job postings loaded from data/b_job_postings_ai_ml_ds.parquet


Unnamed: 0,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills,skills_count,keyword_count,keyword_likelihood,job_description,label
51957,2024-01-19 14:00:03.539492+00,t,t,f,sr. ml engineer,typeface,"palo alto, ca",2024-01-17,santa clara,united states,top former,mid senior,onsite,"[machine learning, generative ai, python, c/c+...",13,4,2,"machine learning, generative ai, python, c/c++...",1
450318,2024-01-19 09:45:09.215838+00,t,t,f,lead quality engineer,dmsi software,"omaha, ne",2024-01-12,nebraska,united states,aeronautical-design engineer,mid senior,onsite,"[quality assurance, testing, automation framew...",30,2,1,"quality assurance, testing, automation framewo...",1
1001067,2024-01-19 16:05:03.161067+00,t,t,f,"anesthesiologist – portland, or",sound physicians,"portland, or",2024-01-14,newberg,united states,anesthesiologist,mid senior,onsite,"[anesthesiology, orthopedics, general anesthes...",24,2,1,"anesthesiology, orthopedics, general anesthesi...",1
89433,2024-01-19 17:08:51.087778+00,t,t,f,spacecraft engineer - adcs,evona,"westminster, co",2024-01-14,boulder,united states,reliability engineer,mid senior,onsite,[attitude determination and control engineerin...,19,3,2,attitude determination and control engineering...,1
204192,2024-01-19 09:45:09.215838+00,t,t,f,area bridge digital design and visualization lead,hdr,greater cleveland,2024-01-13,cleveland heights,united states,modeler,mid senior,onsite,"[digital design & delivery, bim, 3d modeling, ...",28,2,1,"digital design & delivery, bim, 3d modeling, s...",1


## Job prediction using BERT

In [61]:
# Postings whose keyword_likelihood is strictly greater than this get label 1.
KEYWORD_LIKELIHOOD_THRESHOLD = 2

# Build the job_description text by joining each posting's skill list into a
# single comma-separated string.
ai_ml_jobs['job_description'] = ai_ml_jobs['job_skills'].apply(', '.join)

# Assign labels based on keyword likelihood (1 = above the threshold, 0 = otherwise)
ai_ml_jobs['label'] = (
    ai_ml_jobs['keyword_likelihood'] > KEYWORD_LIKELIHOOD_THRESHOLD
).astype(int)

# NOTE: the former mid-cell `ai_ml_jobs.sample(5)` was removed; a bare expression
# that is not the last statement of a cell is evaluated and silently discarded.

# Display count of labels
print(ai_ml_jobs['label'].value_counts())

label
0    46966
1    15655
Name: count, dtype: int64


In [59]:
# Preview five random postings, including the derived columns.
ai_ml_jobs.sample(n=5)

Unnamed: 0,last_processed_time,got_summary,got_ner,is_being_worked,job_title,company,job_location,first_seen,search_city,search_country,search_position,job_level,job_type,job_skills,skills_count,keyword_count,keyword_likelihood,job_description,label
94833,2024-01-19 17:36:09.004735+00,t,t,f,block advisor accelerated tax associate,h&r block,"amarillo, tx",2024-01-14,amarillo,united states,consultant education,mid senior,onsite,"[tax preparation, client service, h&r block in...",16,2,1,"tax preparation, client service, h&r block inc...",0
15237,2024-01-20 00:44:54.457906+00,t,t,f,programming analyst,akkodis,"halifax, nova scotia, canada",2024-01-14,nova scotia,canada,economist,mid senior,onsite,"[.net, java, javascript, node.js, python, gola...",18,2,1,".net, java, javascript, node.js, python, golan...",0
876543,2024-01-20 02:44:22.969666+00,t,t,f,assistant professor of data science,university of oregon,"eugene, or",2024-01-16,oregon,united states,extension clerk,mid senior,onsite,"[data science, statistics, machine learning, a...",41,5,3,"data science, statistics, machine learning, ar...",0
813922,2024-01-20 04:30:03.675205+00,t,t,f,"lead software engineer, full stack(javascript/...",jobs for humanity,"hartford, ct",2024-01-14,wethersfield,united states,agricultural-research engineer,mid senior,onsite,"[javascript, java, aws, microservices, cloud c...",16,5,3,"javascript, java, aws, microservices, cloud co...",0
945965,2024-01-20 07:29:31.768585+00,t,t,f,product architect - advanced manufacturing - p...,ey,"austin, tx",2024-01-16,austin,united states,lead former,mid senior,onsite,"[product architecture, product requirements do...",26,5,3,"product architecture, product requirements doc...",0


In [None]:

# Sample dataset with job descriptions (replace with actual data)
data = [
    {"job_description": "Machine learning engineer with experience in deep learning, AI, and neural networks.", "label": 1},
    {"job_description": "Software developer skilled in Python and web development.", "label": 0},
    {"job_description": "AI researcher specializing in NLP and deep learning models.", "label": 1},
    {"job_description": "Data analyst with experience in SQL and data visualization.", "label": 0},
    {"job_description": "Computer vision engineer with expertise in object detection and AI.", "label": 1},
    {"job_description": "Frontend developer with skills in JavaScript, React, and UI/UX design.", "label": 0}
]

# Convert data into a Dataset object.
# `data` is a list of row dicts, so it must go through `from_list`;
# `from_dict` expects a single dict mapping column names to lists of values.
dataset = Dataset.from_list(data)

# Split the data into training and test sets.
# Use the Dataset's own splitter so both halves remain `Dataset` objects
# (needed for the later `.map` / `.set_format` calls); the fixed seed keeps
# the split reproducible across kernel restarts.
dataset_splits = dataset.train_test_split(test_size=0.2, seed=42)
train_data = dataset_splits["train"]
test_data = dataset_splits["test"]

# Load the BERT tokenizer and model for sequence classification (binary classification: AI skills or not)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize the job descriptions
def tokenize_function(examples):
    """Tokenize the ``job_description`` entries of ``examples``.

    Sequences are padded to the maximum length and truncated, with the
    maximum length set to 512. The tokenizer's output is returned unchanged.
    """
    descriptions = examples["job_description"]
    return tokenizer(
        descriptions,
        max_length=512,
        truncation=True,
        padding="max_length",
    )

# Tokenize both splits in batches (training split first, then test split).
train_data, test_data = (
    split.map(tokenize_function, batched=True) for split in (train_data, test_data)
)

# Set the format for PyTorch: expose only these columns, as torch tensors.
model_columns = ['input_ids', 'attention_mask', 'label']
for tokenized_split in (train_data, test_data):
    tokenized_split.set_format(type='torch', columns=list(model_columns))

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
)

# Define a compute_metrics function for evaluation
def compute_metrics(p):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        p: Evaluation output exposing ``predictions`` (per-class scores, or a
            tuple whose first element holds them) and ``label_ids``.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is what
        ``accuracy_score`` returns for the labels and the arg-max predictions.
        A dict is returned, not a bare number, so the metric has a name.
    """
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Use the array's own argmax: the scores arrive as a NumPy array, which
    # torch.argmax does not accept.
    preds = scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

# Define the Trainer: model, arguments, both splits and the metric function.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model on the evaluation split and print the results
results = trainer.evaluate()
print(f"Evaluation Results: {results}")

# Test the model on a sample job description
sample_job = "Looking for a data scientist skilled in machine learning and data analysis."
inputs = tokenizer(sample_job, return_tensors="pt", padding=True, truncation=True, max_length=512)
# Training may have moved the model to an accelerator; the inputs must be on
# the same device as the model or the forward pass raises a device mismatch.
inputs = inputs.to(model.device)

# Inference only: switch off dropout and skip gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
prediction = torch.argmax(outputs.logits, dim=-1)

# Output the prediction
print(f"Predicted label for sample job: {'AI skills required' if prediction.item() == 1 else 'No AI skills required'}")
