# Forecasting AI and ML Job Trends with SARIMA

At this stage, we handle class imbalance, train several models, and evaluate them.

## Dependencies

In [66]:
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm  # Or use tqdm instead of tqdm.notebook

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    roc_curve,
    roc_auc_score,
)
from sklearn.preprocessing import label_binarize

import torch

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate

import warnings

In [67]:
# Suppress FutureWarning messages originating from sklearn modules to keep cell output readable.
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

## Data Loading

In [68]:
# Read the cleaned job-postings table from parquet (path is relative to the working directory).
filename = "data/c_job_postings_dataset_cleaned.parquet"
data = pd.read_parquet(filename)

In [69]:
print(f"{len(data):,} job postings loaded from {filename}")
# Fixed seed so the previewed rows are the same on every Restart & Run All
data.sample(5, random_state=42)

1,296,381 job postings loaded from data/c_job_postings_dataset_cleaned.parquet


Unnamed: 0,text,label
1035590,high net worth tax manager accountant tax cybe...,0
793587,hair stylist high glen barber job humanity cos...,0
763554,assistant manager freight flow contact clerk w...,0
954922,specialty sale executive hybrid sale agent bus...,0
91549,hybrid senior structural engineer transportati...,0


## Handling Class Imbalance

In [70]:
class ImbalanceHandler:
    """Rebalance a text-classification dataset via undersampling followed by SMOTE.

    The text column is vectorized with TF-IDF, the majority class is randomly
    undersampled, and the minority class is then oversampled with SMOTE.

    Args:
        data: DataFrame holding the raw text and its labels.
        text_column: Name of the column containing the text.
        label_column: Name of the column containing the class labels.
    """

    def __init__(self, data, text_column, label_column):
        self.data = data
        self.text_column = text_column
        self.label_column = label_column

        # Turn raw text into TF-IDF features (vocabulary capped at 5,000 terms)
        self.vectorizer = TfidfVectorizer(max_features=5000)
        # Randomly drop majority-class rows until minority/majority reaches 0.35
        self.undersample = RandomUnderSampler(sampling_strategy=0.35, random_state=42)
        # Oversample minority class using SMOTE
        self.smote = SMOTE(random_state=42)
        self.pipeline = Pipeline(
            [
                ("tfidf", self.vectorizer),
                ("undersample", self.undersample),
                ("smote", self.smote),
            ]
        )

    def handle_imbalance(self):
        """Run the resampling pipeline.

        Returns:
            A tuple ``(X_res, y_res, resampled_data)`` where ``X_res`` / ``y_res``
            are the resampled TF-IDF features and labels produced by the
            pipeline, and ``resampled_data`` is a DataFrame with the original
            text and label of every row kept by the undersampling step.

            ``resampled_data`` deliberately excludes SMOTE output: synthetic
            samples only exist in TF-IDF feature space and have no source text,
            so it is undersampled but not fully balanced.
        """
        # Extract features and labels
        X = self.data[self.text_column]
        y = self.data[self.label_column]

        # Apply the pipeline
        print("Handling imbalance...")
        X_res, y_res = self.pipeline.fit_resample(X, y)

        # Recover real text/label pairs through the positions the undersampler
        # selected. The index of y_res is a fresh 0..n-1 range over the
        # resampled output, so it cannot be used to look rows up in self.data.
        print("Creating resampled dataset...")
        kept_rows = self.pipeline.named_steps["undersample"].sample_indices_
        resampled_data = (
            self.data.iloc[kept_rows][[self.text_column, self.label_column]]
            .reset_index(drop=True)
        )

        print('Completed handling class imbalance')

        return X_res, y_res, resampled_data

In [71]:
def plot_class_distribution(cleaned_data, y_resampled):
    """Draw side-by-side bar charts of class counts before and after resampling.

    Args:
        cleaned_data: DataFrame whose 'label' column holds the original labels.
        y_resampled: Labels after resampling (anything accepted by pd.Series).
    """
    _, (before_ax, after_ax) = plt.subplots(1, 2, figsize=(16, 5))

    # One entry per panel: target axes, class counts, bar colour, title
    panels = (
        (
            before_ax,
            cleaned_data['label'].value_counts(),
            'skyblue',
            'Class Distribution Before Handling Imbalance',
        ),
        (
            after_ax,
            pd.Series(y_resampled).value_counts(),
            'salmon',
            'Class Distribution After Handling Imbalance',
        ),
    )

    for panel_ax, class_counts, bar_colour, panel_title in panels:
        class_counts.plot(kind='bar', ax=panel_ax, color=bar_colour)
        panel_ax.set_title(panel_title)
        panel_ax.set_xlabel('Class')
        panel_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
# Resample the dataset; the handler returns the resampled features, the
# resampled labels, and a text/label DataFrame used by the transformer section.
imbalanceHandler = ImbalanceHandler(data, 'text', 'label')
X_res, y_res, resampled_data = imbalanceHandler.handle_imbalance()


# Check the distribution of the new dataset
print(f"Original dataset shape: {Counter(data['label'])}")
print(f"Resampled dataset shape: {Counter(y_res)}")

# Bar charts of the class counts before and after resampling
plot_class_distribution(data, y_res)

Handling imbalance...


Resampling:   0%|          | 0/1 [00:00<?, ?it/s]

## Label Encoder

In [None]:
# Fit the encoder on the resampled labels and encode them in a single step
le = preprocessing.LabelEncoder()
y = le.fit_transform(y_res)

print(f"Classes: {le.classes_}")

## Random Forest Classifier

Split the resampled data

In [None]:
# Hold out 25% of the resampled TF-IDF features for testing (fixed seed for reproducibility).
# NOTE(review): X_res / y_res come from the resampling pipeline above, so the split is made
# after SMOTE and the test portion can contain synthetic samples — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.25, random_state=42
)

Train the model

In [None]:
# Random forest with library-default hyperparameters; only the seed is fixed.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

Use the model to make predictions

In [None]:
# Hard class predictions for the classification report / confusion matrix
y_pred = rf_model.predict(X_test)
# Second column of predict_proba, used as the score for the ROC curve
# (presumably the probability of label 1 — verify against rf_model.classes_)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

Evaluate model performance

In [None]:
def evaluate_model(y_true, y_pred, y_pred_proba, le):
    """Print a classification report and plot a confusion matrix beside a ROC curve.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels.
        y_pred_proba: Scores passed to roc_curve / roc_auc_score.
        le: Fitted label encoder; its ``classes_`` label the confusion matrix.
    """
    print(classification_report(y_true, y_pred))

    matrix_display = ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true, y_pred),
        display_labels=le.classes_,
    )

    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_pred_proba)
    auc_value = roc_auc_score(y_true, y_pred_proba)

    _, (cm_ax, roc_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: confusion matrix
    matrix_display.plot(cmap='Blues', xticks_rotation=45, ax=cm_ax)
    cm_ax.set_title('Confusion Matrix')

    # Right panel: ROC curve with the chance diagonal for reference
    roc_ax.plot(
        false_pos_rate,
        true_pos_rate,
        color='darkorange',
        lw=2,
        label=f'ROC curve (area = {auc_value:.2f})',
    )
    roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_ax.set_xlim([0.0, 1.0])
    roc_ax.set_ylim([0.0, 1.05])
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
    roc_ax.legend(loc='lower right')

    plt.tight_layout()
    plt.show()

# Report the random forest's metrics on the held-out split
evaluate_model(y_test, y_pred, y_pred_proba, le)

**Highlights:**
- At 98% accuracy, the model performs very well overall.
- Balanced performance: precision, recall, and F1-scores are all high and similar across both classes.
- Strong handling of class distribution: although the two classes have similar but not identical support, the model handles both effectively.
- The AUC is approximately 1.00, indicating excellent performance.
- The ROC curve's shape and position near the top-left corner further emphasize the model's strong predictive power.
- Caveat: these scores are measured on a split of the resampled data (undersampled and SMOTE-augmented), so they may overstate performance on the original, imbalanced job postings.

## Pretrained Transformer Model

In [None]:
# Checkpoint identifier passed to the tokenizer and model `from_pretrained` calls below
model_name = "prajjwal1/bert-tiny"
# The model is explicitly placed on the CPU
device = torch.device("cpu")

#### Dataset Preparation

Label handling

In [None]:
# Fit a dedicated encoder for the transformer dataset and encode with that same
# encoder (previously the random-forest encoder `le` was applied here, leaving
# `le_t` fitted but unused for the transformation).
le_t = preprocessing.LabelEncoder()
resampled_data['label'] = le_t.fit_transform(resampled_data['label'].tolist())

Convert to Huggingface Dataset

In [None]:
# 75/25 split of the text/label frame with a fixed seed
df_train, df_test = train_test_split(resampled_data, test_size=0.25, random_state=42)

# Wrap each split as a Hugging Face Dataset formatted for torch
train_dataset = Dataset.from_pandas(df_train).with_format("torch")
test_dataset = Dataset.from_pandas(df_test).with_format("torch")

In [None]:
# Sanity check: display the shape of each split
train_dataset.shape, test_dataset.shape

#### Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    """Tokenize a batch of examples, truncating each text to at most 512 tokens.

    Padding is intentionally not applied here: the Trainer is given a
    DataCollatorWithPadding, so sequences are padded per batch instead of every
    example being stored padded to the full 512 tokens.

    Args:
        examples: Batch mapping with a "text" entry holding the raw strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Initialize Model

In [None]:
# Load the checkpoint with a two-label sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Place the model on the device chosen above
model.to(device)
print(f"Training model on {device}")

#### Model Training

In [None]:
# Collator that pads batches using the tokenizer's padding settings
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    The predicted class of each example is the index of its largest logit.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

In [None]:
# Training configuration: evaluation, logging and checkpointing all happen once
# per epoch, and checkpoints are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=2,  # Reduce batch size to fit in memory
    per_device_eval_batch_size=2,
    num_train_epochs=3,  # Adjust epochs for faster completion
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    disable_tqdm=False,  # Enable progress bar
    gradient_accumulation_steps=4,  # Simulate larger batch size
    dataloader_num_workers=2  # Optimize dataloader performance
)

# Trainer wired to the tokenized splits, the padding collator and the accuracy metric
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# Release cached MPS memory before training, but only when the MPS backend is
# actually available; the notebook otherwise targets the CPU.
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

trainer.train()
trainer.save_model("ai_ml_job_classiffier")

### Model Evaluation

In [None]:
# Score the fine-tuned model on the training split
preds = trainer.predict(tokenized_train)
# Predicted class = index of the largest logit per row
preds = np.argmax(preds.predictions, axis=1)
GT = df_train['label'].tolist()

print("Training metrics")
print(classification_report(GT, preds))

In [None]:
# Score the fine-tuned model on the held-out split
preds = trainer.predict(tokenized_test)
logits = preds.predictions
y_pred = np.argmax(logits, axis=1)
y_true = df_test['label'].tolist()

# evaluate_model feeds its third argument to roc_curve / roc_auc_score, which
# need one score per sample rather than the whole prediction output. Convert
# the logits to probabilities and keep the column for class index 1.
y_score = torch.softmax(torch.as_tensor(logits), dim=-1)[:, 1].numpy()

evaluate_model(y_true, y_pred, y_score, le_t)