In [1]:

# Turn off Pretty Print, enable inline plotting with Matplotlib, add py folder to system path, import jobpostlib modules
%pprint
%matplotlib inline
import sys
if ('../py' not in sys.path): 
    sys.path.insert(1, '../py')
from jobpostlib import (
    crf, cu, datetime, duration, hau, hc, humanize, ihu, lru, nu, osp, 
    scrfcu, slrcu, ssgdcu, su, t0, time, wsu, speech_engine
)
from huggingface_hub import login
from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (
    AlbertForSequenceClassification,
    AlbertTokenizer,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    Trainer,
    TrainingArguments
)
import fasttext
import fasttext.util
import numpy as np
import torch

Pretty printing has been turned OFF
Utility libraries created in 5 seconds


In [2]:

# Build the lru's is-qualified classifier elements only when they are missing
t1 = time.time()
required_attrs = ('ISQUALIFIED_LR', 'ISQUALIFIED_CV')
if not all(hasattr(lru, attr_name) for attr_name in required_attrs):
    lru.build_isqualified_logistic_regression_elements(verbose=True, sampling_strategy_limit=None)

# Report how long the check (and any build) took, rounded to whole seconds
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-qualified LR elements built in {duration_str}'
print(speech_str)

I have 532,546 is-qualified vocabulary tokens in here
Is-qualified LR elements built in 5 seconds



---
# Training

In [3]:

# Re-run this cell whenever the qualification dictionary has been edited,
# either further down in this notebook or from another notebook
t1 = time.time()

# Adjust the sampling strategy limit to keep the total retraining time under two minutes
lru.sync_basic_quals_dict(verbose=False, sampling_strategy_limit=None)
lru.retrain_isqualified_classifier(verbose=True)

# Report the elapsed wall-clock time, rounded to whole seconds
elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
speech_str = f'Is-qualified classifer retrained in {duration_str}'
print(speech_str)

I have 19,430 hand-labeled qualification strings in here
I have 543,710 is-qualified vocabulary tokens in here
Is-qualified classifer retrained in 9 seconds



----

In [4]:

# Read the Hugging Face write token from the secrets file (never hard-code it here)
huggingface_secrets = wsu.secrets_json['huggingface']
write_access_token = huggingface_secrets['write_access_token']

# Authenticate with the Hub and store the token in the git credential helper
login(token=write_access_token, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (manager).
Your token has been saved to C:\Users\daveb\.cache\huggingface\token
Login successful


In [5]:

# Split data into features and target.
# The sample is seeded (and capped at the number of available rows) so that
# every benchmark in this notebook sees the same rows after a kernel restart.
sample_size = min(1_000, lru.basic_quals_df.shape[0])
df = lru.basic_quals_df.sample(sample_size, random_state=42)

# The shape of the Bag-of-words count vector here
# should be n html strings * m unique tokens
sents_list = df.qualification_str.tolist()

# Re-transform the bag-of-words and tf-idf from the new manual scores
# NOTE(review): fit_transform refits lru.ISQUALIFIED_CV / lru.ISQUALIFIED_TT in place
# on this sample only -- confirm nothing else relies on their previous fit, or
# re-run the retraining cell above after this benchmark
bow_matrix = lru.ISQUALIFIED_CV.fit_transform(sents_list)

# Tf-idf must get from Bag-of-words first
tfidf_matrix = lru.ISQUALIFIED_TT.fit_transform(bow_matrix).toarray()

# Integer class labels (1 = qualified, 0 = not qualified)
y = df.is_qualified.to_numpy().astype(int)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

In [6]:

# Define models; every estimator that draws random numbers is seeded with the
# same value as the train/test split so the recall column is reproducible
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Evaluate models: time fitting and prediction separately, then score recall
# of the positive class on the held-out split
results = []
for model_name, model in models.items():
    start_train = time.time()
    model.fit(X_train, y_train)
    end_train = time.time()

    start_infer = time.time()
    y_pred = model.predict(X_test)
    end_infer = time.time()

    recall = recall_score(y_test, y_pred)

    results.append({
        "Model": model_name,
        "Training Time (s)": end_train - start_train,
        "Inference Time (s)": end_infer - start_infer,
        "Recall": recall
    })

# Display results as one table, one row per model
display(DataFrame(results))

Unnamed: 0,Model,Training Time (s),Inference Time (s),Recall
0,Logistic Regression,1.826463,0.014592,1.0
1,Naive Bayes,0.57728,0.246381,0.701613
2,Decision Tree,5.573913,0.028932,0.693548
3,Random Forest,10.674767,0.052678,0.967742
4,Gradient Boosting,139.224958,0.080412,0.862903


In [6]:

# Torch dataset that tokenizes qualification strings on demand for Hugging Face models
class CustomDataset(Dataset):
    """Wrap a data frame of labeled qualification strings as a torch Dataset.

    Each item is tokenized lazily with the supplied tokenizer and returned as a
    dict holding 'input_ids', 'attention_mask' and 'labels'.
    """

    def __init__(self, data, tokenizer, max_length):
        """Store the frame, the tokenizer callable and the padded sequence length."""
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at positional `index` and return its tensors."""
        row = self.data.iloc[index]
        text, label = row['qualification_str'], row['is_qualified']

        # Pad/truncate to a fixed length so items can be batched together
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        # The tokenizer output carries a leading batch axis; collapse it to 1-D
        item = {key: encoded[key].reshape(-1) for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

In [7]:

# Function to evaluate a Hugging Face model (DistilBERT, ALBERT)
def evaluate_transformer_model(
    model, tokenizer, train_df, test_df, batch_size=8, max_length=128,
    num_epochs=1, learning_rate=2e-5
):
    """Fine-tune a sequence-classification model and time training and inference.

    Parameters
    ----------
    model : a model whose forward pass returns an object with a `.logits` attribute
    tokenizer : tokenizer callable handed to CustomDataset
    train_df, test_df : frames with 'qualification_str' and 'is_qualified' columns
    batch_size : number of examples per batch for both loaders
    max_length : fixed token length every example is padded/truncated to
    num_epochs : passes over the training set (default 1, enough for a speed check)
    learning_rate : AdamW learning rate

    Returns
    -------
    tuple of (train_time, inference_time, recall): wall-clock seconds for the
    training loop, wall-clock seconds for the prediction loop, and the recall
    of the predictions on test_df.
    """
    # Prepare datasets
    train_dataset = CustomDataset(train_df, tokenizer, max_length)
    test_dataset = CustomDataset(test_df, tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    def _wait_for_device():
        # CUDA kernels are queued asynchronously; wait for them so the
        # wall-clock measurements include the work that was actually issued
        if device.type == "cuda":
            torch.cuda.synchronize(device)

    # Training
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss()

    # Measure training time
    _wait_for_device()
    start_train_time = time.time()
    model.train()
    for _ in range(num_epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
    _wait_for_device()
    train_time = time.time() - start_train_time

    # Inference
    model.eval()
    all_preds, all_labels = [], []
    start_inference_time = time.time()
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)

            # The predicted class is the index of the largest logit
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().tolist())

            # Labels are only needed for scoring, so they never leave the CPU
            all_labels.extend(batch['labels'].tolist())
    _wait_for_device()
    inference_time = time.time() - start_inference_time

    recall = recall_score(all_labels, all_preds)
    return train_time, inference_time, recall

In [8]:

# Function to evaluate FastText
def evaluate_fasttext_model(train_df, test_df, train_path='train_fasttext.txt'):
    """Train a supervised fastText classifier and time training and inference.

    Parameters
    ----------
    train_df, test_df : frames with 'qualification_str' and 'is_qualified' columns
    train_path : where the fastText training file is written (it is left on disk)

    Returns
    -------
    tuple of (train_time, inference_time, recall): wall-clock seconds spent in
    fasttext.train_supervised, wall-clock seconds spent predicting test_df, and
    the recall of those predictions.
    """
    label_prefix = '__label__'

    def _one_line(text):
        # fastText reads one example per line and predict() rejects newlines,
        # so collapse every run of whitespace to a single space
        return ' '.join(str(text).split())

    # Save training data in the format fastText expects: "__label__<class> <text>"
    with open(train_path, 'w', encoding='utf-8') as train_file:
        for text, label in zip(train_df['qualification_str'], train_df['is_qualified']):
            train_file.write(f'{label_prefix}{int(label)} {_one_line(text)}\n')

    # Train FastText model
    start_train_time = time.time()
    model = fasttext.train_supervised(train_path)
    train_time = time.time() - start_train_time

    # Clean the test strings up front so only prediction itself is timed
    test_texts = [_one_line(text) for text in test_df['qualification_str']]

    # Inference; predict() returns (labels, probabilities) and the top label comes first
    start_inference_time = time.time()
    predicted_labels = [model.predict(text)[0][0] for text in test_texts]
    inference_time = time.time() - start_inference_time

    # Strip the prefix to recover the integer class instead of guessing from the last character
    preds = [int(label[len(label_prefix):]) for label in predicted_labels]

    recall = recall_score(test_df['is_qualified'].astype(int), preds)
    return train_time, inference_time, recall

In [None]:

# Hold out the 20% of rows that the seeded 80% training sample did not pick
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(index=train_df.index)

# Evaluate DistilBERT
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
distilbert_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
distilbert_metrics = evaluate_transformer_model(distilbert_model, distilbert_tokenizer, train_df, test_df)
distilbert_train_time, distilbert_inference_time, distilbert_recall = distilbert_metrics

# Evaluate ALBERT
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
albert_metrics = evaluate_transformer_model(albert_model, albert_tokenizer, train_df, test_df)
albert_train_time, albert_inference_time, albert_recall = albert_metrics

# Evaluate FastText
fasttext_metrics = evaluate_fasttext_model(train_df, test_df)
fasttext_train_time, fasttext_inference_time, fasttext_recall = fasttext_metrics

# Output the results, one line per model: (training seconds, inference seconds, recall)
print("DistilBERT - Training Time: {:.2f}s, Inference Time: {:.2f}s, Recall: {:.2f}".format(*distilbert_metrics))
print("ALBERT - Training Time: {:.2f}s, Inference Time: {:.2f}s, Recall: {:.2f}".format(*albert_metrics))
print("FastText - Training Time: {:.2f}s, Inference Time: {:.2f}s, Recall: {:.2f}".format(*fasttext_metrics))

In [None]:

# Define the models to evaluate (display name -> Hugging Face Hub repo id)
# NOTE: this rebinds `models`, which an earlier cell uses for the scikit-learn estimators
models = {
    "Gemma": "google/gemma-7b",
    # "mistral-7b" is not a Hub repo id; the base checkpoint lives under the mistralai namespace
    "Mistral": "mistralai/Mistral-7B-v0.1",
    # NOTE(review): confirm this id resolves and that the installed transformers
    # version provides a sequence-classification head for OLMo
    "OLMo": "allenai/olmo-7b"
}

# Prepare the dataset: seeded 80/20 split of the sampled frame
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Function to evaluate a model
def evaluate_model(model_name, model_checkpoint, max_length=128):
    """Fine-tune one Hub checkpoint with the Trainer API and time it.

    Reads the module-level `train_df` and `test_df` frames, which must carry
    'qualification_str' and 'is_qualified' columns.

    Parameters
    ----------
    model_name : display name of the model (not used in the computation)
    model_checkpoint : Hugging Face Hub repo id to load
    max_length : fixed token length every example is padded/truncated to

    Returns
    -------
    tuple of (training_time, inference_time, recall): wall-clock seconds for
    trainer.train(), wall-clock seconds for trainer.predict() on the test set,
    and the recall of those predictions.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

    # Padding needs a pad token, and the classification head needs to know which
    # id is padding; fall back to the end-of-sequence token when none is defined
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Trainer indexes its datasets by integer position and expects each item to
    # be a dict of tensors including 'labels', which is what CustomDataset yields
    train_dataset = CustomDataset(train_df, tokenizer, max_length)
    test_dataset = CustomDataset(test_df, tokenizer, max_length)

    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=lambda p: {"recall": recall_score(p.label_ids, p.predictions.argmax(-1))}
    )

    # Measure training speed
    start_time = time.time()
    trainer.train()
    training_time = time.time() - start_time

    # Measure inference speed
    start_time = time.time()
    predictions = trainer.predict(test_dataset)
    inference_time = time.time() - start_time

    # Measure inference recall; the predicted class is the index of the largest logit
    true_labels = test_df["is_qualified"].to_numpy().astype(int)
    recall = recall_score(true_labels, predictions.predictions.argmax(-1))

    return training_time, inference_time, recall

# Evaluate each model and file its three measurements under its display name
metric_labels = ("Training Time (s)", "Inference Time (s)", "Recall")
results = {}
for model_name, model_checkpoint in models.items():
    training_time, inference_time, recall = evaluate_model(model_name, model_checkpoint)
    results[model_name] = dict(zip(metric_labels, (training_time, inference_time, recall)))

# Display results: a header line per model, indented metrics, then a blank line
for model_name, metrics in results.items():
    report_lines = [f"Model: {model_name}"]
    report_lines.extend(f"  {metric}: {value}" for metric, value in metrics.items())
    print("\n".join(report_lines))
    print()

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
