## Libraries

In [1]:
import json
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

## Config

In [2]:
# Load the project configuration. The encoding is given explicitly so the JSON
# file is decoded the same way regardless of the platform's default encoding.
with open('../config/config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

# Base location of the data files; later cells join file names onto it
file_path = config["data_loc"]

## Datasets

### Inference batch

In [3]:
# Locate the unlabeled inference batch under the configured data location
file_name = "test_unlabeled.tsv"
final_path = os.path.join(file_path, file_name)

# Read the tab-separated file, then report its dimensions
inference_batch = pd.read_csv(final_path, sep='\t')
batch_rows, batch_cols = inference_batch.shape
print(f"The inference batch has {batch_rows} observations and {batch_cols} columns.")
inference_batch.head()

The inference batch has 1097 observations and 4 columns.


Unnamed: 0,PMID,Title,Abstract,Label
0,34902587,Detection of porcine circovirus type 3 DNA in ...,Porcine circovirus type 3 (PCV3) is regularly ...,0
1,35451025,Imputation of non-genotyped F1 dams to improve...,This study investigated using imputed genotype...,0
2,34859764,Proposed multidimensional pain outcome methodo...,Castration of male piglets in the United State...,0
3,35143972,Nanostructured lipid carriers loaded with an a...,Alopecia is a condition associated with differ...,0
4,34872491,Genome-wide expression of the residual lung re...,BACKGROUND: Acute or chronic irreversible resp...,0


### Training Corpus

In [32]:
# Locate the labeled training corpus under the configured data location
file_name = "QTL_text.json"
final_path = os.path.join(file_path, file_name)

# Read the JSON corpus and discard the 'Journal' column in a single chain
df = pd.read_json(final_path).drop(columns=['Journal'])
print(f"Shape of the original dataset: {df.shape}", "\n")
df.head()

Shape of the original dataset: (11278, 4) 



Unnamed: 0,PMID,Title,Abstract,Category
0,17179536,Variance component analysis of quantitative tr...,"In a previous study, QTL for carcass compositi...",1
1,17177700,"Single nucleotide polymorphism identification,...",Pituitary adenylate cyclase-activating polypep...,0
2,17129674,Genetic resistance to Sarcocystis miescheriana...,Clinical and parasitological traits of Sarcocy...,0
3,17121599,Results of a whole-genome quantitative trait l...,A whole-genome quantitative trait locus (QTL) ...,1
4,17057239,Unexpected high polymorphism at the FABP4 gene...,Fatty acid bing protein 4 (FABP4) plays a key ...,0


## Exploratory Modeling

In [33]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate
from sklearn.model_selection import train_test_split, cross_val_score

### Train-Test Split

In [37]:
# Separate the target column from the remaining columns
y = df['Category']
X = df.drop(columns=['Category'])

# Both splits share the same hold-out fraction and random seed
split_options = {"test_size": .2, "random_state": 42}

# First split: hold out a test set, stratified on the target
X_train_corpus, X_test, y_train_corpus, y_test = train_test_split(
    X, y, stratify=y, **split_options
)

# Second split: carve a validation set out of the remaining training corpus,
# again stratified on the target
X_train, X_val, y_train, y_val = train_test_split(
    X_train_corpus, y_train_corpus, stratify=y_train_corpus, **split_options
)

#### Load Pre-Trained Model

In [38]:
# Identifier of the pre-trained model handed to `from_pretrained`
model_path = "google-bert/bert-base-uncased"

# Load the tokenizer that belongs to the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Label mappings for the classification head; label2id is the inverse of id2label
id2label = {0: "Not Related", 1: "Related"}
label2id = {label: idx for idx, label in id2label.items()}

# Load the model with a sequence-classification head sized to the label set
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Set Trainable Parameters

In [39]:
# Single pass over the base model: a parameter stays trainable only when its
# name contains "pooler"; every other base-model parameter is frozen.
# Parameters that are not part of `model.base_model` are not touched here.
for param_name, parameter in model.base_model.named_parameters():
    parameter.requires_grad = "pooler" in param_name

#### Data Pre-Processing

In [23]:
from datasets import DatasetDict, Dataset, load_dataset

In [41]:
def _title_split(features, targets):
    """Pair the 'Title' column of `features` with `targets` as text/labels."""
    return {"text": features['Title'], "labels": targets}


# Training split
train_data = _title_split(X_train, y_train)
train_dataset = Dataset.from_dict(train_data)

# Validation split
val_data = _title_split(X_val, y_val)
val_dataset = Dataset.from_dict(val_data)

# Test split
test_data = _title_split(X_test, y_test)
test_dataset = Dataset.from_dict(test_data)

# Bundle the three splits into one container keyed by split name
dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 7217
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 1805
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 2256
    })
})

In [42]:
# Tokenization step applied to every dataset split
def preprocess_function(examples):
    """Tokenize the 'text' field of a batch with truncation enabled.

    `examples` is indexed with the key 'text'; in this notebook that field is
    filled from the 'Title' column. The tokenizer output is returned unchanged.
    """
    texts = examples['text']
    # truncation=True lets the tokenizer cut over-long inputs
    return tokenizer(texts, truncation=True)

# Collator built on the same tokenizer, used to give samples a uniform length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split of the dataset dictionary in batched mode
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/7217 [00:00<?, ? examples/s]

Map:   0%|          | 0/1805 [00:00<?, ? examples/s]

Map:   0%|          | 0/2256 [00:00<?, ? examples/s]

### Define Evaluation Metrics 

In [43]:
# Load the metric objects that compute_metrics relies on.
# NOTE(review): `config="macro"` presumably aims at macro-averaged F1, but
# `f1_score.compute(...)` is later called without an `average=` argument.
# TODO confirm that macro averaging is actually in effect.
f1_score = evaluate.load("f1", config="macro")
# ROC AUC metric; compute_metrics passes it the positive-class probabilities.
auc_score = evaluate.load("roc_auc")

In [44]:
def _softmax(logits):
    """Row-wise softmax over the last axis that stays finite for large logits."""
    logits = np.asarray(logits)
    # Subtracting the per-row maximum does not change the softmax value but
    # keeps np.exp from overflowing to inf, which would turn into NaN after
    # the division.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)


def compute_metrics(eval_pred):
    """Compute F1 and ROC AUC from an evaluation prediction pair.

    Parameters
    ----------
    eval_pred : tuple
        A ``(predictions, labels)`` pair. ``predictions`` holds one row of
        logits per sample (column 1 is read as the positive class) and
        ``labels`` holds the reference class ids.

    Returns
    -------
    dict
        ``{"F1": <rounded to 4 decimals>, "AUC": <rounded to 3 decimals>}``.
    """
    # Unpack logits and reference labels
    predictions, labels = eval_pred

    # Convert logits to probabilities with a numerically stable softmax
    probabilities = _softmax(predictions)

    # ROC AUC is scored on the probability of the positive class
    positive_class_probs = probabilities[:, 1]
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'], 3)

    # Hard predictions: the class with the highest logit
    predicted_classes = np.argmax(predictions, axis=1)

    # F1 on the hard predictions (no `average=` is passed, so the metric's
    # default averaging applies)
    f1 = np.round(f1_score.compute(predictions=predicted_classes, references=labels)['f1'], 4)

    return {"F1": f1, "AUC": auc}

### Training Parameters

In [53]:
# Tunable hyperparameters
lr = 2e-4
batch_size = 8
num_epochs = 10

# Logging, evaluation and checkpoint saving all run on the same schedule
per_epoch = "epoch"

training_args = TrainingArguments(
    output_dir="experiment_outputs",
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy=per_epoch,
    eval_strategy=per_epoch,
    save_strategy=per_epoch,
    load_best_model_at_end=True,
)

### Fine-Tune Model

In [54]:
# Assemble the Trainer from the model, arguments, tokenized splits, collator
# and metric function defined in the cells above.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword makes
# this call emit a deprecation warning on recent transformers releases.
# NOTE(review): requires a transformers version that accepts
# `processing_class`; confirm against the pinned version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [55]:
# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start a fresh run. Passing `resume_from_checkpoint=True`
# unconditionally errors out when the output directory holds no checkpoint,
# which breaks a clean Restart & Run All.
from transformers.trainer_utils import get_last_checkpoint

checkpoint_dir = training_args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

Epoch,Training Loss,Validation Loss,F1,Auc
6,0.1817,0.161634,0.6349,0.944
7,0.1738,0.161501,0.5938,0.949
8,0.168,0.15994,0.6067,0.95
9,0.1748,0.152787,0.6526,0.949
10,0.1687,0.15258,0.6538,0.95


TrainOutput(global_step=9030, training_loss=0.08670167975779519, metrics={'train_runtime': 1992.2586, 'train_samples_per_second': 36.225, 'train_steps_per_second': 4.533, 'total_flos': 1517765886730980.0, 'train_loss': 0.08670167975779519, 'epoch': 10.0})

### Test Data

In [None]:
# Run the trainer's model on the test split
predictions = trainer.predict(tokenized_data["test"])

# Pull the logits and reference labels out of the prediction output
logits, labels = predictions.predictions, predictions.label_ids

# Score the test predictions with the metric function used during training
metrics = compute_metrics((logits, labels))
print(metrics)