## Libraries

In [1]:
import json
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

## Config

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate
from sklearn.model_selection import train_test_split, cross_val_score
from datasets import DatasetDict, Dataset, load_dataset
import torch

In [3]:
# Read the project configuration (JSON) located two directories above this notebook
with open('../../config/config.json', 'r') as f:
    config = json.loads(f.read())

# Data location as stored under the "data_loc" key of the config
file_path = config["data_loc"]

## Datasets

### Inference batch

In [4]:
# Build the path to the unlabeled test file from the configured data location
file_name = "test_unlabeled.tsv"
final_path = os.path.join("..", file_path, file_name)

# Load the tab-separated file
inference_batch = pd.read_csv(final_path, sep='\t')

# Concatenate title and abstract into the single text field fed to the classifier.
# Missing values are replaced with "" first: otherwise `str + NaN` yields NaN,
# and a NaN entry is not valid text input for the tokenizer downstream.
inference_batch['Title_Abstract'] = (
    inference_batch['Title'].fillna("").astype(str)
    + " "
    + inference_batch['Abstract'].fillna("").astype(str)
)
print(f"The inference batch has {inference_batch.shape[0]} observations and {inference_batch.shape[1]} columns.")
inference_batch.head()

The inference batch has 1097 observations and 5 columns.


Unnamed: 0,PMID,Title,Abstract,Label,Title_Abstract
0,34902587,Detection of porcine circovirus type 3 DNA in ...,Porcine circovirus type 3 (PCV3) is regularly ...,0,Detection of porcine circovirus type 3 DNA in ...
1,35451025,Imputation of non-genotyped F1 dams to improve...,This study investigated using imputed genotype...,0,Imputation of non-genotyped F1 dams to improve...
2,34859764,Proposed multidimensional pain outcome methodo...,Castration of male piglets in the United State...,0,Proposed multidimensional pain outcome methodo...
3,35143972,Nanostructured lipid carriers loaded with an a...,Alopecia is a condition associated with differ...,0,Nanostructured lipid carriers loaded with an a...
4,34872491,Genome-wide expression of the residual lung re...,BACKGROUND: Acute or chronic irreversible resp...,0,Genome-wide expression of the residual lung re...


#### Load Pre-Trained Model

In [5]:
# Checkpoint directory holding the saved model and tokenizer files
model_path = "../model_training/checkpoints/checkpoint-8118"

# Load the tokenizer from the checkpoint directory
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the sequence-classification model with a two-label (binary) head
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
)

In [9]:
# Make sure the "preds" output folder exists; no error is raised if it is already there
os.makedirs(name="preds", exist_ok=True)

### Inference Data

In [None]:
# Run inference on CPU and put the model in evaluation mode (disables dropout, etc.)
device = torch.device("cpu")
model = model.to(device)
model.eval()

# Number of texts per forward pass. Processing the whole frame in a single
# batch pads every text to the longest sequence and makes peak memory scale
# with the dataset size; mini-batches keep it bounded.
BATCH_SIZE = 32

# Guard against missing / non-string entries, which the tokenizer cannot process
texts = inference_batch['Title_Abstract'].fillna("").astype(str).tolist()

batch_predictions = []
with torch.no_grad():  # no gradients are needed for inference
    for start in range(0, len(texts), BATCH_SIZE):
        # Tokenize one mini-batch, padding only to the longest text in that batch
        tokenized_inputs = tokenizer(
            texts[start:start + BATCH_SIZE],
            truncation=True,
            padding=True,
            return_tensors="pt"
        )

        # Move tokenized inputs to the model's device
        tokenized_inputs = {key: value.to(device) for key, value in tokenized_inputs.items()}

        outputs = model(**tokenized_inputs)

        # Predicted class = index of the largest logit for each text
        batch_predictions.append(torch.argmax(outputs.logits, dim=1).cpu())

# Stitch the per-batch results back together in the original row order
if batch_predictions:
    predictions = torch.cat(batch_predictions).numpy()
else:
    # Empty input frame: torch.cat would raise on an empty list
    predictions = np.empty(0, dtype=np.int64)

# Attach predictions to the DataFrame
inference_batch['Label'] = predictions
print(inference_batch.shape)
inference_batch[['Title_Abstract', 'Label']].head()

In [None]:
# Number of rows per value in the 'Label' column
inference_batch.loc[:, 'Label'].value_counts()

In [None]:
# Show the first rows of the text / label columns
inference_batch.loc[:, ['Title_Abstract', 'Label']].head()

In [None]:
# Write the PMID / Label pairs to CSV (without the index column).
# The output folder is created here as well, so this cell does not depend on
# another cell having been run first. The path is a plain string: the original
# f-string had no placeholders.
os.makedirs("preds", exist_ok=True)
inference_batch[['PMID', 'Label']].to_csv("preds/solution_22.csv", index=False)