In [None]:
!pip install llm-rs --quiet
!pip install langchain --quiet
!pip install llm-rs[langchain] --quiet
!pip install datasets
!pip install peft --upgrade --quiet
!pip install torchmetrics
!pip install evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/7.9 MB[0m [31m11.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m4.4/7.9 MB[0m [31m63.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.9/7.9 MB[0m [31m77.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.9/7.9 MB[0m [31m77.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/376.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m376.2/376.2 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hColl

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import torch
from torch import nn
import numpy as np
import re
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, LlamaForSequenceClassification
from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchmetrics import Accuracy

In [None]:
import pandas as pd

def process_dataset(dataset_file_path):
    """
    Function to process the dataset, filter valid abstracts, and calculate average length.

    Args:
        dataset_file_path (str): The file path for the dataset.

    Returns:
        all_input (list): List of valid abstracts from the dataset.
        avg_length (float): The average length of the abstracts in the list.
    """
    # Read dataset
    try:
        df = pd.read_csv(dataset_file_path) if dataset_file_path.endswith('.csv') else pd.read_json(dataset_file_path, orient='records', lines=True)
    except Exception as e:
        print(f"Error reading file {dataset_file_path}: {e}")
        return [], 0

    # Extract abstracts from dataset
    all_input = []
    if 'abstract' in df.columns:
        all_input = df['abstract'].tolist()
    elif 'Abstract' in df.columns:
        all_input = df['Abstract'].tolist()

    print(f"Total abstracts: {len(all_input)}")

    # Filter out non-string elements
    all_input = [x for x in all_input if isinstance(x, str)]

    # Calculate average length of abstracts
    avg_length = sum(map(len, all_input))/float(len(all_input)) if all_input else 0

    print(f"The average length of abstracts is: {avg_length}")
    print(f"Valid abstracts count: {len(all_input)}")

    return all_input, avg_length

# Example usage:
arxiv_file_path = "/content/arXiv_2023.csv"
dblp_file_path = "C:/Users/romeo/ECAI'25/datasets/DBLP_2023.csv"
elsevier_file_path = "C:/Users/romeo/ECAI'25/datasets/elsevier.json"
pubmed_file_path = "C:/Users/romeo/ECAI'25/datasets/pubmed_2023.csv"
all_input, avg_length = process_dataset(arxiv_file_path)


Total abstracts: 198343
The average length of abstracts is: 1101.7295291489995
Valid abstracts count: 198343


In [None]:
all_input = all_input[:35000] # slicing data for arXiv, DBLP, and PubMed
print(len(all_input))

35000


In [None]:
import nltk
import re
from textblob import Word
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from string import punctuation # Import for singularization

# Ensure necessary NLTK resources are downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')


lm = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))
remove_limit = 5


def clean_str(string):
    """
    Clean and normalize a given string by removing unwanted characters,
    normalizing contractions, removing stopwords, abbreviations, and applying lemmatization and singularization.
    """
    if not isinstance(string, str):
        return ""
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    custom_stop_words = {"paper", "research", "study", "approach", "propose"}  # Custom words to remove
    abbreviations = {"e.g.": "for example", "i.e.": "that is", "etc.": "and so on", "vs.": "versus", "approx.": "approximately"}  # Abbreviations to remove

    # Replace abbreviations
    for abbr, replacement in abbreviations.items():
        string = string.replace(abbr, replacement)
    # Remove URLs
    string = re.sub(r'http\S+|www\S+', '', string)
    # Remove digits
    string = re.sub(r'[0-9]+', '', string)
    # Remove hyphens and replace with space
    string = re.sub(r'-', ' ', string)
    # Remove punctuation
    string = re.sub(r'[^A-Za-z\s]', '', string)
    # Normalize contractions
    string = re.sub(r"\'s", " is", string)
    string = re.sub(r"\'ve", " have", string)
    string = re.sub(r"n\'t", " not", string)
    string = re.sub(r"\'re", " are", string)
    string = re.sub(r"\'d", " would", string)
    string = re.sub(r"\'ll", " will", string)
    # Remove extra spaces
    string = re.sub(r'\s{2,}', ' ', string)
    # Tokenize text
    words = nltk.word_tokenize(string)
    # Remove stopwords and punctuation
    words = [word.lower() for word in words if word.lower() not in stop_words and word.lower() not in custom_stop_words and word.lower() not in punctuation]
    # Part-of-Speech tagging and filtering nouns
    tagged_list = nltk.pos_tag(words)
    nouns_list = [t[0] for t in tagged_list if t[-1] == 'NN']
    # Singularize and lemmatize words
    words = [Word(word).singularize() for word in nouns_list]
    words = [lemmatizer.lemmatize(word, pos='n') for word in words]

    return " ".join(words).strip().lower()


# print()
# print("-" * 70)
# all_abstracts = []
# for abstract in all_input:
#     cleaned_abstract = clean_str(abstract)
#     all_abstracts.append(cleaned_abstract)

# all_input = all_abstracts
# print(len(all_input))

print()
print("-" * 70)
all_abstracts = []
total_word_count = 0
for abstract in all_input:
    cleaned_abstract = clean_str(abstract)
    all_abstracts.append(cleaned_abstract)
    # Count total words in cleaned abstract
    total_word_count += len(cleaned_abstract.split())

all_input = all_abstracts
print(f"Total number of words in all abstracts: {total_word_count}")
print(f"Number of valid abstracts: {len(all_input)}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt


----------------------------------------------------------------------
Total number of words in all abstracts: 1217714
Number of valid abstracts: 35000


In [None]:
arxiv_top_words = ['model', 'system', 'state', 'method', 'field', 'energy', 'space', 'information', 'network',
                   'work', 'analysis', 'quantum', 'function', 'structure', 'framework', 'graph', 'distribution', 'image', 'control']

dblp_top_words = ['model', 'network', 'system', 'information', 'algorithm', 'analysis', 'image', 'control',  'framework', 'graph', 'detection',
                  'accuracy', 'art','optimization', 'design', 'function', 'classification', 'task', 'formula', 'space']

elseveir_top_words = ["analysis", "model", "energy", "cell", "water", "system", "development", "effect", "treatment",
                      "health", "activity", "risk", "group", "age", "information", "response", "production", "expression", "disease", "surface"]


pubmed_top_words= ["treatment", "cell", "cancer", "analysis", "health", "disease", "model", "effect", "development", "expression",
                   "activity","system", "role", "method", "response","gene", "protein", "tumor","care","growth"]


top_n_words = [arxiv_top_words, dblp_top_words, elseveir_top_words, pubmed_top_words]
top_n_words_index = 0


In [None]:
from sklearn.model_selection import train_test_split

def generate_and_split_labels(all_input, top_words, top_k=0, test_size=0.2, random_state=42):

    doc_labels = []

    for doc in all_input:
        label = [1 if term in doc else 0 for term in top_words[:top_k]]
        doc_labels.append(label)

    train_labels, test_labels = train_test_split(doc_labels, test_size=test_size, random_state=random_state)

    print(f"Total documents: {len(doc_labels)}")
    print(f"Train labels: {len(train_labels)}, Test labels: {len(test_labels)}")
    print("Sample labels:", doc_labels[:3])

    return train_labels, test_labels, doc_labels

top_five = 5 # or any other value you'd like
top_ten = 10
top_fifteen = 15

train_labels_5, test_labels_5, doc_labels_5 = generate_and_split_labels(all_abstracts, top_n_words[top_n_words_index], top_k=top_five)
train_labels_10, test_labels_10, doc_labels_10 = generate_and_split_labels(all_abstracts, top_n_words[top_n_words_index], top_k=top_ten)
train_labels_15, test_labels_15, doc_labels = generate_and_split_labels(all_input, top_n_words[top_n_words_index], top_k=top_fifteen)

Total documents: 35000
Train labels: 28000, Test labels: 7000
Sample labels: [[0, 0, 1, 0, 0], [0, 0, 0, 0, 0], [1, 1, 0, 0, 0]]
Total documents: 35000
Train labels: 28000, Test labels: 7000
Sample labels: [[0, 0, 1, 0, 0, 0, 1, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]
Total documents: 35000
Train labels: 28000, Test labels: 7000
Sample labels: [[0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]


In [None]:
import random

random.seed(42)
random.shuffle(all_abstracts)
#random.shuffle(all_keywords)


train_size = int(0.8 * len(all_abstracts))
train_input = all_abstracts[:train_size]
test_input = all_abstracts[train_size:]

print(len(train_input))
print(len(test_input))

28000
7000


In [None]:
model_path = "openlm-research/open_llama_3b"  # Replace with the correct model identifier
num_labels = 5

model = LlamaForSequenceClassification.from_pretrained(model_path, num_labels=num_labels, torch_dtype=torch.float16)#, device_map='cuda:0')

# tokenizer = AutoTokenizer.from_pretrained(
#     model_path,
#     use_fast=False,
#     trust_remote_code=True,
#     padding_side="left",
#     pad_token="<|endoftext|>"
# )

# model.resize_token_embeddings(len(tokenizer))  # https://github.com/huggingface/transformers/issues/1805

# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, add_prefix_space=True)

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1


Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at openlm-research/open_llama_3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/593 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/534k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/330 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

In [None]:
#Import the functions from peft again:
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"], # https://github.com/huggingface/peft/blob/632997d1fb776c3cf05d8c2537ac9a98a7ce9435/src/peft/utils/other.py#L202
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, config)
print(lora_model)

PeftModel(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 3200, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=3200, out_features=3200, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3200, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3200, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_pr

In [None]:
# create tokenize function
def tokenize_function(examples):
    # extract text
    text = examples["sentence"]

    #tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )

    return tokenized_inputs

In [None]:
from datasets import Dataset

# Convert all_input list to a datasets.Dataset object
tokenized_dataset = Dataset.from_dict({"sentence": all_abstracts}).map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Dataset({
    features: ['sentence', 'input_ids', 'attention_mask'],
    num_rows: 35000
})

In [None]:

import evaluate  # Add this line to import the evaluate module

# import accuracy evaluation metric
accuracy = evaluate.load("accuracy")
f1_score = evaluate.load("f1")

# define an evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels),
            "f1" : f1_score.compute(predictions=predictions, references=labels, average='weighted')}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [None]:
model = get_peft_model(model, config) # Replace peft_config with config
model.print_trainable_parameters()

trainable params: 5,324,800 || all params: 3,329,414,400 || trainable%: 0.1599


In [None]:
import torch
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Assuming tokenized_dataset, train_labels_5, test_labels_5, model, tokenizer, and compute_metrics are already defined

# # Split the dataset into train and validation sets
train_testvalid = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Add the labels to the train and validation datasets
train_labels_5_single = [sublist.index(1) if 1 in sublist else 0 for sublist in train_labels_5]
test_labels_5_single = [sublist.index(1) if 1 in sublist else 0 for sublist in test_labels_5]
print(train_labels_5_single[:5])
print(test_labels_5_single[:5])


train_testvalid["train"] = train_testvalid["train"].add_column("labels", train_labels_5_single)
train_testvalid["test"] = train_testvalid["test"].add_column("labels", test_labels_5_single)


# Define the compute_metrics function to evaluate performance
def compute_metrics(p):
    predictions, labels = p
    # Apply sigmoid activation to get probabilities
    probs = torch.sigmoid(predictions)
    # Threshold probabilities to get predicted labels (0 or 1)
    preds = (probs > 0.5).int()
    # Calculate accuracy and F1 score using the predicted and true labels
    acc = accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
    f1 = f1_score(labels.cpu().numpy(), preds.cpu().numpy(), average="weighted")
    return {"accuracy": acc, "f1": f1}


# Define the training arguments
training_args = TrainingArguments(
    output_dir="openlm-research/open_llama_3b" + "-lora-rpc-classification",  # Directory to store results
    eval_strategy="epoch",  # Evaluate after each epoch
    logging_dir="./logs",  # Directory to store logs
    logging_steps=10,  # Print results every 10 steps
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=16,  # Batch size for evaluation
    num_train_epochs=3,  # Number of training epochs
    weight_decay=0.01,  # Weight decay for regularization
    save_strategy="epoch",  # Save the model after each epoch
    load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="accuracy" #Specify the metric
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_testvalid["train"], # Use 'train' split from train_testvalid
    eval_dataset=train_testvalid["test"], # Use 'test' split from train_testvalid
    tokenizer=tokenizer,
    data_collator=data_collator, # Pass the data_collator to the Trainer
    compute_metrics=compute_metrics,
)

# Replace compute_loss function in Trainer
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): # Add num_items_in_batch
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Use CrossEntropyLoss for multi-class classification
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels) # No need to convert labels to float
        return (loss, outputs) if return_outputs else loss


# Initialize the CustomTrainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_testvalid["train"], # Use 'train' split from train_testvalid
    eval_dataset=train_testvalid["test"], # Use 'test' split from train_testvalid
    tokenizer=tokenizer,
    data_collator=data_collator, # Pass the data_collator to the Trainer
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Initialize lists to store accuracy and F1 scores
acc_scores = []
f1_scores = []

# Define a custom callback function to calculate the std during training
def compute_std(epoch_results, metric_name):
    """Compute standard deviation for accuracy or F1 score."""
    return np.std(epoch_results[metric_name])

# After training, collect evaluation results
eval_results = trainer.evaluate()

# Print the results for accuracy and F1
print("Evaluation results after training:")
print(eval_results)

# Assuming results contain accuracy and f1 for each epoch, calculate std
acc_std = compute_std(acc_scores, 'accuracy')
f1_std = compute_std(f1_scores, 'f1')

print(f"Standard Deviation of Accuracy: {acc_std}")
print(f"Standard Deviation of F1 Score: {f1_std}")

[0, 0, 3, 0, 0]
[0, 0, 1, 2, 2]


Flattening the indices:   0%|          | 0/28000 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/7000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss


In [None]:
API_key = "20a6dd179a6d8e6edfbac3fe7a6ce7001c0b8c96" #(RPC-2 model)