# Paper flagging

In [1]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/home/david.yang1/.cache/huggingface/'
os.environ['HF_HOME'] = '/home/david.yang1/.cache/huggingface/'

In [2]:
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import defaultdict
import pandas as pd
from datasets import Dataset, DatasetDict
# from huggingface_hub import login
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.tokenize import sent_tokenize
from torch.nn import functional as F
import torch

2024-07-25 09:20:11.870506: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-25 09:20:13.806650: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-25 09:20:13.806686: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-25 09:20:13.815510: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-25 09:20:14.834677: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: A

## Helper functions

### Load training and validation data

In [3]:
# Load BioBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("NeuML/pubmedbert-base-embeddings")

# Tokenize the data
def tokenize_function(df):
    return tokenizer(
        df['text'],
        padding="longest",
        truncation=True,
        max_length = 512
    )



In [4]:
def ds_preparation(df, val_count=0):
    # Balance classes if needed
    df = df.groupby('label').sample(n=min(df['label'].value_counts()), random_state=42)
    
    # Shuffle the dataset
    df = df.sample(frac=1, random_state=42)
    df = df[["text", "label"]]
    
    # Split dataset into test & train
    df_train = df[val_count:]
    df_val = df[:val_count]
    
    tds = Dataset.from_pandas(df_train)
    vds = Dataset.from_pandas(df_val)

    # Apply the tokenizer to the datasets
    tds = tds.map(tokenize_function, batched=True)
    vds = vds.map(tokenize_function, batched=True)
    
    # Set the format of the datasets to include only the required columns
    tds = tds.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])
    vds = vds.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])
    
    # Define DatasetDict
    ds = DatasetDict({
        "train": tds,
        "validation": vds
    })

    return ds

### Model fine tuning


In [5]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [6]:
# Fine tune model
def fine_tune_model(ds, model_init, train=False):
    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy = "steps",
        eval_steps=500,
        num_train_epochs=3,    # number of training epochs
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_ratio=0.01,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    # Create the Trainer and start training
    trainer = Trainer(
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["validation"],
        model_init=model_init,
        compute_metrics=compute_metrics,
    )

    if train:
        trainer.train()

    if ds["validation"]:
        trainer.evaluate()

    return trainer

### Predictions

In [7]:
# Split text into <512 token chunks
def split_text_into_chunks(text, tokenizer, max_tokens=512, overlap_sentences=2):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Initialize variables
    chunks = []
    current_chunk = []
    current_chunk_len = 0

    for i, sentence in enumerate(sentences):
        # Tokenize the sentence using BERT Tokenizer
        tokens = tokenizer.tokenize(sentence)
        token_count = len(tokens)

        # Finalize the current chunk if adding this sentence exceed token limit
        if current_chunk_len + token_count > max_tokens:
            text_chunk = " ".join(current_chunk)
            chunks.append(text_chunk)

            # Create the next chunk with overlap
            overlap_start = max(0, i-overlap_sentences)
            current_chunk = []
            for j in range(overlap_start, i):
                current_chunk.append(sentences[j])
            current_chunk_len = len(current_chunk)

        # Add the current sentence tokens to the chunk
        current_chunk.append(sentence)
        current_chunk_len += token_count

    # Add the last chunk if it has content
    if current_chunk:
        text_chunk = " ".join(current_chunk)
        chunks.append(text_chunk)

    return chunks

In [8]:
# Predict label of dataframe
def prediction_chunks(df, tokenizer, trainer):
    output = pd.DataFrame()
    for i, text in enumerate(df["text"]):
        chunks = split_text_into_chunks(text, tokenizer)
        
        chunks_df = pd.DataFrame(chunks, columns=["text"])
        # chunks_df["label"] = df["label"][i]
        chunks_df["position"] = chunks_df.index
        chunks_df["pmid"] = df.loc[i, "pmid"]
        
        t = Dataset.from_pandas(chunks_df)
        t = t.map(tokenize_function, batched=True)
        ds_t = DatasetDict({
            "test": t
        })

        pred = trainer.predict(ds_t["test"])
        chunks_df["prediction"] = pred.predictions.argmax(-1)

        # convert logit score to torch array
        torch_logits = torch.from_numpy(pred.predictions)

        # get probabilities using softmax from logit score and convert it to numpy array
        probabilities_scores = F.softmax(torch_logits, dim = -1).numpy()

        chunks_df["probability"] = probabilities_scores.max(-1)

        # save into output
        output = pd.concat([output, chunks_df], ignore_index=True)
        
    return output, pred

### Dataset to validate chunk prediction results

In [None]:
# # Load prediction chunks
# pred_chunks_0 = pd.read_csv("../data/pipeline_data/paper_flagging_data/0_chunks_labelled.csv")
# pred_chunks_1 = pd.read_csv("../data/pipeline_data/paper_flagging_data/1_chunks_labelled.csv")
# pred_chunks_2 = pd.read_csv("../data/pipeline_data/paper_flagging_data/2_chunks_labelled.csv")
# pred_chunks_3 = pd.read_csv("../data/pipeline_data/paper_flagging_data/3_chunks_labelled.csv")
# pred_chunks_4 = pd.read_csv("../data/pipeline_data/paper_flagging_data/4_chunks_labelled.csv")
# pred_chunks_5 = pd.read_csv("../data/pipeline_data/paper_flagging_data/5_chunks_labelled.csv")
# pred_chunks_6 = pd.read_csv("../data/pipeline_data/paper_flagging_data/6_chunks_labelled.csv")
# pred_chunks_7 = pd.read_csv("../data/pipeline_data/paper_flagging_data/7_chunks_labelled.csv")
# pred_chunks_8 = pd.read_csv("../data/pipeline_data/paper_flagging_data/8_chunks_labelled.csv")
# pred_chunks_9 = pd.read_csv("../data/pipeline_data/paper_flagging_data/9_chunks_labelled.csv")
# pred_chunks_10 = pd.read_csv("../data/pipeline_data/paper_flagging_data/10_chunks_labelled.csv")
# pred_chunks_11 = pd.read_csv("../data/pipeline_data/paper_flagging_data/11_chunks_labelled.csv")


# # Concatenate data
# df_test = pd.concat([pred_chunks_0, pred_chunks_1, pred_chunks_2, pred_chunks_3, pred_chunks_4, pred_chunks_5, pred_chunks_6, pred_chunks_7, 
#                      pred_chunks_8, pred_chunks_9, pred_chunks_10, pred_chunks_11])

# # Load dataframe as dataset
# test = Dataset.from_pandas(df_test)

# # Tokenize test dataset
# test = test.map(tokenize_function, batched=True)

# # Set the format of the datasets to include only the required columns
# test = test.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])

# # Define DatasetDict
# ds_test = DatasetDict({
#     "test": test
# })

In [9]:
def validate_model(trainer):
    # Load prediction chunks
    pred_chunks_0 = pd.read_csv("../data/pipeline_data/paper_flagging_data/0_chunks_labelled.csv")
    pred_chunks_1 = pd.read_csv("../data/pipeline_data/paper_flagging_data/1_chunks_labelled.csv")
    pred_chunks_2 = pd.read_csv("../data/pipeline_data/paper_flagging_data/2_chunks_labelled.csv")
    pred_chunks_3 = pd.read_csv("../data/pipeline_data/paper_flagging_data/3_chunks_labelled.csv")
    pred_chunks_4 = pd.read_csv("../data/pipeline_data/paper_flagging_data/4_chunks_labelled.csv")
    pred_chunks_5 = pd.read_csv("../data/pipeline_data/paper_flagging_data/5_chunks_labelled.csv")
    pred_chunks_6 = pd.read_csv("../data/pipeline_data/paper_flagging_data/6_chunks_labelled.csv")
    pred_chunks_7 = pd.read_csv("../data/pipeline_data/paper_flagging_data/7_chunks_labelled.csv")
    pred_chunks_8 = pd.read_csv("../data/pipeline_data/paper_flagging_data/8_chunks_labelled.csv")
    pred_chunks_9 = pd.read_csv("../data/pipeline_data/paper_flagging_data/9_chunks_labelled.csv")
    pred_chunks_10 = pd.read_csv("../data/pipeline_data/paper_flagging_data/10_chunks_labelled.csv")
    pred_chunks_11 = pd.read_csv("../data/pipeline_data/paper_flagging_data/11_chunks_labelled.csv")
    
    
    # Concatenate data
    df_test = pd.concat([pred_chunks_0, pred_chunks_1, pred_chunks_2, pred_chunks_3, pred_chunks_4, pred_chunks_5, pred_chunks_6, pred_chunks_7, 
                         pred_chunks_8, pred_chunks_9, pred_chunks_10, pred_chunks_11])
    
    # Load dataframe as dataset
    test = Dataset.from_pandas(df_test)
    
    # Tokenize test dataset
    test = test.map(tokenize_function, batched=True)
    
    # Set the format of the datasets to include only the required columns
    test = test.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])
    
    # Define DatasetDict
    ds_test = DatasetDict({
        "test": test
    })
    
    # Test performance of the model on labeled chunks
    pred = trainer.predict(ds_test["test"])
    
    df_test["prediction"] = pred.predictions.argmax(-1)
    
    metrics = compute_metrics(pred)

    return metrics    

## Chunk labelling

In [11]:
# Load data
df = pd.read_csv('../data/pipeline_data/paper_flagging_data/bert_dataset.csv')
ds = ds_preparation(df, val_count=128)

# Load model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("../checkpoints/chunks-pubmed-bert-v2", num_labels=2)

trainer = fine_tune_model(ds, model_init, train=False)

# Check performance
metrics = validate_model(trainer)
print(metrics)

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/125 [00:00<?, ? examples/s]

{'accuracy': 0.68, 'f1': 0.6875000000000001, 'precision': 0.5301204819277109, 'recall': 0.9777777777777777}


In [13]:
# Load new papers
# data = pd.read_csv("../data/pipeline_data/paper_flagging_data/new_papers_dataset.csv")
data = pd.read_csv("../data/pipeline_data/test.csv")
chunked_data_df, chunked_data_pred = prediction_chunks(data[:10], tokenizer, trainer)

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [16]:
chunked_data_df

Unnamed: 0,text,position,pmid,prediction,probability
0,Rapid detection and tracking of Omicron varian...,0,35189535,1,0.959968
1,The new features of Omicron manifested the imp...,1,35189535,1,0.944278
2,Plasmid construction and virus RNA of SARS-CoV...,2,35189535,1,0.993587
3,"Furthermore, five oropharyngeal swab specimens...",3,35189535,1,0.938198
4,Workflow of RT-PCR/CRISPR-Cas12a-mediated assa...,4,35189535,1,0.976641
...,...,...,...,...,...
195,Immune dysregulation and autoreactivity correl...,40,35214706,0,0.994895
196,Optimizing immunization strategies for the ind...,41,35214706,0,0.877917
197,Vaccine Breakthrough Infections with SARS-CoV-...,42,35214706,1,0.992595
198,Photodynamic Vaccination of BALB/c Mice for Pr...,43,35214706,0,0.950951


## Paper classification

In [17]:
import lightgbm as lgb

In [23]:
def paper_prediction(data, bst, tokenizer, trainer):
    chunked_data_df, chunked_data_pred = prediction_chunks(data, tokenizer, trainer)
    
    # Load lightbgm model
    # bst = lgb.Booster(model_file)

    # Format lightGBM data 
    data = chunked_data_df
    
    # Format data for lightGBM
    grouped = data.groupby('pmid')
    
    # Maximum number of data points in any group
    max_len = 133

    # Create DataFrame with appropriate number of columns
    columns = [f'prediction_{i}' for i in range(max_len)]
    columns.append("pmid")
    
    df = pd.DataFrame(columns=columns)
    
    for name, group in grouped:
        predictions = group["prediction"].values.astype(float)
        entry = np.pad(predictions, (0, max_len - len(predictions)), constant_values=np.nan)
        entry = np.append(entry, name)
        df.loc[name] = entry

    predictions = bst.predict(df.drop(columns="pmid"), num_iteration=bst.best_iteration)
    pred = np.where(predictions < 0.5, 0, 1)
    df["prediction"] = pred.T

    df['pmid'] = df['pmid'].astype(int)
    flagged_papers = df[df['prediction'] == 1]['pmid']
    
    relevant_papers = flagged_papers.tolist()
    flagged = chunked_data_df[chunked_data_df["pmid"].isin(relevant_papers)]

    return flagged

In [24]:
# data = pd.read_csv("../data/pipeline_data/paper_flagging_data/new_papers_dataset.csv")
data = pd.read_csv("../data/pipeline_data/test.csv")

# Load data
df = pd.read_csv('../data/pipeline_data/paper_flagging_data/bert_dataset.csv')
ds = ds_preparation(df, val_count=128)

# Load model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("../checkpoints/chunks-pubmed-bert-v2", num_labels=2)

trainer = fine_tune_model(ds, model_init, train=False)

results = paper_prediction(data[:10], bst, tokenizer, trainer)

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

In [25]:
results

Unnamed: 0,text,position,pmid,prediction,probability
0,Rapid detection and tracking of Omicron varian...,0,35189535,1,0.959968
1,The new features of Omicron manifested the imp...,1,35189535,1,0.944278
2,Plasmid construction and virus RNA of SARS-CoV...,2,35189535,1,0.993587
3,"Furthermore, five oropharyngeal swab specimens...",3,35189535,1,0.938198
4,Workflow of RT-PCR/CRISPR-Cas12a-mediated assa...,4,35189535,1,0.976641
5,"Finally, the fluorescence signal was measured ...",5,35189535,1,0.986942
6,"In all panels, error bars represent the mean +...",6,35189535,1,0.987321
7,2B-E). To estimate the detection limit of our ...,7,35189535,1,0.985151
8,"Furthermore, a relatively lower signal was obs...",8,35189535,1,0.985241
9,"In our study, Omicron and wild-type targets we...",9,35189535,1,0.891522


In [19]:
# Load lightbgm model
bst = lgb.Booster(model_file='../checkpoints/lightgbm_model.txt')

In [None]:
# Format lightGBM data 
data = chunked_data_df

# Format data for lightGBM
grouped = data.groupby('paper')

# Maximum number of data points in any group
max_len = 133
# max_len = max(grouped.size())
# print(max_len)

# Create DataFrame with appropriate number of columns
columns = [f'prediction_{i}' for i in range(max_len)]
columns.append("paper")

df = pd.DataFrame(columns=columns)

for name, group in grouped:
    predictions = group["prediction"].values.astype(float)
    entry = np.pad(predictions, (0, max_len - len(predictions)), constant_values=np.nan)
    # entry = np.pad(predictions, (0, 133 - len(predictions)), constant_values=np.nan)
    entry = np.append(entry, name)
    df.loc[name] = entry

In [None]:
predictions = bst.predict(df.drop(columns="paper"), num_iteration=bst.best_iteration)
pred = np.where(predictions < 0.5, 0, 1)
df["prediction"] = pred.T

In [None]:
df['paper'] = df['paper'].astype(int)
flagged_papers = df[df['prediction'] == 1]['paper']

relevant_papers = flagged_papers.tolist()
flagged = chunked_data_df[chunked_data_df["paper"].isin(relevant_papers)]

In [None]:
print(flagged["text"][0])

In [None]:
print(flagged["text"][0])

# NER with BERN2

In [26]:
import requests
import pickle

In [27]:
flagged = results

In [28]:
def query_plain(text, url="http://localhost:8888/plain"):
    return requests.post(url, json={'text': text}).json()

# port = "http://172.19.8.251:8888/plain"
port = "http://172.19.4.131:8888/plain"

In [30]:
grouped_papers = flagged.groupby('pmid')

for name, group in grouped_papers:
    NER_list = list()
    for text in group["text"]:
        NER = query_plain(text, url = port)
        NER_list.append(NER)
    file_name = "../data/pipeline_data/NER/" + str(name) + "_paper.pkl"
    with open(file_name, 'wb') as f:
        pickle.dump(NER_list, f)

# Figure out what each mutation does
https://www.reddit.com/r/MachineLearning/comments/o0kixr/improving_bart_text_summarization_by_providing/

https://peterbloem.nl/blog/transformers

# Description of mutations

In [1]:
from pathlib import Path
import pickle
import spacy
from collections import defaultdict

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
files = Path("/home/david.yang1/autolit/viriation/data/pipeline_data/NER").glob("*.pkl")

In [7]:
def load_mutations(path = "/home/david.yang1/autolit/viriation/data/pipeline_data/NER"):
    files = Path(path).glob("*.pkl")
    for file in files:
        # Initialize output dictionary
        output = defaultdict(lambda:{"doi": [], "text": []})
        
        with open(file, 'rb') as f:
            # Load NERs
            ner = pickle.load(f) 

            for ner_chunk in ner:
                text = ner_chunk['text']

                # Process text chunks for increased readability
                chunk = nlp(text)

                # Split text into sentences
                sentences = [sent.text for sent in chunk.sents]
                annotations = ner_chunk['annotations']
                
                for annotation in annotations:
                    if annotation['obj'] == 'mutation':
                        mutation = annotation["mention"]
                        for count, sent in enumerate(sentences):
                            if mutation in sent:
                                context = []

                                # Save sentence before and after mutation
                                if count != 0:
                                    context.append(sentences[count-1])
    
                                context.append(sent)

                                if count != (len(sentences)-1):
                                    context.append(sentences[count+1])
    
                                context = " ".join(context)
    
                                output[mutation]["text"].append(context) 
        
    return output

In [8]:
res = load_mutations()

In [15]:
res["K417N"]["text"][0]

'The newly emerged fifth variant of concern (VOC) Omicron was firstly reported in South Africa on November 24, 2021 and has been detected in many countries. Omicron variant contains more than 32 amino acid mutations in the spike protein, including multiple vital amino acid mutations (K417N, T478K, E484A, N501Y, and D614G) that have been already detected in other VOCs of SARS-CoV-2 and proved to be associated with enhanced transmissibility, virulence, and greater resistance to the immune protection induced by COVID-19 vaccines. The new features of Omicron manifested the importance of tracking its spread.'