# Paper flagging

In [46]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/home/david.yang1/.cache/huggingface/'
os.environ['HF_HOME'] = '/home/david.yang1/.cache/huggingface/'

In [47]:
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import defaultdict
import pandas as pd
from datasets import Dataset, DatasetDict
# from huggingface_hub import login
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.tokenize import sent_tokenize
from torch.nn import functional as F
import torch

## Helper functions

### Load training and validation data

In [48]:
# Load BioBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("NeuML/pubmedbert-base-embeddings")

# Tokenize the data
def tokenize_function(df):
    return tokenizer(
        df['text'],
        padding="longest",
        truncation=True,
        max_length = 512
    )



In [49]:
def ds_preparation(df, val_count=0):
    # Balance classes if needed
    df = df.groupby('label').sample(n=min(df['label'].value_counts()), random_state=42)
    
    # Shuffle the dataset
    df = df.sample(frac=1, random_state=42)
    df = df[["text", "label"]]
    
    # Split dataset into test & train
    df_train = df[val_count:]
    df_val = df[:val_count]
    
    tds = Dataset.from_pandas(df_train)
    vds = Dataset.from_pandas(df_val)

    # Apply the tokenizer to the datasets
    tds = tds.map(tokenize_function, batched=True)
    vds = vds.map(tokenize_function, batched=True)
    
    # Set the format of the datasets to include only the required columns
    tds = tds.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])
    vds = vds.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])
    
    # Define DatasetDict
    ds = DatasetDict({
        "train": tds,
        "validation": vds
    })

    return ds

### Model fine tuning


In [50]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [51]:
# Fine tune model
def fine_tune_model(ds, model_init, train=False):
    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy = "steps",
        eval_steps=500,
        num_train_epochs=3,    # number of training epochs
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_ratio=0.01,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    # Create the Trainer and start training
    trainer = Trainer(
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["validation"],
        model_init=model_init,
        compute_metrics=compute_metrics,
    )

    if train:
        trainer.train()

    if ds["validation"]:
        trainer.evaluate()

    return trainer

### Predictions

In [52]:
# Split text into <512 token chunks
def split_text_into_chunks(text, tokenizer, max_tokens=512, overlap_sentences=2):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Initialize variables
    chunks = []
    current_chunk = []
    current_chunk_len = 0

    for i, sentence in enumerate(sentences):
        # Tokenize the sentence using BERT Tokenizer
        tokens = tokenizer.tokenize(sentence)
        token_count = len(tokens)

        # Finalize the current chunk if adding this sentence exceed token limit
        if current_chunk_len + token_count > max_tokens:
            text_chunk = " ".join(current_chunk)
            chunks.append(text_chunk)

            # Create the next chunk with overlap
            overlap_start = max(0, i-overlap_sentences)
            current_chunk = []
            for j in range(overlap_start, i):
                current_chunk.append(sentences[j])
            current_chunk_len = len(current_chunk)

        # Add the current sentence tokens to the chunk
        current_chunk.append(sentence)
        current_chunk_len += token_count

    # Add the last chunk if it has content
    if current_chunk:
        text_chunk = " ".join(current_chunk)
        chunks.append(text_chunk)

    return chunks

In [53]:
# Predict label of dataframe
def prediction_chunks(df, tokenizer, trainer):
    output = pd.DataFrame()
    for i, text in enumerate(df["text"]):
        chunks = split_text_into_chunks(text, tokenizer)
        
        chunks_df = pd.DataFrame(chunks, columns=["text"])
        # chunks_df["label"] = df["label"][i]
        chunks_df["position"] = chunks_df.index
        chunks_df["paper"] = i
        
        t = Dataset.from_pandas(chunks_df)
        t = t.map(tokenize_function, batched=True)
        ds_t = DatasetDict({
            "test": t
        })

        pred = trainer.predict(ds_t["test"])
        chunks_df["prediction"] = pred.predictions.argmax(-1)

        # convert logit score to torch array
        torch_logits = torch.from_numpy(pred.predictions)

        # get probabilities using softmax from logit score and convert it to numpy array
        probabilities_scores = F.softmax(torch_logits, dim = -1).numpy()

        chunks_df["probability"] = probabilities_scores.max(-1)

        # save into output
        output = pd.concat([output, chunks_df], ignore_index=True)
        
    return output, pred

### Dataset to validate chunk prediction results

In [54]:
# Load prediction chunks
pred_chunks_0 = pd.read_csv("../data/pipeline_data/paper_flagging_data/0_chunks_labelled.csv")
pred_chunks_1 = pd.read_csv("../data/pipeline_data/paper_flagging_data/1_chunks_labelled.csv")
pred_chunks_2 = pd.read_csv("../data/pipeline_data/paper_flagging_data/2_chunks_labelled.csv")
pred_chunks_3 = pd.read_csv("../data/pipeline_data/paper_flagging_data/3_chunks_labelled.csv")
pred_chunks_4 = pd.read_csv("../data/pipeline_data/paper_flagging_data/4_chunks_labelled.csv")
pred_chunks_5 = pd.read_csv("../data/pipeline_data/paper_flagging_data/5_chunks_labelled.csv")
pred_chunks_6 = pd.read_csv("../data/pipeline_data/paper_flagging_data/6_chunks_labelled.csv")
pred_chunks_7 = pd.read_csv("../data/pipeline_data/paper_flagging_data/7_chunks_labelled.csv")
pred_chunks_8 = pd.read_csv("../data/pipeline_data/paper_flagging_data/8_chunks_labelled.csv")
pred_chunks_9 = pd.read_csv("../data/pipeline_data/paper_flagging_data/9_chunks_labelled.csv")
pred_chunks_10 = pd.read_csv("../data/pipeline_data/paper_flagging_data/10_chunks_labelled.csv")
pred_chunks_11 = pd.read_csv("../data/pipeline_data/paper_flagging_data/11_chunks_labelled.csv")


# Concatenate data
df_test = pd.concat([pred_chunks_0, pred_chunks_1, pred_chunks_2, pred_chunks_3, pred_chunks_4, pred_chunks_5, pred_chunks_6, pred_chunks_7, 
                     pred_chunks_8, pred_chunks_9, pred_chunks_10, pred_chunks_11])

# Load dataframe as dataset
test = Dataset.from_pandas(df_test)

# Tokenize test dataset
test = test.map(tokenize_function, batched=True)

# Set the format of the datasets to include only the required columns
test = test.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])

# Define DatasetDict
ds_test = DatasetDict({
    "test": test
})

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

In [55]:
def validate_model(trainer):
    # Test performance of the model on labeled chunks
    pred = trainer.predict(ds_test["test"])
    
    df_test["prediction"] = pred.predictions.argmax(-1)
    
    metrics = compute_metrics(pred)

    # df_test.to_csv("df_test_chunks.csv")

    return metrics    

## Chunk labelling

In [56]:
# Load data
df = pd.read_csv('../data/pipeline_data/paper_flagging_data/bert_dataset.csv')
ds = ds_preparation(df, val_count=128)

# Load model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("../models/chunks-pubmed-bert-v2", num_labels=2)

trainer = fine_tune_model(ds, model_init, train=False)

# Check performance
metrics = validate_model(trainer)
print(metrics)

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


{'accuracy': 0.68, 'f1': 0.6875000000000001, 'precision': 0.5301204819277109, 'recall': 0.9777777777777777}


In [57]:
# Load new papers
data = pd.read_csv("../data/pipeline_data/paper_flagging_data/new_papers_dataset.csv")
chunked_data_df, chunked_data_pred = prediction_chunks(data[:10], tokenizer, trainer)

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Map:   0%|          | 0/19 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/45 [00:00<?, ? examples/s]

## Paper classification

In [58]:
import lightgbm as lgb

In [59]:
# Load lightbgm model
bst = lgb.Booster(model_file='../models/lightgbm_model.txt')

In [60]:
# Format lightGBM data 
data = chunked_data_df

# Format data for lightGBM
grouped = data.groupby('paper')

# Maximum number of data points in any group
max_len = 133
# max_len = max(grouped.size())
# print(max_len)

# Create DataFrame with appropriate number of columns
columns = [f'prediction_{i}' for i in range(max_len)]
columns.append("paper")

df = pd.DataFrame(columns=columns)

for name, group in grouped:
    predictions = group["prediction"].values.astype(float)
    entry = np.pad(predictions, (0, max_len - len(predictions)), constant_values=np.nan)
    # entry = np.pad(predictions, (0, 133 - len(predictions)), constant_values=np.nan)
    entry = np.append(entry, name)
    df.loc[name] = entry

In [61]:
predictions = bst.predict(df.drop(columns="paper"), num_iteration=bst.best_iteration)
pred = np.where(predictions < 0.5, 0, 1)
df["prediction"] = pred.T

In [62]:
df['paper'] = df['paper'].astype(int)
flagged_papers = df[df['prediction'] == 1]['paper']

relevant_papers = flagged_papers.tolist()
flagged = chunked_data_df[chunked_data_df["paper"].isin(relevant_papers)]

In [63]:
print(flagged["text"][0])

Rapid detection and tracking of Omicron variant of SARS-CoV-2 using CRISPR-Cas12a-based assayBackgroundThe newly emerged SARS-CoV-2 variant of concern (VOC) Omicron is spreading quickly worldwide, which manifests an urgent need of simple and rapid assay to detect and diagnose Omicron infection and track its spread.MethodsTo design allele-specific CRISPR RNAs (crRNAs) targeting the signature mutations in the spike protein of Omicron variant, and to develop a CRISPR-Cas12a-based assay to specifically detect Omicron variant.ResultsOur system showed a low limit of detection of 2 copies per reaction for the plasmid DNA of Omicron variant, and could readily detect Omicron variant in 5 laboratory-confirmed clinical samples and distinguish them from 57 SARS-CoV-2 positive clinical samples (4 virus isolates and 53 oropharyngeal swab specimens) infected with wild-type (N = 8) and the variants of Alpha (N = 17), Beta (N = 17) and Delta (N = 15). The testing results could be measured by fluorescen

In [44]:
print(flagged["text"][0])

Rapid detection and tracking of Omicron variant of SARS-CoV-2 using CRISPR-Cas12a-based assayBackgroundThe newly emerged SARS-CoV-2 variant of concern (VOC) Omicron is spreading quickly worldwide, which manifests an urgent need of simple and rapid assay to detect and diagnose Omicron infection and track its spread.MethodsTo design allele-specific CRISPR RNAs (crRNAs) targeting the signature mutations in the spike protein of Omicron variant, and to develop a CRISPR-Cas12a-based assay to specifically detect Omicron variant.ResultsOur system showed a low limit of detection of 2 copies per reaction for the plasmid DNA of Omicron variant, and could readily detect Omicron variant in 5 laboratory-confirmed clinical samples and distinguish them from 57 SARS-CoV-2 positive clinical samples (4 virus isolates and 53 oropharyngeal swab specimens) infected with wild-type (N = 8) and the variants of Alpha (N = 17), Beta (N = 17) and Delta (N = 15).The testing results could be measured by fluorescent

# NER with BERN2

In [38]:
import requests
import pickle

In [41]:
def query_plain(text, url="http://localhost:8888/plain"):
    return requests.post(url, json={'text': text}).json()

port = "http://172.19.8.251:8888/plain"

In [42]:
grouped_papers = flagged.groupby('paper')

for name, group in grouped_papers:
    NER_list = list()
    for text in group["text"]:
        NER = query_plain(text, url = port)
        NER_list.append(NER)
    file_name = "../data/pipeline_data/NER/" + str(name) + "_paper.pkl"
    with open(file_name, 'wb') as f:
        pickle.dump(NER_list, f)

# Figure out what each mutation does
https://www.reddit.com/r/MachineLearning/comments/o0kixr/improving_bart_text_summarization_by_providing/

https://peterbloem.nl/blog/transformers

# Description of mutations

In [65]:
from pathlib import Path
import pickle
import spacy

In [66]:
nlp = spacy.load("en_core_web_sm")

In [None]:
files = Path("/home/david.yang1/autolit/viriation/data/pipeline_data/NER").glob("*.pkl")

In [None]:
for file in files:
    with open(file, 'rb') as f:
        ner = pickle.load(f) # deserialize using load()
        for ner_chunk in ner:
            text = ner_chunk['text']
            # sentences = nltk.sent_tokenize(text)
            chunk = nlp(text)
            print(chunk)
            sentences = [sent.text for sent in chunk.sents]
            # print(sentences)
            print(len(sentences))
            annotations = ner_chunk['annotations']
            for annotation in annotations:
                if annotation['obj'] == 'mutation':
                    print(annotation["mention"])
                    for sent in sentences:
                        if annotation['mention'] in sent:
                            print(sent)
                            print("")