In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForTokenClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences


if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


2023-08-28 20:20:01.253250: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-28 20:20:01.338791: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_df = pd.read_pickle("../data/train.df")
test_df = pd.read_pickle("../data/test.df")
val_df = pd.read_pickle("../data/val.df")

In [3]:
train_df

Unnamed: 0,ID,Project ID,Project Name,Title,Description,BioProjectID,genotype,index_dict
1591,755255,PRJNA755255,Epigenetic Treatment of Behavioral and Physiol...,Epigenetic Treatment of Behavioral and Physiol...,We performed RNAseq experiments to examine gen...,755255,"[PS19(P301S), (C57BL/6 x C3H)]","{'PS19(P301S)': [(197, 208)], '(C57BL/6 x C3H)..."
906,831177,PRJNA831177,IDENTIFICATION AND FUNCTIONAL DEMONSTRATION OF...,IDENTIFICATION AND FUNCTIONAL DEMONSTRATION OF...,LIN28B suppresses MLL-ENL driven AML. The tumo...,831177,"[WT, iMLL-ENL]","{'WT': [(185, 187)], 'iMLL-ENL': []}"
1175,800666,PRJNA800666,Next Generation Sequencing Facilitates Quantit...,Next Generation Sequencing Facilitates Quantit...,Purpose: Next-generation sequencing (NGS) has ...,800666,"[wild type, Sirt3K223R]","{'wild type': [(3001, 3010)], 'Sirt3K223R': [(..."
447,886455,PRJNA886455,FAM69C promotes stress granule assembly and su...,FAM69C promotes stress granule assembly and su...,Stress granule and inflammasome assembly deter...,886455,"[WT, Fam69c KO]","{'WT': [(1203, 1205)], 'Fam69c KO': []}"
1226,793829,PRJNA793829,Next Generation Sequencing Facilitates Quantit...,Next Generation Sequencing Facilitates Quantit...,Conclusion: Elf4fl/fl-Villin-Cre mice are more...,793829,"[wild type, Elf4fl/fl-Villin-Cre]","{'wild type': [], 'Elf4fl/fl-Villin-Cre': [(12..."
...,...,...,...,...,...,...,...,...
1130,805102,PRJNA805102,FAM111A is dispensable for electrolyte homeost...,FAM111A is dispensable for electrolyte homeost...,Autosomal dominant mutations in FAM111A are ca...,805102,"[Fam111a+/+, Fam111a-/-]","{'Fam111a+/+': [(1497, 1507)], 'Fam111a-/-': [..."
1294,785586,PRJNA785586,Single nucleus analysis of brain tissues from ...,Single nucleus analysis of brain tissues from ...,The TREM2-DAP12 receptor complex sustains micr...,785586,"[K{delta}75, WT]","{'K{delta}75': [], 'WT': [(1336, 1338)]}"
860,836544,PRJNA836544,Transcription factor Sp9 is a negative regulat...,Transcription factor Sp9 is a negative regulat...,The striatum is the main input structure of th...,836544,"[Dlx5/6-CIE, Dlx5/6-CIE; Rosa-Sp9-OE/+]","{'Dlx5/6-CIE': [(2220, 2230), (2255, 2265)], '..."
1459,767626,PRJNA767626,RNA sequencing of epidermal stem cells in Fbln...,RNA sequencing of epidermal stem cells in Fbln...,This study was performed to compare the transc...,767626,"[Fbln7 WT, Fbln7 KO]","{'Fbln7 WT': [(830, 838)], 'Fbln7 KO': []}"


In [4]:
ignore_genotypes = ['WT', 'wt', 'Wt', 'wild type', 'control', 'KO',
                    'wild-type', 'wildtype', 'Control', 'Wild type', 
                    'Wild-type', 'Wildtype', 'Wild Type', 'knockout']

In [21]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract')

# Define the BIO labels
BIO_LABELS = ['B-GENOTYPE', 'I-GENOTYPE', 'O']


def split_list(lst, size):
    return [lst[i:i+size] for i in range(0, len(lst), size)]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [22]:
# Tokenize the text and do BIO labeling
def normal_tokenization(df):
    tokens = []
    labels = []
    token_label_pairs = []
    for i, row in df.iterrows():
        text = row['Description']
        genotype = row['genotype']
        tokenized_text = tokenizer.tokenize(text)
        token_labels = ['O'] * len(tokenized_text)
        for g in genotype:
            if g not in ignore_genotypes:
                g_tokens = tokenizer.tokenize(g)
                start_idx = 0
                for j, token in enumerate(tokenized_text):
                    if token == g_tokens[start_idx]:
                        if start_idx == len(g_tokens) - 1:
                            token_labels[j] = BIO_LABELS[0]
                        else:
                            token_labels[j] = BIO_LABELS[0]
                            start_idx += 1
                            while start_idx < len(g_tokens) and tokenized_text[j+1] == g_tokens[start_idx]:
                                token_labels[j+1] = BIO_LABELS[1]
                                j += 1
                                start_idx += 1
                            if start_idx == len(g_tokens):
                                break
        token_chunks = split_list(tokenized_text, 500)
        tokens.extend(token_chunks)
        label_chunks = split_list(token_labels, 500)
        labels.extend(label_chunks)
        token_label_pairs.extend([[(a, b) for a, b in zip(sublist_A, sublist_B)] for sublist_A, sublist_B in zip(token_chunks, label_chunks)])

    # Create a new dataframe with tokenized text and BIO labels
    new_df = pd.DataFrame({'tokens': tokens, 'labels': labels, 'token_label_pairs': token_label_pairs})
    return new_df

In [23]:
tok_train_df = normal_tokenization(train_df)
tok_test_df = normal_tokenization(test_df)
tok_val_df = normal_tokenization(val_df)

In [24]:
tok_train_df.iloc[2].token_label_pairs

[('purpose', 'O'),
 (':', 'O'),
 ('next', 'O'),
 ('-', 'O'),
 ('generation', 'O'),
 ('sequencing', 'O'),
 ('(', 'O'),
 ('ngs', 'O'),
 (')', 'O'),
 ('has', 'O'),
 ('revolution', 'O'),
 ('##ized', 'O'),
 ('systems', 'O'),
 ('-', 'O'),
 ('based', 'O'),
 ('analysis', 'O'),
 ('of', 'O'),
 ('cellular', 'O'),
 ('pathways', 'O'),
 ('.', 'O'),
 ('the', 'O'),
 ('goals', 'O'),
 ('of', 'O'),
 ('this', 'O'),
 ('study', 'O'),
 ('are', 'O'),
 ('to', 'O'),
 ('compare', 'O'),
 ('transcriptome', 'O'),
 ('##s', 'O'),
 ('of', 'O'),
 ('sirt', 'B-GENOTYPE'),
 ('##3', 'O'),
 ('wt', 'O'),
 ('and', 'O'),
 ('kr', 'O'),
 ('macrophages', 'O'),
 ('with', 'O'),
 ('high', 'O'),
 ('-', 'O'),
 ('throughput', 'O'),
 ('data', 'O'),
 ('analysis', 'O'),
 ('methods', 'O'),
 (':', 'O'),
 ('macrophages', 'O'),
 ("'", 'O'),
 ('mrna', 'O'),
 ('profiles', 'O'),
 ('of', 'O'),
 ('8', 'O'),
 ('weeks', 'O'),
 ('-', 'O'),
 ('old', 'O'),
 ('wild', 'O'),
 ('-', 'O'),
 ('type', 'O'),
 ('(', 'O'),
 ('wt', 'O'),
 (')', 'O'),
 ('and', 'O'

In [25]:
train_df.iloc[2].genotype

array(['wild type', 'Sirt3K223R'], dtype=object)

In [26]:
tok_train_df

Unnamed: 0,tokens,labels,token_label_pairs
0,"[we, performed, rnase, ##q, experiments, to, e...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(we, O), (performed, O), (rnase, O), (##q, O)..."
1,"[lin, ##28, ##b, suppresses, ml, ##l, -, enl, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(lin, O), (##28, O), (##b, O), (suppresses, O..."
2,"[purpose, :, next, -, generation, sequencing, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(purpose, O), (:, O), (next, O), (-, O), (gen..."
3,"[., the, optimized, data, analysis, workflow, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(., O), (the, O), (optimized, O), (data, O), ..."
4,"[stress, granule, and, inflammasome, assembly,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-GENO...","[(stress, O), (granule, O), (and, O), (inflamm..."
...,...,...,...
1116,"[the, trem, ##2, -, dap, ##12, receptor, compl...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(the, O), (trem, O), (##2, O), (-, O), (dap, ..."
1117,"[the, striatum, is, the, main, input, structur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(the, O), (striatum, O), (is, O), (the, O), (..."
1118,"[/, +, mice, and, dl, ##x, ##5, /, 6, -, ci, #...","[I-GENOTYPE, I-GENOTYPE, O, O, O, O, O, O, O, ...","[(/, I-GENOTYPE), (+, I-GENOTYPE), (mice, O), ..."
1119,"[this, study, was, performed, to, compare, the...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(this, O), (study, O), (was, O), (performed, ..."


In [27]:
print("Length of train dataframe:", len(tok_train_df))
print("Length of validation dataframe:", len(tok_test_df))
print("Length of test dataframe:", len(tok_val_df))

Length of train dataframe: 1121
Length of validation dataframe: 372
Length of test dataframe: 369


In [29]:
tok_train_df.to_pickle("../data/pubMed/normal/train.df")
tok_test_df.to_pickle("../data/pubMed/normal/test.df")
tok_val_df.to_pickle("../data/pubMed/normal/val.df")