In [30]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForTokenClassification
from transformers import BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences


if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [31]:
train_df = pd.read_pickle("../data/train.df")
test_df = pd.read_pickle("../data/test.df")
val_df = pd.read_pickle("../data/val.df")

In [32]:
ignore_genotypes = ['WT', 'wt', 'Wt', 'wild type', 'control', 'KO',
                    'wild-type', 'wildtype', 'Control', 'Wild type', 
                    'Wild-type', 'Wildtype', 'Wild Type', 'knockout']

In [40]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract')

# Define the BIO labels
BIO_LABELS = ['B-GENOTYPE', 'I-GENOTYPE', 'O']


def split_list(lst, size):
    return [lst[i:i+size] for i in range(0, len(lst), size)]

In [41]:
# Tokenize the text and do BIO labeling
def offset_tokenization(df):
    tokens = []
    offsets = []
    index_offsets = []
    labels = []
    token_label_pairs = []
    for i, row in df.iterrows():
        text = row['Description']
        genotype = row['genotype']
        indexes = row['index_dict']
        train_encodings = tokenizer(text, return_offsets_mapping=True)
        tokenized_text = tokenizer.tokenize(text)
        token_labels = ['O'] * len(tokenized_text)
        # Get the character level offsets
        offsets = train_encodings.offset_mapping[1:-1]
        for key, value in indexes.items():
            if key not in ignore_genotypes:
                for v in value:
                    start = v[0]
                    end = v[1]
                    # Loop through each character level offset in the offsets
                    for k in range(len(offsets)):
                        offset_start = offsets[k][0]
                        offset_end = offsets[k][1]
                        if start <= offset_start and end >= offset_end:
        #                     print(row['tokens'][k], offsets[k], start, offset_start, end, offset_end)
                            if offset_start == start:
                                token_labels[k] = 'B-GENOTYPE'
                            else:
                                token_labels[k] = 'I-GENOTYPE'
        token_chunks = split_list(tokenized_text, 500)
        tokens.extend(token_chunks)
        label_chunks = split_list(token_labels, 500)
        labels.extend(label_chunks)
        token_label_pairs.extend([[(a, b) for a, b in zip(sublist_A, sublist_B)] for sublist_A, sublist_B in zip(token_chunks, label_chunks)])


    # Create a new dataframe with tokenized text and BIO labels
    new_df = pd.DataFrame({'tokens': tokens, 'labels': labels, 'token_label_pairs': token_label_pairs})
    return new_df

In [42]:
tok_train_df = offset_tokenization(train_df)
tok_test_df = offset_tokenization(test_df)
tok_val_df = offset_tokenization(val_df)

In [43]:
tok_train_df.iloc[2].token_label_pairs

[('purpose', 'O'),
 (':', 'O'),
 ('next', 'O'),
 ('-', 'O'),
 ('generation', 'O'),
 ('sequencing', 'O'),
 ('(', 'O'),
 ('ngs', 'O'),
 (')', 'O'),
 ('has', 'O'),
 ('revolution', 'O'),
 ('##ized', 'O'),
 ('systems', 'O'),
 ('-', 'O'),
 ('based', 'O'),
 ('analysis', 'O'),
 ('of', 'O'),
 ('cellular', 'O'),
 ('pathways', 'O'),
 ('.', 'O'),
 ('the', 'O'),
 ('goals', 'O'),
 ('of', 'O'),
 ('this', 'O'),
 ('study', 'O'),
 ('are', 'O'),
 ('to', 'O'),
 ('compare', 'O'),
 ('transcriptome', 'O'),
 ('##s', 'O'),
 ('of', 'O'),
 ('sirt', 'O'),
 ('##3', 'O'),
 ('wt', 'O'),
 ('and', 'O'),
 ('kr', 'O'),
 ('macrophages', 'O'),
 ('with', 'O'),
 ('high', 'O'),
 ('-', 'O'),
 ('throughput', 'O'),
 ('data', 'O'),
 ('analysis', 'O'),
 ('methods', 'O'),
 (':', 'O'),
 ('macrophages', 'O'),
 ("'", 'O'),
 ('mrna', 'O'),
 ('profiles', 'O'),
 ('of', 'O'),
 ('8', 'O'),
 ('weeks', 'O'),
 ('-', 'O'),
 ('old', 'O'),
 ('wild', 'O'),
 ('-', 'O'),
 ('type', 'O'),
 ('(', 'O'),
 ('wt', 'O'),
 (')', 'O'),
 ('and', 'O'),
 ('sir

In [44]:
train_df.iloc[2].genotype

array(['wild type', 'Sirt3K223R'], dtype=object)

In [45]:
tok_train_df

Unnamed: 0,tokens,labels,token_label_pairs
0,"[we, performed, rnase, ##q, experiments, to, e...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(we, O), (performed, O), (rnase, O), (##q, O)..."
1,"[lin, ##28, ##b, suppresses, ml, ##l, -, enl, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(lin, O), (##28, O), (##b, O), (suppresses, O..."
2,"[purpose, :, next, -, generation, sequencing, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(purpose, O), (:, O), (next, O), (-, O), (gen..."
3,"[., the, optimized, data, analysis, workflow, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(., O), (the, O), (optimized, O), (data, O), ..."
4,"[stress, granule, and, inflammasome, assembly,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(stress, O), (granule, O), (and, O), (inflamm..."
...,...,...,...
1116,"[the, trem, ##2, -, dap, ##12, receptor, compl...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(the, O), (trem, O), (##2, O), (-, O), (dap, ..."
1117,"[the, striatum, is, the, main, input, structur...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(the, O), (striatum, O), (is, O), (the, O), (..."
1118,"[/, +, mice, and, dl, ##x, ##5, /, 6, -, ci, #...","[I-GENOTYPE, I-GENOTYPE, O, O, B-GENOTYPE, I-G...","[(/, I-GENOTYPE), (+, I-GENOTYPE), (mice, O), ..."
1119,"[this, study, was, performed, to, compare, the...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[(this, O), (study, O), (was, O), (performed, ..."


In [46]:
print("Length of train dataframe:", len(tok_train_df))
print("Length of validation dataframe:", len(tok_test_df))
print("Length of test dataframe:", len(tok_val_df))

Length of train dataframe: 1121
Length of validation dataframe: 372
Length of test dataframe: 369


In [47]:
tok_train_df.to_pickle("../data/pubMed/offset/train.df")
tok_test_df.to_pickle("../data/pubMed/offset/test.df")
tok_val_df.to_pickle("../data/pubMed/offset/val.df")