In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForTokenClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.sequence import pad_sequences


if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


2023-07-29 15:22:04.792799: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-29 15:22:04.942118: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
success_df = pd.read_pickle("merged.df")

In [3]:
ignore_genotypes = ['WT', 'wt', 'Wt', 'wild type', 'control', 'KO',
                    'wild-type', 'wildtype', 'Control', 'Wild type', 
                    'Wild-type', 'Wildtype', 'Wild Type', 'knockout']

In [4]:
# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the BIO labels
BIO_LABELS = ['B-GENOTYPE', 'I-GENOTYPE', 'O']


def split_list(lst, size):
    return [lst[i:i+size] for i in range(0, len(lst), size)]

# Tokenize the text and do BIO labeling
tokens = []
labels = []
token_label_pairs = []
for i, row in success_df.iterrows():
    text = row['Description']
    genotype = row['genotype']
    tokenized_text = tokenizer.tokenize(text)
    token_labels = ['O'] * len(tokenized_text)
    for g in genotype:
        if g not in ignore_genotypes:
            g_tokens = tokenizer.tokenize(g)
            start_idx = 0
            for j, token in enumerate(tokenized_text):
                if token == g_tokens[start_idx]:
                    if start_idx == len(g_tokens) - 1:
                        token_labels[j] = BIO_LABELS[0]
                    else:
                        token_labels[j] = BIO_LABELS[0]
                        start_idx += 1
                        while start_idx < len(g_tokens) and tokenized_text[j+1] == g_tokens[start_idx]:
                            token_labels[j+1] = BIO_LABELS[1]
                            j += 1
                            start_idx += 1
                        if start_idx == len(g_tokens):
                            break
    token_chunks = split_list(tokenized_text, 500)
    tokens.extend(token_chunks)
    label_chunks = split_list(token_labels, 500)
    labels.extend(label_chunks)
    token_label_pairs.extend([[(a, b) for a, b in zip(sublist_A, sublist_B)] for sublist_A, sublist_B in zip(token_chunks, label_chunks)])

# Create a new dataframe with tokenized text and BIO labels
new_df = pd.DataFrame({'tokens': tokens, 'labels': labels, 'token_label_pairs': token_label_pairs})

In [5]:
new_df.iloc[2].token_label_pairs

[('eco', 'O'),
 ('##tro', 'O'),
 ('##pic', 'O'),
 ('viral', 'O'),
 ('integration', 'O'),
 ('site', 'O'),
 ('1', 'O'),
 ('(', 'O'),
 ('ev', 'B-GENOTYPE'),
 ('##i', 'I-GENOTYPE'),
 ('##1', 'I-GENOTYPE'),
 ('/', 'O'),
 ('me', 'O'),
 ('##com', 'O'),
 (')', 'O'),
 ('over', 'B-GENOTYPE'),
 ('##ex', 'I-GENOTYPE'),
 ('##press', 'O'),
 ('##ion', 'O'),
 ('is', 'O'),
 ('common', 'O'),
 ('in', 'O'),
 ('my', 'O'),
 ('##elo', 'O'),
 ('##id', 'O'),
 ('mali', 'O'),
 ('##gnan', 'O'),
 ('##cies', 'O'),
 ('.', 'O'),
 ('we', 'O'),
 ('present', 'O'),
 ('a', 'O'),
 ('new', 'O'),
 ('ev', 'O'),
 ('##i', 'O'),
 ('##1', 'O'),
 ('trans', 'O'),
 ('##genic', 'O'),
 ('mouse', 'O'),
 ('model', 'O'),
 ('with', 'O'),
 ('ind', 'O'),
 ('##ucible', 'O'),
 ('expression', 'O'),
 ('in', 'O'),
 ('hem', 'O'),
 ('##ato', 'O'),
 ('##po', 'O'),
 ('##ie', 'O'),
 ('##tic', 'O'),
 ('stem', 'O'),
 ('cells', 'O'),
 ('(', 'O'),
 ('hs', 'O'),
 ('##cs', 'O'),
 (')', 'O'),
 ('and', 'O'),
 ('pro', 'O'),
 ('##gen', 'O'),
 ('##itor', 'O'),


In [6]:
success_df.iloc[2].genotype

array(['WT', 'Evi1 overexpressing'], dtype=object)

In [7]:
new_df.to_pickle("tokens.df")