## 유전자 변이 해석
Get Data "wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz" to CMD for NER   
Get Data "wget ftp://ftp.ncbi.nlm.nih.gov/pub/lu/BioRED/BIORED.zip" to CMD for RE

### Settings

#### Model Load

In [2]:
import torch
from datasets import Dataset
import numpy as np
from evaluate import load
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForTokenClassification
from itertools import chain


#### Set GPU

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### EDA

In [4]:
'''
file_path = "./variant_summary.csv"
df = pd.read_csv(file_path, sep="\t", low_memory=False)

df.dropna(inplace=True)

print(df.head())'
'''

'\nfile_path = "./variant_summary.csv"\ndf = pd.read_csv(file_path, sep="\t", low_memory=False)\n\ndf.dropna(inplace=True)\n\nprint(df.head())\'\n'

### Make Train Sentence

In [5]:
'''
# Train Data Generate
sentences = []

# Filter
Filter = ["not specified", "not provided", "unknown"]

# Template
templates = [
    "Mutations in the {gene} gene can cause {disease}.",
    "Variations in the {gene} gene are associated with {disease}.",
    "Abnormalities in the {gene} gene may contribute to {disease}.",
    "Specific mutations in the {gene} gene can increase the risk of {disease}.",
    "Certain variants of the {gene} gene are frequently observed in patients with {disease}."
    ]

unique_gene_disease = set()
for _, row in df.iterrows():
    gene = row["GeneSymbol"]
    disease = row["PhenotypeList"].split("|")[0]  # "Hereditary spastic paraplegia 48|not provided" -> "Hereditary spastic paraplegia 48"

    if disease.lower() not in Filter:
        unique_gene_disease.add((gene, disease))

for gene, disease in unique_gene_disease:
    for template in templates:
        sentence = template.format(gene = gene, disease = disease)
        sentences.append((sentence, gene, disease))

for sentence in sentences[:5]:
    print(sentence)
'''

'\n# Train Data Generate\nsentences = []\n\n# Filter\nFilter = ["not specified", "not provided", "unknown"]\n\n# Template\ntemplates = [\n    "Mutations in the {gene} gene can cause {disease}.",\n    "Variations in the {gene} gene are associated with {disease}.",\n    "Abnormalities in the {gene} gene may contribute to {disease}.",\n    "Specific mutations in the {gene} gene can increase the risk of {disease}.",\n    "Certain variants of the {gene} gene are frequently observed in patients with {disease}."\n    ]\n\nunique_gene_disease = set()\nfor _, row in df.iterrows():\n    gene = row["GeneSymbol"]\n    disease = row["PhenotypeList"].split("|")[0]  # "Hereditary spastic paraplegia 48|not provided" -> "Hereditary spastic paraplegia 48"\n\n    if disease.lower() not in Filter:\n        unique_gene_disease.add((gene, disease))\n\nfor gene, disease in unique_gene_disease:\n    for template in templates:\n        sentence = template.format(gene = gene, disease = disease)\n        sentenc

### Token Tagging

In [6]:
'''
def convert_to_bio(sentence, gene, disease):
    words = sentence.split()
    labels = ["O"] * len(words)

    # Gene Tagging
    for i, word in enumerate(words):
        if word == gene:
            labels[i] = "B-GENE"
    
    # Disease Tagging
    disease_words = disease.split()
    for i, word in enumerate(words):
        if word in disease_words:
            labels[i] = "B-DISEASE" if i == 0 else "I-DISEASE"
    
    return list(zip(words, labels))
'''

'\ndef convert_to_bio(sentence, gene, disease):\n    words = sentence.split()\n    labels = ["O"] * len(words)\n\n    # Gene Tagging\n    for i, word in enumerate(words):\n        if word == gene:\n            labels[i] = "B-GENE"\n    \n    # Disease Tagging\n    disease_words = disease.split()\n    for i, word in enumerate(words):\n        if word in disease_words:\n            labels[i] = "B-DISEASE" if i == 0 else "I-DISEASE"\n    \n    return list(zip(words, labels))\n'

In [7]:
'''
bio_sentences = [convert_to_bio(sentence, gene, disease) for sentence, gene, disease in sentences]

for bio in bio_sentences[:5]:
    print(bio)
'''

'\nbio_sentences = [convert_to_bio(sentence, gene, disease) for sentence, gene, disease in sentences]\n\nfor bio in bio_sentences[:5]:\n    print(bio)\n'

In [8]:
# NER Train Data Save
'''
with open("ner_dataset.txt", "w", encoding="utf-8") as f:
    for bio in bio_sentences:
        for word, tag in bio:
            f.write(f"{word} {tag}\n")
        f.write("\n")'
'''

'\nwith open("ner_dataset.txt", "w", encoding="utf-8") as f:\n    for bio in bio_sentences:\n        for word, tag in bio:\n            f.write(f"{word} {tag}\n")\n        f.write("\n")\'\n'

In [9]:
'''
"O" : Nothing
"B-GENE" : Gene Token(Begin)
"I-GENE" : Gene Token(Inside)
"B-DISEASE" : Disease Token(Begin)
"I-DISEASE" : Disease Token(Inside)
'''

label_list = ["O", "B-GENE", "I-GENE", "B-DISEASE", "I-DISEASE"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

### Load Train Data

In [10]:
# Sentence combine
def load_ner_data(file_path):
    # Restore Sentence
    sentences = []
    # Words in Sentence
    words = []
    # Tag in Sentence
    tags = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # config ""(Finish Sentence)
            if not line:
                if words:
                    sentences.append((words, tags))
                    words = []
                    tags = []
            else:
                token, label = line.split()
                words.append(token)
                tags.append(label)
        # If Last Sentence not Finish with "" Preprocess
        if words:
            sentences.append((words, tags))
    return sentences

In [11]:
file_path = "./ner_dataset.txt"
dataset = load_ner_data(file_path)

### Train, Valid, Test Split

In [12]:
# Split train : valid : test = 7 : 1 : 2
train_data, test_data = train_test_split(dataset, test_size=0.3, random_state=42)
valid_data, test_data = train_test_split(test_data, test_size=0.7, random_state=42)

print(f"train : valid :  test : {len(train_data)} : {len(valid_data)} : {len(test_data)}")

train : valid :  test : 189108 : 24314 : 56733


In [13]:
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Convert HuggingFace Dataset

In [14]:
def tokenize_and_align_labels(examples):
    words = examples['words']
    tags = examples['tags']
    
    # Word to SubWord(Tokenizing)
    tokenized = tokenizer(
        words,
        truncation=True,
        padding = "max_length",
        max_length = 128,
        is_split_into_words=True
    )

    # Subword Match Origin Word Label
    label_ids = []
    word_ids = tokenized.word_ids(batch_index=0)

    previous_word_idx = None
    previous_label = "O"

    for word_idx in word_ids:
        # If token [CLS], [SEP], [PAD], ignore
        if word_idx is None:
            label_ids.append(-100)
        else:
            current_label = tags[word_idx]
            
            # If Subword seperate GENE, label -> (B-GENE, I-GENE,...)
            if word_idx != previous_word_idx:
                if current_label.startswith("B-") and previous_label == current_label:
                    i_label = "I-" + current_label.split("-")[1]
                    label_ids.append(label2id[i_label])
                else:
                    label_ids.append(label2id[current_label])
            else:
                label_ids.append(label2id[current_label])

            previous_word_idx = word_idx
            previous_label = tags[word_idx]

    tokenized["labels"] = label_ids

    return tokenized

### Convert dataset for put Trainer

In [15]:
def convert_to_hf_dataset(split_data):
    hf_format = []
    for words, tags in split_data:
        hf_format.append({"words": words, "tags": tags})
    return Dataset.from_list(hf_format)

hf_train = convert_to_hf_dataset(train_data)
hf_valid = convert_to_hf_dataset(valid_data)
hf_test = convert_to_hf_dataset(test_data)

# Subtokenizing(map for apply all sentence and label)
hf_train = hf_train.map(tokenize_and_align_labels)
hf_valid = hf_valid.map(tokenize_and_align_labels)
hf_test = hf_test.map(tokenize_and_align_labels)

# Convert Pytorch Tensor
hf_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
hf_valid.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
hf_test.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/189108 [00:00<?, ? examples/s]

Map:   0%|          | 0/24314 [00:00<?, ? examples/s]

Map:   0%|          | 0/56733 [00:00<?, ? examples/s]

In [16]:
training_args = TrainingArguments(
    output_dir="./bio_ner_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)



### Evaluation Metric

In [17]:
metric = load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    pred_labels = [[id2label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

    results = metric.compute(predictions=pred_labels, references=true_labels)
    return {
        "f1": results["overall_f1"],
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "accuracy": results["overall_accuracy"]
    }

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# trainer.train()

  trainer = Trainer(


### Test Data Inference

In [19]:
def evaluate_model(model, dataset, id2label):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
    model.eval()
    
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)  
            attention_mask = batch["attention_mask"].to(device)  
            labels = batch["labels"].to(device)  

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=2)

            # Transform True Label and Pred Label
            for i in range(len(labels)):
                true_seq = [id2label[l.item()] for l in labels[i] if l.item() != -100]
                pred_seq = [id2label[p.item()] for p, l in zip(preds[i], labels[i]) if l.item() != -100]

                true_labels.append(true_seq)
                predictions.append(pred_seq)

    # List of List to List
    flat_true_labels = list(chain(*true_labels))
    flat_predictions = list(chain(*predictions))

    print(classification_report(flat_true_labels, flat_predictions, zero_division=0))


In [20]:
# evaluate_model(model, hf_test, id2label)

### Save Model

In [21]:
# trainer.save_model("final_ner_model")
# tokenizer.save_pretrained("final_ner_model")

### Model Test

In [22]:
def predict_ner(sentence):
    tokenized_input = tokenizer(
        sentence,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    model.eval()
    with torch.no_grad():
        output = model(**tokenized_input)

    # Predict
    logits = output.logits
    predictions = torch.argmax(logits, dim=2).squeeze().tolist()

    # ID to Token
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"].squeeze().tolist())
    predicted_labels = [model.config.id2label[p] for p in predictions]

    return list(zip(tokens, predicted_labels))

In [23]:
saved_model_path = "final_ner_model"

In [24]:
tokenizer = AutoTokenizer.from_pretrained(saved_model_path)
model = AutoModelForTokenClassification.from_pretrained(saved_model_path)

model.to(device)

text = "The BRCA1 gene is associated with breast cancer."

predict = predict_ner(text)
print(predict)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[('[CLS]', 'B-GENE'), ('the', 'O'), ('br', 'B-GENE'), ('##ca', 'B-GENE'), ('##1', 'B-GENE'), ('gene', 'O'), ('is', 'O'), ('associated', 'O'), ('with', 'O'), ('breast', 'I-DISEASE'), ('cancer', 'O'), ('.', 'O'), ('[SEP]', 'B-GENE')]


### Extract NER Result

In [28]:
ner_result = predict
entities = []
current_entity = None
for token, label in ner_result:
    if label.startswith('B-'):
        if current_entity:
            entities.append(current_entity)
            current_entity = None
        current_entity = {'label': label[2:], 'text': token}
    elif label.startswith('I-') and current_entity and current_entity['label'] == label[2:]:
        current_entity['text'] += token.replace("##", "")
    else:
        if current_entity:
            entities.append(current_entity)
            current_entity = None

if current_entity:
    entities.append(current_entity)

print(entities)

[{'label': 'GENE', 'text': '[CLS]'}, {'label': 'GENE', 'text': 'br'}, {'label': 'GENE', 'text': '##ca'}, {'label': 'GENE', 'text': '##1'}, {'label': 'GENE', 'text': '[SEP]'}]


In [None]:
gene = [entity['text'] for entity in entities if entity['label'] == 'GENE']
disease = [entity['text'] for entity in entities if entity['label'] == 'DISEASE']

['[CLS]', 'br', '##ca', '##1', '[SEP]']