## 유전자 변이 해석
Get Data "wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz" to CMD

### Settings

#### Model Load

In [30]:
import torch
import pandas as pd
from datasets import Dataset
import numpy as np
from evaluate import load
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForTokenClassification
from itertools import chain


#### Set GPU

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### EDA

In [3]:
file_path = "./variant_summary.csv"
df = pd.read_csv(file_path, sep="\t", low_memory=False)

df.dropna(inplace=True)

print(df.head())

   #AlleleID                       Type  \
0      15041                      Indel   
1      15041                      Indel   
2      15042                   Deletion   
3      15042                   Deletion   
4      15043  single nucleotide variant   

                                                Name  GeneID GeneSymbol  \
0  NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...    9907      AP5Z1   
1  NM_014855.3(AP5Z1):c.80_83delinsTGCTGTAAACTGTA...    9907      AP5Z1   
2     NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs)    9907      AP5Z1   
3     NM_014855.3(AP5Z1):c.1413_1426del (p.Leu473fs)    9907      AP5Z1   
4       NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg)    9640     ZNF592   

      HGNC_ID    ClinicalSignificance  ClinSigSimple LastEvaluated  \
0  HGNC:22197              Pathogenic              1  Jun 25, 2024   
1  HGNC:22197              Pathogenic              1  Jun 25, 2024   
2  HGNC:22197              Pathogenic              1  Jun 29, 2010   
3  HGNC:22

### Make Train Sentence

In [4]:
# Train Data Generate
sentences = []

# Filter
Filter = ["not specified", "not provided", "unknown"]

# Template
templates = [
    "Mutations in the {gene} gene can cause {disease}.",
    "Variations in the {gene} gene are associated with {disease}.",
    "Abnormalities in the {gene} gene may contribute to {disease}.",
    "Specific mutations in the {gene} gene can increase the risk of {disease}.",
    "Certain variants of the {gene} gene are frequently observed in patients with {disease}."
    ]

unique_gene_disease = set()
for _, row in df.iterrows():
    gene = row["GeneSymbol"]
    disease = row["PhenotypeList"].split("|")[0]  # "Hereditary spastic paraplegia 48|not provided" -> "Hereditary spastic paraplegia 48"

    if disease.lower() not in Filter:
        unique_gene_disease.add((gene, disease))

for gene, disease in unique_gene_disease:
    for template in templates:
        sentence = template.format(gene = gene, disease = disease)
        sentences.append((sentence, gene, disease))

for sentence in sentences[:5]:
    print(sentence)

('Mutations in the BTBD3 gene can cause See cases.', 'BTBD3', 'See cases')
('Variations in the BTBD3 gene are associated with See cases.', 'BTBD3', 'See cases')
('Abnormalities in the BTBD3 gene may contribute to See cases.', 'BTBD3', 'See cases')
('Specific mutations in the BTBD3 gene can increase the risk of See cases.', 'BTBD3', 'See cases')
('Certain variants of the BTBD3 gene are frequently observed in patients with See cases.', 'BTBD3', 'See cases')


### Token Tagging

In [5]:
def convert_to_bio(sentence, gene, disease):
    words = sentence.split()
    labels = ["O"] * len(words)

    # Gene Tagging
    for i, word in enumerate(words):
        if word == gene:
            labels[i] = "B-GENE"
    
    # Disease Tagging
    disease_words = disease.split()
    for i, word in enumerate(words):
        if word in disease_words:
            labels[i] = "B-DISEASE" if i == 0 else "I-DISEASE"
    
    return list(zip(words, labels))

In [6]:
bio_sentences = [convert_to_bio(sentence, gene, disease) for sentence, gene, disease in sentences]

for bio in bio_sentences[:5]:
    print(bio)

[('Mutations', 'O'), ('in', 'O'), ('the', 'O'), ('BTBD3', 'B-GENE'), ('gene', 'O'), ('can', 'O'), ('cause', 'O'), ('See', 'I-DISEASE'), ('cases.', 'O')]
[('Variations', 'O'), ('in', 'O'), ('the', 'O'), ('BTBD3', 'B-GENE'), ('gene', 'O'), ('are', 'O'), ('associated', 'O'), ('with', 'O'), ('See', 'I-DISEASE'), ('cases.', 'O')]
[('Abnormalities', 'O'), ('in', 'O'), ('the', 'O'), ('BTBD3', 'B-GENE'), ('gene', 'O'), ('may', 'O'), ('contribute', 'O'), ('to', 'O'), ('See', 'I-DISEASE'), ('cases.', 'O')]
[('Specific', 'O'), ('mutations', 'O'), ('in', 'O'), ('the', 'O'), ('BTBD3', 'B-GENE'), ('gene', 'O'), ('can', 'O'), ('increase', 'O'), ('the', 'O'), ('risk', 'O'), ('of', 'O'), ('See', 'I-DISEASE'), ('cases.', 'O')]
[('Certain', 'O'), ('variants', 'O'), ('of', 'O'), ('the', 'O'), ('BTBD3', 'B-GENE'), ('gene', 'O'), ('are', 'O'), ('frequently', 'O'), ('observed', 'O'), ('in', 'O'), ('patients', 'O'), ('with', 'O'), ('See', 'I-DISEASE'), ('cases.', 'O')]


In [7]:
# NER Train Data Save
with open("ner_dataset.txt", "w", encoding="utf-8") as f:
    for bio in bio_sentences:
        for word, tag in bio:
            f.write(f"{word} {tag}\n")
        f.write("\n")

In [8]:
'''
"O" : Nothing
"B-GENE" : Gene Token(Begin)
"I-GENE" : Gene Token(Inside)
"B-DISEASE" : Disease Token(Begin)
"I-DISEASE" : Disease Token(Inside)
'''

label_list = ["O", "B-GENE", "I-GENE", "B-DISEASE", "I-DISEASE"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

### Load Train Data

In [9]:
# Sentence combine
def load_ner_data(file_path):
    # Restore Sentence
    sentences = []
    # Words in Sentence
    words = []
    # Tag in Sentence
    tags = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # config ""(Finish Sentence)
            if not line:
                if words:
                    sentences.append((words, tags))
                    words = []
                    tags = []
            else:
                token, label = line.split()
                words.append(token)
                tags.append(label)
        # If Last Sentence not Finish with "" Preprocess
        if words:
            sentences.append((words, tags))
    return sentences

In [10]:
file_path = "./ner_dataset.txt"
dataset = load_ner_data(file_path)

### Train, Valid, Test Split

In [11]:
# Split train : valid : test = 7 : 1 : 2
train_data, test_data = train_test_split(dataset, test_size=0.3, random_state=42)
valid_data, test_data = train_test_split(test_data, test_size=0.7, random_state=42)

print(f"train : valid :  test : {len(train_data)} : {len(valid_data)} : {len(test_data)}")

train : valid :  test : 189108 : 24314 : 56733


In [12]:
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Convert HuggingFace Dataset

In [13]:
def tokenize_and_align_labels(examples):
    words = examples['words']
    tags = examples['tags']
    
    # Word to SubWord(Tokenizing)
    tokenized = tokenizer(
        words,
        truncation=True,
        padding = "max_length",
        max_length = 128,
        is_split_into_words=True
    )

    # Subword Match Origin Word Label
    label_ids = []
    word_ids = tokenized.word_ids(batch_index=0)

    previous_word_idx = None
    previous_label = "O"

    for word_idx in word_ids:
        # If token [CLS], [SEP], [PAD], ignore
        if word_idx is None:
            label_ids.append(-100)
        else:
            current_label = tags[word_idx]
            
            # If Subword seperate GENE, label -> (B-GENE, I-GENE,...)
            if word_idx != previous_word_idx:
                if current_label.startswith("B-") and previous_label == current_label:
                    i_label = "I-" + current_label.split("-")[1]
                    label_ids.append(label2id[i_label])
                else:
                    label_ids.append(label2id[current_label])
            else:
                label_ids.append(label2id[current_label])

            previous_word_idx = word_idx
            previous_label = tags[word_idx]

    tokenized["labels"] = label_ids

    return tokenized

### Convert dataset for put Trainer

In [14]:
def convert_to_hf_dataset(split_data):
    hf_format = []
    for words, tags in split_data:
        hf_format.append({"words": words, "tags": tags})
    return Dataset.from_list(hf_format)

hf_train = convert_to_hf_dataset(train_data)
hf_valid = convert_to_hf_dataset(valid_data)
hf_test = convert_to_hf_dataset(test_data)

# Subtokenizing(map for apply all sentence and label)
hf_train = hf_train.map(tokenize_and_align_labels)
hf_valid = hf_valid.map(tokenize_and_align_labels)
hf_test = hf_test.map(tokenize_and_align_labels)

# Convert Pytorch Tensor
hf_train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
hf_valid.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
hf_test.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/189108 [00:00<?, ? examples/s]

Map:   0%|          | 0/24314 [00:00<?, ? examples/s]

Map:   0%|          | 0/56733 [00:00<?, ? examples/s]

In [15]:
training_args = TrainingArguments(
    output_dir="./bio_ner_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)



### Evaluation Metric

In [16]:
metric = load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
    pred_labels = [[id2label[p] for p, l in zip(pred, label) if l != -100] for pred, label in zip(predictions, labels)]

    results = metric.compute(predictions=pred_labels, references=true_labels)
    return {
        "f1": results["overall_f1"],
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "accuracy": results["overall_accuracy"]
    }

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,Accuracy
1,0.0,0.001056,0.999667,0.999681,0.999652,0.999859
2,0.0058,0.000717,0.999696,0.999764,0.999628,0.999872
3,0.0,0.000311,0.999879,0.9999,0.999858,0.999953


TrainOutput(global_step=70917, training_loss=0.002171633182793003, metrics={'train_runtime': 9237.3021, 'train_samples_per_second': 61.417, 'train_steps_per_second': 7.677, 'total_flos': 3.706098547567104e+16, 'train_loss': 0.002171633182793003, 'epoch': 3.0})

### Test Data Inference

In [None]:
def evaluate_model(model, dataset, id2label):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)
    model.eval()
    
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)  
            attention_mask = batch["attention_mask"].to(device)  
            labels = batch["labels"].to(device)  

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=2)

            # Transform True Label and Pred Label
            for i in range(len(labels)):
                true_seq = [id2label[l.item()] for l in labels[i] if l.item() != -100]
                pred_seq = [id2label[p.item()] for p, l in zip(preds[i], labels[i]) if l.item() != -100]

                true_labels.append(true_seq)
                predictions.append(pred_seq)

    # List of List to List
    flat_true_labels = list(chain(*true_labels))
    flat_predictions = list(chain(*predictions))

    print(classification_report(flat_true_labels, flat_predictions, zero_division=0))


In [31]:
evaluate_model(model, hf_test, id2label)

              precision    recall  f1-score   support

   B-DISEASE       0.00      0.00      0.00         2
      B-GENE       1.00      1.00      1.00    336423
   I-DISEASE       1.00      1.00      1.00    385285
           O       1.00      1.00      1.00    727733

    accuracy                           1.00   1449443
   macro avg       0.75      0.75      0.75   1449443
weighted avg       1.00      1.00      1.00   1449443



### Save Model

In [32]:
trainer.save_model("final_ner_model")
tokenizer.save_pretrained("final_ner_model")

('final_ner_model\\tokenizer_config.json',
 'final_ner_model\\special_tokens_map.json',
 'final_ner_model\\vocab.txt',
 'final_ner_model\\added_tokens.json',
 'final_ner_model\\tokenizer.json')

### Model Test

In [None]:
def predict_ner(sentence):
    tokenized_input = tokenizer(
        sentence,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    model.eval()
    with torch.no_grad():
        output = model(**tokenized_input)

    # Predict
    logits = output.logits
    predictions = torch.argmax(logits, dim=2).squeeze().tolist()

    # ID to Token
    tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"].squeeze().tolist())
    predicted_labels = [model.config.id2label[p] for p in predictions]

    return list(zip(tokens, predicted_labels))

In [34]:
saved_model_path = "final_ner_model"

In [37]:
tokenizer = AutoTokenizer.from_pretrained(saved_model_path)
model = AutoModelForTokenClassification.from_pretrained(saved_model_path)

model.to(device)

text = "The BRCA1 gene is associated with breast cancer."

predict = predict_ner(text)
print(predict)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[('[CLS]', 'B-GENE'), ('the', 'O'), ('br', 'B-GENE'), ('##ca', 'B-GENE'), ('##1', 'B-GENE'), ('gene', 'O'), ('is', 'O'), ('associated', 'O'), ('with', 'O'), ('breast', 'I-DISEASE'), ('cancer', 'O'), ('.', 'O'), ('[SEP]', 'B-GENE')]
