## 유전자 변이 해석 및 유전자-질병 관계 분석


### Settings

#### Library Imports

In [None]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

#### Set Seed

In [2]:
def set_seed(seed=42):
    """Seed the torch (CPU and CUDA) and NumPy random number generators.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    # Seeders are applied in a fixed order: torch CPU, torch CUDA, NumPy.
    for apply_seed in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed):
        apply_seed(seed)

#### Set GPU

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Genetic Mutation Information Extraction

#### Model Inference Test

In [None]:
# BioBERT NER (Named Entity Recognition) smoke test
#  - Tags each word-piece token of a sample sentence with a BIO label
#    (GENE, VARIANT, DISEASE, MUTATION).
#  - NOTE: only the BioBERT encoder is pretrained; the token-classification
#    head is freshly initialized, so the printed labels are arbitrary until
#    the model is fine-tuned on an NER dataset. This cell only verifies that
#    the tokenize -> forward -> decode pipeline runs end to end.

# Base BioBERT checkpoint
model_name = "dmis-lab/biobert-base-cased-v1.1"

# BIO tag scheme. Defined before the model so the classifier size is derived
# from this table instead of a separately hard-coded number.
id2label = {
    0: "O",           # Outside any entity
    1: "B-GENE",      # Beginning of a gene entity
    2: "I-GENE",      # Inside a gene entity
    3: "B-VARIANT",   # Beginning of a variant entity
    4: "I-VARIANT",   # Inside a variant entity
    5: "B-DISEASE",   # Beginning of a disease entity
    6: "I-DISEASE",   # Inside a disease entity
    7: "B-MUTATION",  # Beginning of a mutation entity
    8: "I-MUTATION",  # Inside a mutation entity
}
label2id = {label: idx for idx, label in id2label.items()}

# Load the tokenizer and the model; the label maps are stored in the model
# config so the head size and the decoding table cannot drift apart.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
).to(device)
model.eval()

# Sample text
text = "The BRCA1 gene mutation increases the risk of breast cancer."

# Tokenize the text into vocabulary ids plus an attention mask (PyTorch tensors)
tokens = tokenizer(text, return_tensors="pt")
input_ids = tokens['input_ids'].to(device)
attention_mask = tokens['attention_mask'].to(device)

# Forward pass without gradient tracking; pick the highest-scoring label per token
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=2)

# Map ids back to word-piece tokens and predicted label ids back to label names
tokens_list = tokenizer.convert_ids_to_tokens(input_ids[0])
predicted_labels = [id2label[p.item()] for p in predictions[0]]

# Result (token-label pairs, including the [CLS]/[SEP] special tokens)
for token, label in zip(tokens_list, predicted_labels):
    print(f"{token}: {label}")

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'input_ids': tensor([[  101,  1103,  9304,  2599,  1475,  5565, 17895,  6986,  1103,  3187,
          1104,  7209,  4182,   119,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
[CLS]: B-GENE
the: B-DISEASE
br: B-VARIANT
##ca: B-VARIANT
##1: O
gene: B-DISEASE
mutation: I-GENE
increases: B-DISEASE
the: B-GENE
risk: B-DISEASE
of: B-GENE
breast: B-GENE
cancer: B-DISEASE
.: B-DISEASE
[SEP]: I-MUTATION


#### Dataset Loader

In [None]:
class VariantDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``; its result
            is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and a long ``labels`` tensor."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(sample_text, **tokenizer_options)

        # Flatten each tokenizer output to 1-D before returning it.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return item

#### Model Train

In [9]:
import torch, pandas as pd, numpy as np, re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# 시드 및 장치 설정
def set_seed(seed=42):
    """Seed the NumPy and torch random number generators for reproducible runs.

    The CUDA generators are seeded only when CUDA is available.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    np.random.seed(seed)
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
# Seed the RNGs with the default seed, then choose the compute device
# (GPU when CUDA is available, CPU otherwise) and report it.
set_seed()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"사용 장치: {device}")

# 데이터셋 클래스
class VariantDataset(Dataset):
    """Map-style dataset that tokenizes one text per access and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``; its result
            is indexed by ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``/``attention_mask`` and a long ``labels`` tensor."""
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_options)

        # Flatten each tokenizer output to 1-D before returning it.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# 모델, 데이터 로드, 학습/평가, 추론 클래스
class VariantInterpreter:
    """Fine-tune and query a BioBERT sequence classifier over ClinVar variant records.

    The classifier has five output classes (see ``id2label``). ``train`` pulls
    a sample from the ClinVar variant summary table, fine-tunes the model,
    and prints an evaluation on a held-out split; ``predict`` classifies one
    text. The module-level ``device`` decides where tensors and the model live.
    """

    # Default data source for ``load_data``.
    # NOTE(review): ClinVar appears to publish this table gzip-compressed
    # (``variant_summary.txt.gz``) - confirm this uncompressed URL resolves,
    # or pass ``url=`` explicitly to ``load_data``.
    CLINVAR_URL = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt"

    def __init__(self, model_name="dmis-lab/biobert-base-cased-v1.1"):
        """Load the tokenizer and a 5-class sequence-classification model.

        Args:
            model_name: Hugging Face checkpoint name or path.
        """
        self.id2label = {0: "Pathogenic", 1: "Benign", 2: "Likely_Pathogenic",
                         3: "Likely_Benign", 4: "VUS"}
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=len(self.id2label))
        self.model.to(device)

    def map_label(self, cs):
        """Map a clinical-significance string to a class id (key of ``id2label``).

        Matching is case-insensitive. Any value containing "likely" is only
        eligible for the two "likely" classes; anything unmatched is class 4.

        Args:
            cs: Clinical-significance value; converted with ``str`` first.

        Returns:
            int: 0 Pathogenic, 1 Benign, 2 Likely_Pathogenic, 3 Likely_Benign,
            4 everything else.
        """
        cs = str(cs).lower()
        if 'likely' not in cs:
            # Whole-word match: a plain substring test would also hit
            # "pathogenicity" (e.g. "Conflicting interpretations of
            # pathogenicity") and mislabel those records as Pathogenic.
            if re.search(r'\bpathogenic\b', cs):
                return 0
            if 'benign' in cs:
                return 1
        if 'likely pathogenic' in cs:
            return 2
        if 'likely benign' in cs:
            return 3
        return 4

    def load_data(self, sample_size=200, url=None):
        """Read the variant table, sample rows, and split them into train/test sets.

        Args:
            sample_size: Maximum number of rows to sample; clamped to the
                number of rows that have all required columns.
            url: Path or URL of the tab-separated table. Defaults to
                ``CLINVAR_URL``; pass a local file to work offline.

        Returns:
            list: ``[train_texts, test_texts, train_labels, test_labels]`` as
            produced by ``train_test_split`` with an 80/20 split.
        """
        source = self.CLINVAR_URL if url is None else url
        cols = ['VariationID', 'GeneSymbol', 'ClinicalSignificance']
        # Only the three needed columns are parsed; the full table is very wide.
        df = pd.read_csv(source, sep="\t", usecols=cols, low_memory=False)
        df = df.dropna(subset=cols)
        # Clamp so a small table does not make ``sample`` raise.
        df = df.sample(n=min(sample_size, len(df)), random_state=42)
        # NOTE(review): the input text embeds the ClinicalSignificance value
        # that the label is derived from, so the model can read the answer
        # directly (label leakage). Kept as-is because ``main`` builds its
        # query in the same format; revisit before trusting the metrics.
        df['text'] = df.apply(
            lambda r: f"Variant: {r['VariationID']} Gene: {r['GeneSymbol']} ClinicalSignificance: {r['ClinicalSignificance']}",
            axis=1)
        df['label'] = df['ClinicalSignificance'].apply(self.map_label)
        return train_test_split(df['text'].tolist(), df['label'].tolist(),
                                test_size=0.2, random_state=42)

    def train(self, batch_size=8, epochs=3, lr=2e-5):
        """Fine-tune the model on freshly loaded data, then evaluate on the held-out split.

        Prints the mean training loss per epoch and, at the end, the report
        produced by ``evaluate``.

        Args:
            batch_size: Training batch size.
            epochs: Number of passes over the training set.
            lr: AdamW learning rate.
        """
        train_texts, test_texts, train_labels, test_labels = self.load_data()
        train_ds = VariantDataset(train_texts, train_labels, self.tokenizer)
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        opt = torch.optim.AdamW(self.model.parameters(), lr=lr)
        for epoch in range(epochs):
            self.model.train()
            total_loss = 0
            for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
                opt.zero_grad()
                input_ids = batch['input_ids'].to(device)
                att_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                # Passing labels makes the model return the loss itself.
                out = self.model(input_ids=input_ids, attention_mask=att_mask, labels=labels)
                loss = out.loss
                total_loss += loss.item()
                loss.backward()
                opt.step()
            print(f"Epoch {epoch+1} 손실: {total_loss/len(train_loader):.4f}")
        self.evaluate(test_texts, test_labels)

    def evaluate(self, texts, labels, batch_size=8):
        """Print a per-class classification report and the accuracy for ``texts``.

        Args:
            texts: Input strings.
            labels: Class ids aligned with ``texts``.
            batch_size: Evaluation batch size.
        """
        ds = VariantDataset(texts, labels, self.tokenizer)
        loader = DataLoader(ds, batch_size=batch_size)
        self.model.eval()
        true, pred = [], []
        with torch.no_grad():
            for batch in tqdm(loader, desc="Evaluating"):
                input_ids = batch['input_ids'].to(device)
                att_mask = batch['attention_mask'].to(device)
                out = self.model(input_ids=input_ids, attention_mask=att_mask)
                pred.extend(torch.argmax(out.logits, dim=1).cpu().numpy())
                true.extend(batch['labels'].cpu().numpy())
        # ``labels=`` pins all five classes so the report keeps a row for
        # classes that do not occur in this split.
        print(classification_report(true, pred, labels=[0, 1, 2, 3, 4],
              target_names=[self.id2label[i] for i in range(5)]))
        print("정확도:", accuracy_score(true, pred))

    def predict(self, text):
        """Classify one text.

        Args:
            text: Input string, tokenized with padding/truncation to 128 tokens.

        Returns:
            dict: ``{"classification": <label name>, "confidence": <softmax
            probability of the predicted class>}``.
        """
        self.model.eval()
        enc = self.tokenizer(text, add_special_tokens=True, max_length=128,
                             padding='max_length', truncation=True, return_tensors='pt')
        with torch.no_grad():
            out = self.model(input_ids=enc['input_ids'].to(device),
                             attention_mask=enc['attention_mask'].to(device))
            pred = torch.argmax(out.logits, dim=1).item()
            conf = torch.softmax(out.logits, dim=1)[0][pred].item()
        return {"classification": self.id2label[pred], "confidence": conf}

def main():
    """Train a VariantInterpreter, then classify one sample record and print the result."""
    sample_query = "Variant: 123456 Gene: BRCA1 ClinicalSignificance: Pathogenic"

    classifier = VariantInterpreter()
    classifier.train()

    outcome = classifier.predict(sample_query)
    print("\n입력:", sample_query)
    print("분류:", outcome["classification"], f"(신뢰도: {outcome['confidence']:.2f})")


# Run the demo only when executed as a script / notebook cell, not on import.
if __name__ == "__main__":
    main()

사용 장치: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


URLError: <urlopen error [Errno 11002] getaddrinfo failed>