<a href="https://colab.research.google.com/github/ekdls02/ekdls2025/blob/main/MAI_%EB%8D%B0%EC%9D%B4%EC%BD%98_%EA%B3%B5%EB%AA%A8%EC%A0%84(0_5).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 0. Mount Google Drive (Colab only)
from google.colab import drive
import os
# Mounts the Drive at /content/drive.
# NOTE(review): `os` is not used in any cell visible here, and the data cell below
# reads from /content rather than from the mounted Drive -- confirm both are intended.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 1. 라이브러리 import & 시드 설정
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM

def set_seed(seed=42):
    """Seed every RNG this notebook relies on and pin the cuDNN flags.

    Args:
        seed: Integer seed handed to Python's ``random``, NumPy's legacy global
            RNG, and PyTorch (CPU generator and all CUDA devices).
    """
    # Same seed for each library, applied in a fixed order.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Request deterministic cuDNN kernels and switch off the autotuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

set_seed(42)

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Device: {DEVICE}")

Device: cuda


In [None]:
# 2. Input file locations
test_file = '/content/test.csv'
submission_file = '/content/sample_submission.csv'

# Read the test sequences first, then the submission template.
df_test, sample_submission = (
    pd.read_csv(csv_path) for csv_path in (test_file, submission_file)
)

# Quick sanity summary: row count and the longest sequence (in characters).
n_rows = len(df_test)
longest_seq = df_test['seq'].str.len().max()
print(f"Rows: {n_rows:,}, Max seq length: {longest_seq}")
print("Sample submission columns:", sample_submission.columns)


# 3. Load the pretrained tokenizer and masked-LM model
# NOTE(review): trust_remote_code=True executes Python files fetched from the Hub
# repository; consider pinning a `revision=` so that code cannot change between runs.
MODEL_ID = "InstaDeepAI/nucleotide-transformer-v2-500m-multi-species"
hub_options = dict(trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **hub_options)
model = AutoModelForMaskedLM.from_pretrained(MODEL_ID, **hub_options)
# Move the weights to DEVICE and switch to eval mode. Kept as the final expression
# of the cell so the module summary is still displayed.
model.to(DEVICE).eval()

Rows: 13,711, Max seq length: 1024
Sample submission columns: Index(['ID', 'emb_0000', 'emb_0001', 'emb_0002', 'emb_0003', 'emb_0004',
       'emb_0005', 'emb_0006', 'emb_0007', 'emb_0008',
       ...
       'emb_0758', 'emb_0759', 'emb_0760', 'emb_0761', 'emb_0762', 'emb_0763',
       'emb_0764', 'emb_0765', 'emb_0766', 'emb_0767'],
      dtype='object', length=769)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/101 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

esm_config.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- esm_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_esm.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/InstaDeepAI/nucleotide-transformer-v2-500m-multi-species:
- modeling_esm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.99G [00:00<?, ?B/s]

EsmForMaskedLM(
  (esm): EsmModel(
    (embeddings): EsmEmbeddings(
      (word_embeddings): Embedding(4107, 1024, padding_idx=1)
      (dropout): Dropout(p=0.0, inplace=False)
      (position_embeddings): Embedding(2050, 1024, padding_idx=1)
    )
    (encoder): EsmEncoder(
      (layer): ModuleList(
        (0-28): 29 x EsmLayer(
          (attention): EsmAttention(
            (self): EsmSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
              (rotary_embeddings): RotaryEmbedding()
            )
            (output): EsmSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (LayerNorm): LayerNorm((1024,), eps=1e-12

In [None]:
# 4. k-mer augmentation
def kmer_augment(seq, k=3, p=0.05):
    """Randomly overwrite k-length windows of ``seq`` with random bases.

    Every start position whose window of ``k`` characters fits inside the
    sequence is visited left to right; with probability ``p`` all ``k``
    characters of that window are replaced by independent draws from
    A/C/G/T. Windows overlap, so a position may be rewritten more than once.

    Args:
        seq: Input sequence string.
        k: Window length.
        p: Per-position probability of rewriting the window starting there.

    Returns:
        A new string of the same length as ``seq``.
    """
    alphabet = ['A', 'C', 'G', 'T']
    chars = list(seq)
    n_windows = len(chars) - k + 1
    for start in range(n_windows):
        if random.random() < p:
            # Draw the k replacement bases in order and splice them in.
            chars[start:start + k] = [random.choice(alphabet) for _ in range(k)]
    return ''.join(chars)

In [None]:
# 5. Dataset / DataLoader
class SeqDataset(Dataset):
    """Map-style dataset exposing the ``ID`` and ``seq`` columns of a DataFrame.

    Args:
        df: Frame with an ``ID`` column and a ``seq`` column.
        augment: Stored on the instance as ``self.augment``. It is not read by
            ``__getitem__``, which always returns the sequence unmodified.
    """

    def __init__(self, df, augment=True):
        # Materialise both columns as plain Python lists, ID first.
        self.ids, self.seqs = (df[column].tolist() for column in ('ID', 'seq'))
        self.augment = augment

    def __len__(self):
        """Number of rows taken from the source frame."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return ``{'ID': ..., 'seq': ...}`` for row ``idx``."""
        return dict(ID=self.ids[idx], seq=self.seqs[idx])

def collate_fn(batch):
    """Identity collate: return the batch exactly as it was received."""
    return batch


BATCH_SIZE = 8  # tune to the available GPU memory
dataset = SeqDataset(df_test, augment=True)

# shuffle=False keeps the batches in dataset order.
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [None]:
# 6. Embedding function (batched chunks + k-mer augmentation + weighted pooling)
def get_full_trick_embedding_batch_fast(seqs, augment_times=1, window=512, stride=256):
    """Embed a batch of sequences with one tokenizer call and one model call.

    Each sequence is passed through ``kmer_augment`` ``augment_times`` times.
    Every augmented copy is cut into character windows of at most ``window``
    characters, one starting every ``stride`` characters. All windows of the
    batch are tokenized and run through the model together. A chunk embedding
    is a weighted mix of the first, middle and last attended positions of a
    weighted sum of the last four hidden states, L2-normalised. A sequence
    embedding is the mean of its chunk embeddings.

    Args:
        seqs: Iterable of sequence strings; must be non-empty and must not
            contain empty strings.
        augment_times: Number of augmented copies per sequence (>= 1).
        window: Maximum chunk length in characters; also passed to the
            tokenizer as ``max_length`` (>= 1).
        stride: Distance in characters between chunk starts (>= 1).

    Returns:
        Tensor with one row per input sequence, in input order.

    Raises:
        ValueError: If an argument would produce zero chunks for any sequence.
            Previously this surfaced as a tokenizer failure on an empty list
            or as a silent all-NaN row from averaging zero chunks.
    """
    # Validate before doing any work so bad input fails fast and clearly.
    seqs = list(seqs)
    if not seqs:
        raise ValueError("seqs must contain at least one sequence")
    if augment_times < 1:
        raise ValueError(f"augment_times must be >= 1, got {augment_times}")
    if window < 1 or stride < 1:
        raise ValueError(
            f"window and stride must be >= 1, got window={window}, stride={stride}"
        )
    for seq_idx, seq in enumerate(seqs):
        if len(seq) == 0:
            raise ValueError(f"seqs[{seq_idx}] is empty; cannot build any chunk from it")

    # 1) Augment and chunk. Chunks of one sequence are appended contiguously,
    #    so a (first, one_past_last) span per sequence is enough to map back.
    all_chunks = []
    chunk_spans = []
    for seq in seqs:
        first = len(all_chunks)
        for _ in range(augment_times):
            seq_aug = kmer_augment(seq)
            # Slicing clamps at the end of the string, so the tail windows are
            # simply shorter than `window`.
            all_chunks.extend(
                seq_aug[start:start + window]
                for start in range(0, len(seq_aug), stride)
            )
        chunk_spans.append((first, len(all_chunks)))

    # 2) Tokenize every chunk of the batch in a single call.
    enc = tokenizer(
        all_chunks,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=window
    )
    input_ids = enc['input_ids'].to(DEVICE)
    attn_mask = enc['attention_mask'].to(DEVICE)

    # 3) Single forward pass; blend the last four hidden states.
    with torch.no_grad():
        out = model(input_ids, attention_mask=attn_mask, output_hidden_states=True)
        last_layers = out.hidden_states[-4:]
        # Weights follow the order of hidden_states[-4:], so 0.4 multiplies the
        # fourth-from-last layer and 0.1 the final layer.
        # NOTE(review): confirm this ordering is the intended one.
        layer_weights = torch.tensor([0.4,0.3,0.2,0.1], device=DEVICE).view(4,1,1,1)
        last_hidden = (torch.stack(last_layers) * layer_weights).sum(0)

    # 4) Per-chunk embedding from three positions of each row.
    #    Build the row index on the same device as the other index tensors.
    rows = torch.arange(last_hidden.size(0), device=last_hidden.device)
    lengths = attn_mask.sum(dim=1)
    cls_emb = last_hidden[:, 0, :]
    center_emb = last_hidden[rows, lengths // 2]
    # assumes padding is on the right, so lengths - 1 is the last attended
    # position -- TODO confirm against the tokenizer's padding side
    end_emb = last_hidden[rows, lengths - 1]
    chunk_emb = 0.5*cls_emb + 0.3*center_emb + 0.2*end_emb
    chunk_emb = torch.nn.functional.normalize(chunk_emb, dim=-1)

    # 5) Mean over each sequence's contiguous span of chunks.
    seq_embs = [chunk_emb[first:last].mean(0) for first, last in chunk_spans]
    return torch.stack(seq_embs)


In [None]:
# 7. Embed the whole test set batch by batch, keeping IDs aligned with rows.
all_ids, all_embs = [], []

for batch in loader:
    # Each batch is a list of {'ID', 'seq'} dicts (identity collate).
    ids, seqs = [], []
    for record in batch:
        ids.append(record['ID'])
        seqs.append(record['seq'])

    with torch.no_grad():
        emb_batch = get_full_trick_embedding_batch_fast(seqs, augment_times=1)

    all_embs.append(emb_batch)
    all_ids.extend(ids)

# Stack the per-batch matrices along the row axis.
emb_tensor = torch.cat(all_embs, dim=0)
print(f"Final embedding shape: {emb_tensor.shape}")


Final embedding shape: torch.Size([13711, 1024])


In [None]:
# 8. Build the submission (first 768 dimensions, then L2-normalise)
emb_tensor = torch.nn.functional.normalize(emb_tensor[:, :768], dim=-1)
emb_columns = [f'emb_{i:04d}' for i in range(768)]
emb_df = pd.DataFrame(emb_tensor.cpu().numpy(), columns=emb_columns)
id_column = pd.Series(all_ids, name='ID')
submission = pd.concat([id_column, emb_df], axis=1)

# 1) Save into the Colab session's working directory
submission_path = 'full_trick_submission.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission saved at {submission_path}")

Submission saved at full_trick_submission.csv
