In [None]:
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [None]:
!pip install transformers tqdm pandas

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import pandas as pd

In [None]:
# Select the compute device: CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

In [None]:
# Load the SPECTER tokenizer and encoder from the same checkpoint,
# then move the encoder onto the selected device.
specter_checkpoint = "allenai/specter"
tokenizer = AutoTokenizer.from_pretrained(specter_checkpoint)
model = AutoModel.from_pretrained(specter_checkpoint)
model = model.to(device)

In [None]:
# Mean pooling over the tokens selected by the attention mask
def mean_pooling(model_output, attention_mask):
    """Average each sequence's token embeddings, weighted by the attention mask.

    Args:
        model_output: Object exposing ``last_hidden_state``, a tensor indexed as
            (batch, seq_len, hidden).
        attention_mask: Tensor indexed as (batch, seq_len); each value is used as
            the weight of the corresponding token (the tokenizer emits 1 for real
            tokens and 0 for padding).

    Returns:
        Tensor indexed as (batch, hidden) holding the masked mean per sequence.
        A row whose mask is entirely zero yields a zero vector instead of NaN.
    """
    token_embeddings = model_output.last_hidden_state  # (batch, seq_len, hidden)
    # Broadcast the mask across the hidden dimension so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(1)
    # Clamp the token count: without it a fully masked row computes 0 / 0 = NaN.
    token_counts = mask.sum(1).clamp(min=1e-9)
    return summed / token_counts

# Batched embedding function
def embed_texts_batched(texts, tokenizer, model, batch_size=32, max_len=512, pooling="mean"):
    """Embed ``texts`` in batches and return all embeddings as one CPU tensor.

    Args:
        texts: Iterable of strings to embed (list, tuple, pandas Series, ...).
        tokenizer: Callable tokenizer; invoked with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors="pt"``.
        model: Encoder whose output exposes ``last_hidden_state``.
        batch_size: Number of texts per forward pass; must be >= 1.
        max_len: Maximum token length passed to the tokenizer (longer inputs are truncated).
        pooling: ``"mean"`` (default, masked mean via ``mean_pooling``) or ``"cls"``
            (embedding of the first token of each sequence).
            NOTE(review): the allenai/specter model card example pools with the
            first token; confirm which pooling downstream consumers expect.

    Returns:
        Tensor with one row per input text, concatenated in input order on the CPU.
        Empty input returns a tensor with zero rows (width taken from
        ``model.config.hidden_size`` when available, else 0).

    Raises:
        ValueError: If ``pooling`` is not recognised or ``batch_size`` is < 1.
    """
    if pooling not in ("mean", "cls"):
        raise ValueError(f"pooling must be 'mean' or 'cls', got {pooling!r}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    # Materialise once so slicing yields plain lists regardless of the input container.
    texts = list(texts)
    if not texts:
        # torch.cat rejects an empty list, so return an explicit zero-row result.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size))

    # Send inputs to wherever the model's weights live instead of relying on a global.
    model_device = next(model.parameters()).device

    collected = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Embedding Progress"):
        batch_texts = texts[start:start + batch_size]
        encoded_input = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_len,
            return_tensors="pt"
        ).to(model_device)

        # Inference only: skip autograd bookkeeping for the forward pass and pooling.
        with torch.no_grad():
            model_output = model(**encoded_input)
            if pooling == "cls":
                batch_embeddings = model_output.last_hidden_state[:, 0, :]
            else:
                batch_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
        collected.append(batch_embeddings.cpu())  # gather results on the CPU
    return torch.cat(collected, dim=0)


In [None]:
# Combine title + abstract into one input text per paper

df = pd.read_csv("cs_data.csv")

# Keep only rows where both fields are present. dropna returns a new frame,
# so `df` is left untouched and `df_valid` can be extended safely later.
df_valid = df.dropna(subset=["title", "abstract"])

# Join with the tokenizer's separator token rather than a plain space: this is the
# input format shown in the allenai/specter model card (title + sep_token + abstract).
# astype(str) guards against a column that pandas parsed as a non-string dtype.
combined_texts = (
    df_valid["title"].astype(str)
    + tokenizer.sep_token
    + df_valid["abstract"].astype(str)
).tolist()

print("텍스트 개수:", len(combined_texts))

In [None]:
# Run the embedding pass over every combined text

embeddings = embed_texts_batched(
    texts=combined_texts,
    tokenizer=tokenizer,
    model=model,
    batch_size=32,
)

print("임베딩 shape:", embeddings.shape)  # (number of documents, 768)

In [None]:
# Persist the results
import numpy as np

# Convert once; the same array feeds both the .npy dump and the DataFrame column.
embedding_matrix = embeddings.numpy()
np.save("specter_embeddings.npy", embedding_matrix)

# Attach each vector to its row (stored as a Python list) and write the frame to CSV
df_valid["specter_embedding"] = embedding_matrix.tolist()
df_valid.to_csv("papers_with_specter_embeddings.csv", index=False)