# scibert_scivocab_uncased

In [18]:
from transformers import AutoModel, AutoTokenizer
import torch

# Load the SciBERT checkpoint and its matching tokenizer by name.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name)

# Mean pooling over token embeddings, ignoring masked-out positions.
def mean_pooling(model_output, attention_mask):
    """Average the last hidden state along dim 1, weighted by the mask.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor.
        attention_mask: Tensor that, after gaining a trailing axis, is
            expanded to the size of ``last_hidden_state``; positions with
            value 0 contribute nothing to the average.

    Returns:
        The masked sum over dim 1 divided by the mask total over dim 1.
        The divisor is clamped to at least 1e-9, so a fully masked row
        yields zeros instead of a division by zero.
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask across the hidden dimension so it can weight each
    # embedding component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / weight_total

# Turn a piece of text into a single SciBERT embedding.
def get_embedding(text):
    """Tokenize ``text``, run the model, and mean-pool the result.

    Args:
        text: Input passed straight to the module-level ``tokenizer``.
            Inputs are truncated to at most 512 tokens.

    Returns:
        Whatever ``mean_pooling`` returns for the model output and the
        tokenizer's attention mask.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        model_output = model(**encoded)
    mask = encoded["attention_mask"]
    return mean_pooling(model_output, mask)

# Titles of the main paper and of the papers it cites. They are defined
# here so this cell runs on its own: the DataFrame below needs `citations`,
# which was previously only defined in later cells.
main_paper = "Network Structure and Knowledge Transfer: The Effects of Cohesion and Range"
citations = [
    "Collaboration networks, structural holes, and innovation: A longitudinal study",
    "Who shall govern? CEO/board power, demographic similarity, and new director selection",
    "Analysis of a local-area wireless network",
]

# Compute one embedding for the main paper and one per cited title.
main_embedding = get_embedding(main_paper)
citation_embeddings = [get_embedding(title) for title in citations]


# Cosine similarity between two embeddings.
def cos_sim(a, b):
    """Return the cosine similarity of tensors ``a`` and ``b`` as a float.

    ``.item()`` requires the result to hold exactly one value, so each
    argument is expected to describe a single embedding.
    """
    return torch.nn.functional.cosine_similarity(a, b).item()


similarities = [cos_sim(main_embedding, emb) for emb in citation_embeddings]

# Print the results, most similar citation first.
import pandas as pd
df = pd.DataFrame({"Citation_Title": citations, "Similarity_Score": similarities})
df = df.sort_values(by="Similarity_Score", ascending=False)
print(df)


                                      Citation_Title  Similarity_Score
0  Collaboration networks, structural holes, and ...          0.846060
1  Who shall govern? CEO/board power, demographic...          0.719732
2          Analysis of a local-area wireless network          0.695151


# sentence-transformers/all-distilroberta-v1

In [19]:
import torch
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import os
from google.colab import auth

# NOTE: google.colab's auth.authenticate_user() signs the Colab runtime in
# to Google; it is not a Hugging Face login. The call is kept so the cell
# behaves as before, but the public model below should load without it.
auth.authenticate_user()

# Optional Hugging Face token, needed only for private or gated models.
# It is exported only when non-empty, so an HF_TOKEN already present in the
# environment is not overwritten with an empty string.
hf_token = ''
if hf_token:
    os.environ['HF_TOKEN'] = hf_token

# Load the all-distilroberta-v1 sentence-embedding model.
model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")

# Titles of the main paper and of the papers it cites.
main_paper = "Network Structure and Knowledge Transfer: The Effects of Cohesion and Range"
citations = [
    "Collaboration networks, structural holes, and innovation: A longitudinal study",
    "Who shall govern? CEO/board power, demographic similarity, and new director selection",
    "Analysis of a local-area wireless network",
]

# Encode the titles as tensors: one embedding for the main paper and one
# row per citation.
main_embedding = model.encode(main_paper, convert_to_tensor=True)
citation_embeddings = model.encode(citations, convert_to_tensor=True)

# Score every citation against the main paper in a single batched call.
# The result has one row (the main paper) and one column per citation.
similarities = util.pytorch_cos_sim(main_embedding, citation_embeddings)[0].tolist()

# Collect the scores in a DataFrame, most similar citation first.
df = pd.DataFrame({"Citation_Title": citations, "Similarity_Score": similarities})
df = df.sort_values(by="Similarity_Score", ascending=False)

# Display the ranked similarity scores.
print(df)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

                                      Citation_Title  Similarity_Score
0  Collaboration networks, structural holes, and ...          0.564295
1  Who shall govern? CEO/board power, demographic...          0.284555
2          Analysis of a local-area wireless network          0.139067


# TF-IDF

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# The paper under study and the titles it cites.
main_paper = "Network Structure and Knowledge Transfer: The Effects of Cohesion and Range"
citations = [
    "Collaboration networks, structural holes, and innovation: A longitudinal study",
    "Who shall govern? CEO/board power, demographic similarity, and new director selection",
    "Analysis of a local-area wireless network",
]

# All titles go through one vectorizer; the main paper is placed first so
# it ends up as row 0 of the TF-IDF matrix.
papers = [main_paper, *citations]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(papers)

# Compare row 0 (main paper) against every remaining row (the citations)
# and flatten the single-row result into a 1-D array of scores.
main_row = tfidf_matrix[:1]
citation_rows = tfidf_matrix[1:]
cos_similarities = cosine_similarity(main_row, citation_rows).ravel()

# Tabulate and rank the scores, highest similarity first.
df = pd.DataFrame(
    {"Citation_Title": citations, "Similarity_Score": cos_similarities}
).sort_values(by="Similarity_Score", ascending=False)

print(df)


                                      Citation_Title  Similarity_Score
2          Analysis of a local-area wireless network          0.172790
0  Collaboration networks, structural holes, and ...          0.095281
1  Who shall govern? CEO/board power, demographic...          0.076780


# Word2Vec-style word vectors (spaCy en_core_web_md)

In [30]:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# spaCy's medium English pipeline, loaded by package name; the code below
# reads document vectors from it.
nlp = spacy.load(name="en_core_web_md")

# The paper under study and the titles it cites.
main_paper = "Network Structure and Knowledge Transfer: The Effects of Cohesion and Range"
citations = [
    "Collaboration networks, structural holes, and innovation: A longitudinal study",
    "Who shall govern? CEO/board power, demographic similarity, and new director selection",
    "Analysis of a local-area wireless network",
]

# Sentence -> vector helper.
def get_sentence_vector(sentence, model):
    """Return the ``vector`` attribute of ``model(sentence)``.

    Args:
        sentence: Text handed to ``model`` unchanged.
        model: Callable whose result exposes a ``vector`` attribute
            (the caller in this file passes the loaded spaCy pipeline).
    """
    return model(sentence).vector

# One vector for the main paper and one per cited title, all produced by
# the same spaCy pipeline.
main_vector = get_sentence_vector(main_paper, nlp)
citation_vectors = [get_sentence_vector(title, nlp) for title in citations]

# cosine_similarity expects 2-D inputs, so the main vector is wrapped in a
# list; the single-row result is flattened to a 1-D array of scores.
pairwise_scores = cosine_similarity([main_vector], citation_vectors)
similarities = pairwise_scores.ravel()

# Tabulate and rank the scores, highest similarity first.
df = pd.DataFrame(
    {"Citation_Title": citations, "Similarity_Score": similarities}
).sort_values(by="Similarity_Score", ascending=False)

print(df)


                                      Citation_Title  Similarity_Score
2          Analysis of a local-area wireless network          0.876180
0  Collaboration networks, structural holes, and ...          0.847832
1  Who shall govern? CEO/board power, demographic...          0.815084
