In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

def sentence_similarity(sentence1, sentence2):
    tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")
    model = AutoModel.from_pretrained("monologg/kobert")

    # Tokenize the input sentences
    encoded_input1 = tokenizer(sentence1, return_tensors='pt')
    encoded_input2 = tokenizer(sentence2, return_tensors='pt')
    
    # Get the embeddings of the input sentences
    with torch.no_grad():
        model_output1 = model(**encoded_input1)
        model_output2 = model(**encoded_input2)

    # Calculate the cosine similarity of the sentences' embeddings
    cosine_sim = torch.nn.functional.cosine_similarity(model_output1[0], model_output2[0]).item()

    return cosine_sim

# Example usage
sentence1 = "이 음식은 맛있어."
sentence2 = "이 식사는 맛있어요."

similarity_score = sentence_similarity(sentence1, sentence2)

print("The similarity score between the two sentences is:", similarity_score)

Downloading (…)okenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

RuntimeError: a Tensor with 768 elements cannot be converted to Scalar

In [2]:
import csv
from sentence_transformers import SentenceTransformer, util
from transformers import PreTrainedTokenizer, PreTrainedModel, BertTokenizer, BertModel

# Load the KO-BERT model
model = SentenceTransformer('kykim/bert-kor-base')

# CSV file path
csv_file = '/home/fastcampus/Seokjun/csv/only_q.csv'

# Read base sentences from CSV
base_sentences = []
with open(csv_file, 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    for row in reader:
        base_sentences.append(row[0])  # Assuming the sentence is in the first column of the CSV

# Given sentences
given_sentences = [
    "2019년 3월에 급격한 비급여매출 증가가 있었는데, 그 원인은 무엇일까요?",
    "2019년 7월과 2019년 10월에 비급여매출이 매우 높은 것으로 나타났는데, 해당 월에 어떠한 사업 전략이 있었을까요?",
    "2018년 10월과 2019년 1월에 비급여매출이 급증한 이유는 무엇일까요?",
    "2020년 2월에 비급여매출이 크게 증가한 이유를 파악할 수 있을까요?",
    "비급여매출 증가의 원인 중 국내외의 경제 상황은 어떤 영향을 미쳤을까요?"
    ]

# Encode the base and given sentences
base_embeddings = model.encode(base_sentences, convert_to_tensor=True)
given_embeddings = model.encode(given_sentences, convert_to_tensor=True)

# Calculate cosine similarities
similarities = util.cos_sim(given_embeddings, base_embeddings)

# Print the top 5 most similar base sentences for each given sentence
for i, given_sentence in enumerate(given_sentences):
    print(f"Top 5 most similar base sentences for '{given_sentence}':")
    similarity_scores = similarities[i]
    num_similarities = min(5, len(similarity_scores))
    top_indices = similarity_scores.argsort()[-num_similarities:][::-1]  # Indices of top similarities in descending order
    for idx in top_indices:
        base_sentence = base_sentences[idx]
        similarity = similarity_scores[idx].item()
        print(f"Base sentence '{base_sentence}': {similarity}")
    print()

AttributeError: 'BertModel' object has no attribute 'encode'

In [6]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

# CSV 파일에서 데이터 로드
data = pd.read_csv('/home/fastcampus/Seokjun/csv/only_q.csv')

# KoBERT 모델 및 tokenizer 로드
tokenizer = AutoTokenizer.from_pretrained("monologg/kobert")
model = AutoModel.from_pretrained("monologg/kobert")

# 새로운 문장 입력
new_sentence = "2019년 3월에 급격한 비급여매출 증가가 있었는데, 그 원인은 무엇일까요?"

# 입력 문장의 임베딩 계산
encoded_input = tokenizer(new_sentence, return_tensors='pt')
with torch.no_grad():
    model_output = model(**encoded_input)

# 새로운 문장과 기존 문장들 간의 유사도 계산
similarities = []
for i, row in data.iterrows():
    encoded_input2 = tokenizer(row['질문'], return_tensors='pt')
    with torch.no_grad():
        model_output2 = model(**encoded_input2)
    similarity = torch.nn.functional.cosine_similarity(model_output[0], model_output2[0], dim=1).item()
    similarities.append(similarity)

# 유사도가 높은 상위 5개의 기존 문장 출력
similarities_df = pd.DataFrame({'질문': data['질문'], '유사도': similarities})
top_5 = similarities_df.nlargest(5, '유사도')
print("새로운 문장: ", new_sentence)
print("상위 5개의 기존 문장: ")
print(top_5)

RuntimeError: The size of tensor a (13) must match the size of tensor b (31) at non-singleton dimension 1

In [8]:
import numpy as np
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine

tokenizer = AutoTokenizer.from_pretrained("KLUE/bert-base")
model = AutoModel.from_pretrained("KLUE/bert-base")

def get_sentence_embedding(sentence):
    if not isinstance(sentence, str):
        #print("문장은 문자열로 입력해주세요.")
        pass
        return None

    if not sentence.strip():
        #print("비어있는 문자열은 처리할 수 없습니다.")
        pass
        return None

    inputs = tokenizer(sentence, return_tensors="pt")
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(axis=1).detach().numpy()
    return embeddings.reshape(-1)

input_questions = [
    "2019년 3월에 급격한 비급여매출 증가가 있었는데, 그 원인은 무엇일까요?",
    "2019년 7월과 2019년 10월에 비급여매출이 매우 높은 것으로 나타났는데, 해당 월에 어떠한 사업 전략이 있었을까요?",
    "2018년 10월과 2019년 1월에 비급여매출이 급증한 이유는 무엇일까요?",
    "2020년 2월에 비급여매출이 크게 증가한 이유를 파악할 수 있을까요?",
    "비급여매출 증가의 원인 중 국내외의 경제 상황은 어떤 영향을 미쳤을까요?"
    ]

questions_df = pd.read_csv("/home/fastcampus/Seoknam/only_question.csv")
questions = questions_df["질문"].tolist()

def get_top5_similar_questions(embed_question, questions):
    if embed_question is None:
        return [], []

    similarities = []

    for question in questions:
        embed = get_sentence_embedding(question)
        if embed is not None:
            similarity = 1 - cosine(embed_question, embed)
        else:
            similarity = 0
        similarities.append(similarity)

    top5_indexes = np.argsort(similarities)[-5:][::-1]
    top5_questions = [questions[i] for i in top5_indexes]
    top5_similarities = [similarities[i] for i in top5_indexes]

    return top5_questions, top5_similarities

def print_top5_similar_questions(input_question, embed_input_question, questions):
    top5_questions, top5_similarities = get_top5_similar_questions(embed_input_question, questions)

    print(f"\n{input_question}와 가장 유사한 5개 질문:")
    for i, (question, similarity) in enumerate(zip(top5_questions, top5_similarities)):
        print(f"Top{i + 1} 유사 질문: {question} (유사도: {similarity:.2f})")

for input_question in input_questions:
    embed_input_question = get_sentence_embedding(input_question)
    print_top5_similar_questions(input_question, embed_input_question, questions)


Some weights of the model checkpoint at KLUE/bert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
