In [2]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Identifier of the pretrained sentence-transformers checkpoint to load.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# Build the encoder once; the cells below call model.encode() on it.
model = SentenceTransformer(MODEL_NAME)

In [3]:
# Load the candidate field names, one per line, from columns.txt.
# Blank / whitespace-only lines are dropped: a file that ends with a newline
# (or contains empty lines) would otherwise produce an empty "" field that is
# embedded, labelled and later written into the saved mapping.
with open("columns.txt", "r", encoding="utf-8") as f:
    columns = [line for line in f.read().split("\n") if line.strip()]

# Fail early with a clear message instead of an obscure error further down.
if not columns:
    raise ValueError("columns.txt contains no field names")

# Embed every field name with the sentence-transformer model.
embeddings = model.encode(columns)

In [8]:
# Each target label paired with the keyword phrase that is embedded for it.
label_keywords = [
    ("시험", "시험 중간고사 기말고사"),
    ("출석", "attendance 출석"),
    ("과제", "과제 숙제 HW"),
    ("참여도", "참여도 참석 참여 attendance"),
    ("발표", "발표 토론"),
    ("프로젝트", "프로젝트 project"),
    ("협동", "협동 team"),
    ("보고서", "보고서 essay"),
    ("실험", "실험"),
    ("태도", "태도"),
]
keywords = [phrase for _, phrase in label_keywords]
# "기타" is appended as the fallback label; no keyword index points at it.
label = [name for name, _ in label_keywords] + ["기타"]

# Minimum cosine similarity for a field to receive its best-matching label.
SIMILARITY_THRESHOLD = 0.38

# Embed the keyword phrases with the same model used for the field names.
keyword_emb = model.encode(keywords)

# Cosine similarity of every field against every keyword phrase, then the
# best-matching keyword index and its similarity for each field.
similarities = cosine_similarity(embeddings, keyword_emb)
max_indices = similarities.argmax(axis=1)
max_values = similarities.max(axis=1)

# Fields whose best similarity falls below the threshold are labelled "기타".
labels = [
    label[best] if score >= SIMILARITY_THRESHOLD else "기타"
    for best, score in zip(max_indices, max_values)
]
scores = list(max_values)

# One row per field: its name, assigned label and best similarity score.
df = pd.DataFrame({"field": columns, "label": labels, "scores": scores})

In [9]:
# Show the field / label / scores table built in the previous cell.
df

Unnamed: 0,field,label,scores
0,출석,출석,0.611375
1,과제,시험,0.662664
2,중간고사,시험,0.693817
3,기말고사,시험,0.685169
4,참여,참여도,0.789256
...,...,...,...
4045,기말발표및과제,시험,0.639371
4046,탐구발표 및 출석,참여도,0.672374
4047,과제 및 탐구발표,실험,0.633757
4048,4 Homework,과제,0.716719


In [18]:
import pickle

# Map each field name to its assigned label; if a field name occurs more than
# once, the label from its last row is the one kept.
field_to_label = {
    field: assigned for field, assigned in zip(df["field"], df["label"])
}

# Serialize the mapping to disk with pickle.
with open("./평가방식.pickle", "wb") as out_file:
    pickle.dump(field_to_label, out_file)