In [1]:
import polars as pl
from sentence_transformers import SentenceTransformer, util

In [2]:
# Checkpoint name handed to SentenceTransformer; the resulting `model` is the
# encoder the mapping helpers later in this notebook read as a global.
embedding_checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(embedding_checkpoint)

# Table whose column names are inspected below.
# NOTE(review): the path contains a doubled slash; kept exactly as written.
input_csv_path = "./csv_data//deoksan.csv"
df = pl.read_csv(input_csv_path)

In [23]:
# Copy the frame's column names into a plain list and print it.
col_lis = [*df.columns]
print(col_lis)

['time', 'curr', 'currR', 'currS', 'currT', 'Ground', 'PT100', 'Vibra', 'Volt', 'VoltR', 'VoltS', 'VoltT']


In [24]:
# Candidate target names for column mapping, laid out one prefix per row.
# NOTE(review): every candidate is a speed variant, while none of the column
# names printed for `df` mention speed -- confirm this is the intended list.
col_ontology_classes = [
    'speed', 'speedR', 'speedS', 'speedT',
    'spd', 'spdR', 'spdS', 'spdT',
    'spd_avg', 'spd_avgR', 'spd_avgS', 'spd_avgT',
]

In [5]:
# 2. Messy, inconsistently named input files (Input Data)
filenames = [
    # uses an abbreviation
    "Inj_Machine_Log_01.csv",
    # uses a synonym
    "Plastic_Molding_Data.csv",
    # names a specific component
    "Robot_Arm_Axis_X.csv",
    # welder
    "Auto_Welder_Final.csv",
    # pump
    "Factory_Pump_Vib.csv",
    # meaning unclear (mapping is expected to fail)
    "M01_Unknown.csv",
    # CNC naming variants
    "CNC_Machine_Data.csv",
    "cnc.csv",
    "cnc_data.csv",
    "cnc_data_01.csv",
    "cnc_data_02.csv",
    "cnc_data_03.csv",
    "cnc_data_04.csv",
]

# 3. Standard ontology classes (Target Classes)
# The model looks at each filename and has to pick one of these.
ontology_classes = [
    # injection molding machine
    "Injection_Molding_Machine",
    # welding robot
    "Welding_Robot",
    # pump
    "Industrial_Pump",
    # CNC
    "CNC_Machine",
    # conveyor
    "Conveyor_Belt",
]

In [20]:
def preprocess_filename(fname):
    """Turn a CSV filename into a space-separated phrase for embedding.

    Args:
        fname: Filename such as ``"Inj_Machine_Log_01.csv"``.

    Returns:
        The name with a trailing ``.csv`` extension (any letter case) removed
        and every underscore and hyphen replaced by a single space.
    """
    extension = '.csv'
    # Strip the extension only at the end of the name: a blanket
    # str.replace would also delete ".csv" appearing in the middle of a name
    # and would miss an upper-case ".CSV".
    if fname[-len(extension):].lower() == extension:
        fname = fname[:-len(extension)]
    # Separators become spaces so the name reads like a short sentence.
    return fname.replace('_', ' ').replace('-', ' ')
def preprocess_columns(fname):
    """Turn a column name into a space-separated phrase for embedding.

    Every occurrence of ``.csv`` is removed first, then each underscore and
    hyphen is replaced by a space.

    Args:
        fname: The column name to clean.

    Returns:
        The cleaned name.
    """
    cleaned = fname.replace('.csv', '')
    for separator in ('_', '-'):
        cleaned = cleaned.replace(separator, ' ')
    return cleaned

In [21]:
import pandas as pd
def map_filenames_to_classes(files, classes, threshold=0.4):
    """Map each filename to its most similar ontology class.

    Filenames are cleaned with ``preprocess_filename``, embedded with the
    notebook-level ``model`` and compared to the embedded class names with
    ``util.cos_sim``. The best-scoring class is kept only when its score is
    strictly greater than ``threshold``.

    Args:
        files: Iterable of filenames to classify.
        classes: Sequence of candidate class names.
        threshold: Similarity a match must exceed; a best score at or below
            it is reported as ``"Unclassified"``. Defaults to 0.4.

    Returns:
        A pandas DataFrame with one row per filename and the columns
        ``Filename``, ``Interpreted_As``, ``Mapped_Class`` and ``Confidence``
        (the best score rounded to 3 decimals). The frame is empty when
        ``files`` is empty.

    Raises:
        ValueError: If ``files`` is non-empty but ``classes`` is empty.
    """
    columns = ["Filename", "Interpreted_As", "Mapped_Class", "Confidence"]

    # Materialize once: the input is traversed more than once below, which
    # would silently exhaust a one-shot iterable.
    files = list(files)
    if not files:
        # Nothing to embed; skip the encoder and similarity call entirely.
        return pd.DataFrame(columns=columns)
    if len(classes) == 0:
        raise ValueError("classes must contain at least one candidate class")

    # Embeddings
    clean_names = [preprocess_filename(f) for f in files]
    embeddings_files = model.encode(clean_names)
    embeddings_classes = model.encode(classes)

    # Similarity matrix: one row per filename, one column per class.
    scores = util.cos_sim(embeddings_files, embeddings_classes)

    results = []
    for fname, clean_name, row in zip(files, clean_names, scores):
        best_idx = row.argmax().item()
        best_score = row[best_idx].item()

        # Withhold the label unless the best score clears the threshold.
        category = classes[best_idx] if best_score > threshold else "Unclassified"

        results.append({
            "Filename": fname,
            "Interpreted_As": clean_name,
            "Mapped_Class": category,
            "Confidence": round(best_score, 3)
        })

    return pd.DataFrame(results, columns=columns)


def map_columns_to_classes(col_names, classes, threshold=0.4):
    """Map each column name to its most similar ontology class.

    Column names are cleaned with ``preprocess_columns``, embedded with the
    notebook-level ``model`` and compared to the embedded class names with
    ``util.cos_sim``. The best-scoring class is kept only when its score is
    strictly greater than ``threshold``.

    Args:
        col_names: Iterable of column names to classify.
        classes: Sequence of candidate class names.
        threshold: Similarity a match must exceed; a best score at or below
            it is reported as ``"Unclassified"``. Defaults to 0.4.

    Returns:
        A pandas DataFrame with one row per column name and the columns
        ``Column_name``, ``Interpreted_As``, ``Mapped_Class`` and
        ``Confidence`` (the best score rounded to 3 decimals). The frame is
        empty when ``col_names`` is empty.

    Raises:
        ValueError: If ``col_names`` is non-empty but ``classes`` is empty.
    """
    columns = ["Column_name", "Interpreted_As", "Mapped_Class", "Confidence"]

    # Materialize once: the input is traversed more than once below, which
    # would silently exhaust a one-shot iterable.
    col_names = list(col_names)
    if not col_names:
        # Nothing to embed; skip the encoder and similarity call entirely.
        return pd.DataFrame(columns=columns)
    if len(classes) == 0:
        raise ValueError("classes must contain at least one candidate class")

    # Embeddings
    clean_names = [preprocess_columns(c) for c in col_names]
    embeddings_columns = model.encode(clean_names)
    embeddings_classes = model.encode(classes)

    # Similarity matrix: one row per column name, one column per class.
    scores = util.cos_sim(embeddings_columns, embeddings_classes)

    results = []
    for col_name, clean_name, row in zip(col_names, clean_names, scores):
        best_idx = row.argmax().item()
        best_score = row[best_idx].item()

        # Withhold the label unless the best score clears the threshold.
        category = classes[best_idx] if best_score > threshold else "Unclassified"

        results.append({
            "Column_name": col_name,
            "Interpreted_As": clean_name,
            "Mapped_Class": category,
            "Confidence": round(best_score, 3)
        })

    return pd.DataFrame(results, columns=columns)

In [26]:
# Classify every example filename against the machine ontology classes.
filename_results = map_filenames_to_classes(
    files=filenames,
    classes=ontology_classes,
)

In [27]:
# Classify the CSV's column names against the column-level candidate names.
column_name_results = map_columns_to_classes(
    col_names=col_lis,
    classes=col_ontology_classes,
)

In [28]:
# Bare expression: shows the column-mapping table as this cell's output.
column_name_results

Unnamed: 0,Column_name,Interpreted_As,Mapped_Class,Confidence
0,time,time,speed,0.402
1,curr,curr,Unclassified,0.265
2,currR,currR,Unclassified,0.353
3,currS,currS,Unclassified,0.284
4,currT,currT,Unclassified,0.335
5,Ground,Ground,Unclassified,0.345
6,PT100,PT100,Unclassified,0.377
7,Vibra,Vibra,Unclassified,0.291
8,Volt,Volt,speedT,0.465
9,VoltR,VoltR,Unclassified,0.365
