이 파일은 HSCODE 6자리가 매핑되어 있지 않아 외부 데이터를 통해 매핑한 결과입니다.

6자리 매핑 시 자연어 유사도를 비교하여 매핑해야 했기에, 이미 매핑되어 있는 데이터(약 3,000개)로 미리 학습시킨 SBERT를 사용했습니다.

### HSCODE_6자리 매핑

1. 기업 DSC 인코딩
2. 외부 HS2017 데이터 인코딩
3. HSCODE를 2자리 단위로 반복 매핑

### 기업 DSC 임베딩 불러오기
파인튜닝 시킨 모델로 임베딩 추출 후 불러오기

In [None]:
from sentence_transformers import SentenceTransformer
# Load the SentenceTransformer model whose encode() is used for every
# embedding computed in this notebook.
model = SentenceTransformer('ISIC_HSCODE6_Bert_all')

In [None]:
# Create encoded_DSC.csv: the source table plus one embedding per DSC text.

import pandas as pd

file_path = '비식별된 해외기업별 영문 텍스트데이터.csv'
df = pd.read_csv(file_path)

encoded_DSC = model.encode(df['DSC'].tolist())

# Each embedding is written as a single comma-separated string so that it can
# be stored in one CSV cell and parsed back later.
df['DSC_encoded'] = [','.join(map(str, vector)) for vector in encoded_DSC]
df.to_csv('encoded_DSC.csv', index=False)

In [None]:
def _parse_embedding(cell):
  """Turn one comma-separated string back into a list of floats."""
  return [float(token) for token in cell.split(',')]


# Reload the saved table and restore the embedding column to numeric lists.
DSC_df = pd.read_csv('encoded_DSC.csv')
DSC_df['DSC_encoded'] = DSC_df['DSC_encoded'].apply(_parse_embedding)
DSC_df.head()

Unnamed: 0,ID,CODE,DSC,DSC_encoded
0,1,4520,"automotive repair shops, nec specialized auto...","[0.0012444641, -0.4897798, 0.78584, -0.5226077..."
1,2,149,"general farms, primarily animals, nsk derives...","[-0.6222493, -0.42845285, -0.5563288, 0.403768..."
2,3,4630,fish and seafoods the wholesale distribution ...,"[0.06667073, -0.70483065, 0.7621674, -0.387332..."
3,4,4510,"new and used car dealers, nsk manufactures a ...","[0.004534301, -0.5023982, 0.75971717, -0.36580..."
4,4,2930,"automotive stampings, nsk manufacturing autom...","[-0.3222424, -0.17184584, 0.9447302, 0.0660150..."


### HS2017 임베딩 불러오기
2자리 단위로 트리구조 저장

In [None]:
# Tree classes used to store the codes level by level, two digits at a time.


class Node:
  """One tree entry: a key, its text, an arbitrary value and child nodes."""

  def __init__(self, key, text, value):
    self.key = key
    self.text = text
    self.value = value
    self.children = []

  def add_child(self, child_node):
    """Append child_node to this node's children."""
    self.children.append(child_node)

  def __repr__(self):
    return f"Node(key={self.key}, text={self.text})"


class Tree:
  """Tree of Node objects addressed by key.

  Nodes created through add_node are remembered in a key -> node index so
  that attaching a child does not need a full traversal for every insertion.
  Keys are therefore expected to be hashable and unique; when a key is
  registered twice, the index keeps the node that was added first.
  """

  def __init__(self, root):
    self.root = root
    self._index = {root.key: root}

  def add_node(self, parent_key, key, text, value):
    """Create a node and attach it under the node whose key is parent_key.

    Prints a message and adds nothing when no such parent exists.
    """
    parent_node = self._index.get(parent_key)
    if parent_node is None:
      # The parent may have been attached with Node.add_child directly, in
      # which case the index has never seen it; fall back to a traversal.
      parent_node = self.find_node(self.root, parent_key)
    if parent_node is None:
      print(f"Parent with key {parent_key} not found")
      return

    new_node = Node(key, text, value)
    parent_node.add_child(new_node)
    self._index.setdefault(key, new_node)

  def find_node(self, current_node, key):
    """Depth-first search from current_node; return the first match or None."""
    if current_node.key == key:
      return current_node
    for child in current_node.children:
      result = self.find_node(child, key)
      if result is not None:
        return result
    return None

  def __repr__(self):
    return f"Tree(root={self.root})"

In [None]:
# Store the HS2017 reference data in the tree as (hscode, text, embedding).

import requests

url = 'https://comtradeapi.un.org/files/v1/app/reference/H5.json'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status stops an HTTP error body from being parsed as reference data.
response = requests.get(url, timeout=60)
response.raise_for_status()
data = response.json()

file_path = 'encoded_outdata.csv'
loaded_outdata = pd.read_csv(file_path)
loaded_outdata['embeddings'] = loaded_outdata['embeddings'].apply(lambda x: list(map(float, x.split(','))))
# NOTE(review): loaded_outdata is not read again in this cell; the embeddings
# stored in the tree come from model.encode below. Confirm whether the
# pre-computed file was meant to be used instead.

root = Node(key='0', text='root', value=[])

tree = Tree(root)

# Split the reference entries by the length of their id (2, 4 or 6 characters).
hs2_list = [item['text'] for item in data['results'] if len(item['id']) == 2]
hs4_list = [item['text'] for item in data['results'] if len(item['id']) == 4]
hs6_list = [item['text'] for item in data['results'] if len(item['id']) == 6]

# Each text is split once on " - ": the part before it is used as the code,
# the part after it as the description.
hs2_code = [item.split(" - ", 1)[0] for item in hs2_list]
hs2_text = [item.split(" - ", 1)[1] for item in hs2_list]
hs4_code = [item.split(" - ", 1)[0] for item in hs4_list]
hs4_text = [item.split(" - ", 1)[1] for item in hs4_list]
hs6_code = [item.split(" - ", 1)[0] for item in hs6_list]
hs6_text = [item.split(" - ", 1)[1] for item in hs6_list]

# Shorter codes come first so that a parent is already in the tree when its
# children are inserted.
hs_code = hs2_code + hs4_code + hs6_code
hs_text = hs2_text + hs4_text + hs6_text

# Encode the description texts.
hs2_embeddings = model.encode(hs2_text)
hs4_embeddings = model.encode(hs4_text)
hs6_embeddings = model.encode(hs6_text)
hs_embeddings = []
hs_embeddings.extend(hs2_embeddings)
hs_embeddings.extend(hs4_embeddings)
hs_embeddings.extend(hs6_embeddings)


# Two-character codes hang off the root ("0"); every longer code hangs off the
# code obtained by dropping its last two characters.
for code, text, embedding in zip(hs_code, hs_text, hs_embeddings):
  p_key = "0" if len(code) == 2 else code[:-2]
  tree.add_node(p_key, code, text, embedding)

### HSCODE_6 매핑하기
2자리 단위로 2, 4, 6 범위로 늘려가며 DSC와 HSCODE 매핑
2자리 단위마다 가장 큰 코사인유사도 값의 80%를 threshold로 설정

In [None]:
def print_rank(rank):
  """Keep the codes whose similarity is close to the best one.

  Args:
    rank: list of [hscode, cosine_similarity] pairs. It is sorted in place,
      highest similarity first.

  Returns:
    The hscodes whose similarity reaches the threshold, best first. The
    threshold is 80% of the highest similarity. An empty list is returned
    for an empty rank.
  """
  if not rank:
    return []

  rank.sort(key=lambda x: x[1], reverse=True)
  threshold_ratio = 0.80
  max_value = rank[0][1]
  if max_value >= 0:
    threshold = max_value * threshold_ratio
  else:
    # Scaling a negative maximum by 0.8 would lift the threshold above the
    # maximum itself and discard every candidate, so the cutoff is moved
    # below the maximum instead.
    threshold = max_value / threshold_ratio

  top_rank = [item[0] for item in rank if item[1] >= threshold]
  return top_rank

In [None]:
# Use the GPU for the cosine-similarity computation when one is available,
# otherwise fall back to the CPU.

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def matching_6(S):
  """Map one DSC embedding to candidate 6-digit HS codes.

  The tree is walked one level at a time (2-, 4-, then 6-digit keys). At each
  level every candidate node is scored by cosine similarity against S,
  print_rank keeps the best-scoring keys, and only the children of the kept
  nodes are scored at the next level.

  Args:
    S: embedding of one DSC text, as a sequence of floats.

  Returns:
    The third-level keys kept by print_rank, in the order print_rank returns
    them. An empty list when some level has no candidates to score.
  """
  # Shape (1, dim) so that it broadcasts against an (n, dim) candidate matrix.
  query = torch.tensor(S).to(device).unsqueeze(0)

  candidates = list(tree.root.children)
  top_keys = []
  last_level = 2
  for level in range(last_level + 1):
    if not candidates:
      return []

    # Score the whole level with a single call instead of one tensor transfer
    # and one similarity call per node.
    matrix = torch.stack([torch.tensor(node.value) for node in candidates]).to(device)
    scores = torch.nn.functional.cosine_similarity(query, matrix).tolist()
    top_keys = print_rank([[node.key, score] for node, score in zip(candidates, scores)])

    if level == last_level:
      break

    # Descend only into the kept nodes, preserving tree order.
    kept = set(top_keys)
    candidates = [
      child
      for node in candidates
      if node.key in kept
      for child in node.children
    ]

  return top_keys

In [None]:
from tqdm import tqdm

# One [row index, CODE, matched HS codes] entry per row of DSC_df.
final = []
progress = tqdm(DSC_df.iterrows(), total=DSC_df.shape[0], desc="Processing rows")
for row_index, record in progress:
  entry = [row_index, record['CODE'], matching_6(record['DSC_encoded'])]
  final.append(entry)

final

Processing rows: 100%|██████████| 10000/10000 [15:41<00:00, 10.62it/s]


[[0,
  4520,
  ['960400',
   '911190',
   '860800',
   '870530',
   '970500',
   '999999',
   '911220',
   '911019',
   '911290',
   '961800',
   '870590',
   '911180',
   '870510',
   '870290',
   '960500',
   '750300',
   '831000',
   '860400',
   '911090',
   '830300']],
 [1,
  149,
  ['020621',
   '020610',
   '020629',
   '020622',
   '051110',
   '020680',
   '020690',
   '020711',
   '020712',
   '050210',
   '020714',
   '020713',
   '041000',
   '020210',
   '020630',
   '020110',
   '020230',
   '020220',
   '020442',
   '020120',
   '020422',
   '020130',
   '020443',
   '020741',
   '020860',
   '020810',
   '020423',
   '020430',
   '020760',
   '020410',
   '020450',
   '020421',
   '020441',
   '020641',
   '020742',
   '020744',
   '020830',
   '020649',
   '040891',
   '040811',
   '020745',
   '051199',
   '020751',
   '020500',
   '040899',
   '050290',
   '020752',
   '020990',
   '020743',
   '020754',
   '020910',
   '230990',
   '230910',
   '020755',
   '020724'

In [None]:
import csv

# Write one CSV row per entry of final: row index, CODE, and the matched HS
# codes joined into a single ", "-separated cell.
# NOTE(review): the file name says 85 while print_rank uses a 0.80 ratio;
# confirm which one is intended.
file_path = 'match_bertall_85.csv'

with open(file_path, mode='w', newline='') as out_file:
  csv.writer(out_file).writerows(
    [entry[0], entry[1], ', '.join(map(str, entry[2]))]
    for entry in final
  )

In [None]:
# Average number of HS codes mapped per row.
# The builtin sum() is called directly: binding a variable named `sum` would
# shadow the builtin for every cell run afterwards.
total_matches = sum(len(entry[2]) for entry in final)
# Guard against an empty result list so the cell does not divide by zero.
total_matches / len(final) if final else 0.0

45.1122