In [2]:
from dotenv import load_dotenv
load_dotenv()

import os
import re
import csv
import json
import time
import numpy as np
import pandas as pd
from pprint import pprint

from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import JSONLoader
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_community.document_transformers import LongContextReorder
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_openai import OpenAIEmbeddings
from langchain_upstage import ChatUpstage
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [3]:
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
UPSTAGE_API_KEY = os.environ.get('UPSTAGE_API_KEY')
LANGCHAIN_API_KEY = os.environ.get('LANGCHAIN_API_KEY')
os.environ['LANGCHAIN_PROJECT'] = 'matching_model_EXP04' # 프로젝트명 수정
LANGCHAIN_PROJECT = os.environ.get('LANGCHAIN_PROJECT')

print(f'> LangSmith Project: {LANGCHAIN_PROJECT}')

> LangSmith Project: matching_model_EXP04


# 데이터 구성

> 전처리, csv to jsonl

In [4]:
# 데이터 로드

text = pd.read_excel('../data/비식별된 해외기업별 영문 텍스트데이터.xlsx')
statis = pd.read_excel('../data/통계청 국제표준산업분류 HSCODE 6단위 매핑.xlsx')
customs = pd.read_excel('../data/관세청_HS부호_240101.xlsx')

text_copy = text.copy()
statis_copy = statis.copy()
customs_copy = customs.copy()

print('> 데이터 로드 완료')


# 데이터 전처리

def zero_input(num, x):
    if pd.isna(x):
        return np.nan
    else:
        cnt = num - len(x)
        return '0' * cnt + x
    
def re_sub(x):
    if pd.isna(x):
        return np.nan
    else:
        return re.sub(r'^\((.*?)\)$', r'\1', x)

text_copy['ID'] = text_copy['ID'].astype(str)
text_copy['CODE'] = text_copy['CODE'].astype(str)
text_copy['CODE'] = text_copy['CODE'].apply(lambda x: zero_input(4, x))

statis_copy.columns = [
    'ISIC4_CODE', # ISIC4_국제표준산업분류
    'ISIC4_NAME', # ISIC4_분류명
    'KSIC10_CODE', # KSIC10_한국표준산업분류
    'KSIC10_NAME', # KSIC10_분류명
    'HS2017_CODE', # HS2017_관세통계통합품목분류
    'HS2017_NAME' # HS2017_분류명
]

statis_copy['ISIC4_CODE'] = statis_copy['ISIC4_CODE'].astype(str)
statis_copy['ISIC4_CODE'] = statis_copy['ISIC4_CODE'].replace('nan', np.nan)
statis_copy['ISIC4_CODE'] = statis_copy['ISIC4_CODE'].str.replace('.0', '', regex=False)
statis_copy['ISIC4_CODE'] = statis_copy['ISIC4_CODE'].apply(lambda x: zero_input(4, x))

statis_copy['HS2017_CODE'] = statis_copy['HS2017_CODE'].astype(str)
statis_copy['HS2017_CODE'] = statis_copy['HS2017_CODE'].replace('nan', np.nan)
statis_copy['HS2017_CODE'] = statis_copy['HS2017_CODE'].str.replace('.0', '', regex=False)
statis_copy['HS2017_CODE'] = statis_copy['HS2017_CODE'].apply(lambda x: zero_input(6, x))

customs_copy.columns = [
    'HS_CODE', # HS부호
    'KOR_NAME', # 한글품목명
    'ENG_NAME', # 영문품목명
    'INT_CODE', # 성질통합분류코드
    'INT_NAME' # 성질통합분류명
]

customs_copy['HS_CODE'] = customs_copy['HS_CODE'].astype(str)
customs_copy['HS_CODE'] = customs_copy['HS_CODE'].apply(lambda x: zero_input(10, x))

customs_copy['INT_CODE'] = customs_copy['INT_CODE'].astype(str)
customs_copy['INT_CODE'] = customs_copy['INT_CODE'].replace('nan', np.nan)
customs_copy['INT_CODE'] = customs_copy['INT_CODE'].str.replace('.0', '', regex=False)

customs_copy['INT_NAME'] = customs_copy['INT_NAME'].apply(lambda x: re_sub(x))

text_copy = text_copy.fillna(' ')
statis_copy = statis_copy.fillna(' ')
customs_copy = customs_copy.fillna(' ')

print('> 데이터 전처리 완료')
print('> 데이터 결측치 확인')
print('-----' * 5)
print(text_copy.isnull().sum())
print(statis_copy.isnull().sum())
print(customs_copy.isnull().sum())
print('-----' * 5)


# 데이터 저장 및 로드

text_copy.to_csv('../data/prepro_text.csv', index=False, encoding='utf-8')
statis_copy.to_csv('../data/prepro_statis.csv', index=False, encoding='utf-8')
customs_copy.to_csv('../data/prepro_customs.csv', index=False, encoding='utf-8')

text_prepro = pd.read_csv('../data/prepro_text.csv', dtype=str)
statis_prepro = pd.read_csv('../data/prepro_statis.csv', dtype=str)
customs_prepro = pd.read_csv('../data/prepro_customs.csv', dtype=str)


# csv to jsonl

def csv_to_jsonl(csv_file_path, jsonl_file_path):
    with open(csv_file_path, mode='r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        
        with open(jsonl_file_path, mode='w', encoding='utf-8') as jsonl_file:
            for row in csv_reader:
                jsonl_file.write(json.dumps(row, ensure_ascii=False) + '\n')

csv_to_jsonl('../data/prepro_text.csv', '../data/jsonl_prepro_text.jsonl')
csv_to_jsonl('../data/prepro_statis.csv', '../data/jsonl_prepro_statis.jsonl')
csv_to_jsonl('../data/prepro_customs.csv', '../data/jsonl_prepro_customs.jsonl')
print('> csv to jsonl 완료')

> 데이터 로드 완료
> 데이터 전처리 완료
> 데이터 결측치 확인
-------------------------
ID      0
CODE    0
DSC     0
dtype: int64
ISIC4_CODE     0
ISIC4_NAME     0
KSIC10_CODE    0
KSIC10_NAME    0
HS2017_CODE    0
HS2017_NAME    0
dtype: int64
HS_CODE     0
KOR_NAME    0
ENG_NAME    0
INT_CODE    0
INT_NAME    0
dtype: int64
-------------------------
> csv to jsonl 완료


# Document 구성

## text

In [5]:
file_path = '../data/jsonl_prepro_text.jsonl'
temp = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        temp.append(json.loads(line.strip()))

seq_num = 1
text_documents = []
for data in temp:
    doc = Document(
        page_content=data['DSC'], 
        metadata={
            'ID': data['ID'],
            'CODE': data['CODE'],
            'source': '/root/contest-matching-model/data/jsonl_prepro_text.jsonl',
            'seq_num': seq_num,
        }
    )
    text_documents.append(doc)
    seq_num += 1

pprint(text_documents[0])
print(text_documents[0].page_content)
pprint(text_documents[0].metadata)

Document(page_content='automotive repair shops, nec  specialized automotive repair, not elsewhere classified, such as fuel service carburetor repair, brake relining, front-end and wheel alignment, and radiator repair. motor vehicle repair and maintenance auto brake lining, installation other automotive mechanical and electrical repair and maintenance maintenance and repair of motor vehicles maintenance and repair of motor vehicles maintenance and repair of motor vehiclesother automotive repair and maintenance', metadata={'ID': '1', 'CODE': '4520', 'source': '/root/contest-matching-model/data/jsonl_prepro_text.jsonl', 'seq_num': 1})
automotive repair shops, nec  specialized automotive repair, not elsewhere classified, such as fuel service carburetor repair, brake relining, front-end and wheel alignment, and radiator repair. motor vehicle repair and maintenance auto brake lining, installation other automotive mechanical and electrical repair and maintenance maintenance and repair of moto

## statis

In [6]:
file_path = '../data/jsonl_prepro_statis.jsonl'
temp = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        temp.append(json.loads(line.strip()))

seq_num = 1
statis_documents = []
for data in temp:
    doc = Document(
        page_content=f"{data['ISIC4_NAME']}\r\n{data['KSIC10_NAME']}\r\n{data['HS2017_NAME']}", # ISIC4, KSIC10, HS2017 순으로 작성됨
        metadata={
            'ISIC4_CODE': data['ISIC4_CODE'],
            'KSIC10_CODE': data['KSIC10_CODE'],
            'HS2017_CODE': data['HS2017_CODE'],
            'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl',
            'seq_num': seq_num,
        }
    )
    statis_documents.append(doc)
    seq_num += 1

pprint(statis_documents[0])
print(statis_documents[0].page_content)
pprint(statis_documents[0].metadata)

Document(page_content='곡물(쌀 제외), 콩류, 종실유 재배업\r\n종자 및 묘목 생산업\r\n종자', metadata={'ISIC4_CODE': '0111', 'KSIC10_CODE': '01123', 'HS2017_CODE': '100111', 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl', 'seq_num': 1})
곡물(쌀 제외), 콩류, 종실유 재배업
종자 및 묘목 생산업
종자
{'HS2017_CODE': '100111',
 'ISIC4_CODE': '0111',
 'KSIC10_CODE': '01123',
 'seq_num': 1,
 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl'}


## customs

In [7]:
file_path = '../data/jsonl_prepro_customs.jsonl'
temp = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        temp.append(json.loads(line.strip()))

seq_num = 1
customs_documents = []
for data in temp:
    doc = Document(
        page_content=f"{data['KOR_NAME']}\r\n{data['ENG_NAME']}\r\n{data['INT_NAME']}", # 한글품목명, 영어품목명, 성질 통합 분류명 순으로 작성됨
        metadata={
            'HS_CODE': data['HS_CODE'],
            'INT_CODE': data['INT_CODE'],
            'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl',
            'seq_num': seq_num,
        }
    )
    customs_documents.append(doc)
    seq_num += 1

pprint(customs_documents[0])
print(customs_documents[0].page_content)
pprint(customs_documents[0].metadata)

Document(page_content='농가 사육용\r\nFor farm breeding\r\n말', metadata={'HS_CODE': '0101211000', 'INT_CODE': '11020101', 'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl', 'seq_num': 1})
농가 사육용
For farm breeding
말
{'HS_CODE': '0101211000',
 'INT_CODE': '11020101',
 'seq_num': 1,
 'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl'}


# Text Splitter

## statis

In [8]:
# content 길이 확인
length = []
content = []
for idx in range(0, len(statis_documents)):
    cond = statis_documents[idx].page_content
    a = len(cond)
    b = cond
    length.append(a)
    content.append(b)

df = pd.DataFrame({
    'length': length,
    'content': content
})

In [9]:
df.head()

Unnamed: 0,length,content
0,38,"곡물(쌀 제외), 콩류, 종실유 재배업\r\n종자 및 묘목 생산업\r\n종자"
1,8,\r\n \r\n종자
2,43,"곡물(쌀 제외), 콩류, 종실유 재배업\r\n곡물 및 기타 식량작물 재배업\r\n기타"
3,8,\r\n \r\n기타
4,39,"곡물(쌀 제외), 콩류, 종실유 재배업\r\n종자 및 묘목 생산업\r\n종자용"


In [10]:
df.describe()

Unnamed: 0,length
count,8519.0
mean,31.357554
std,28.675302
min,7.0
25%,13.0
50%,24.0
75%,40.0
max,299.0


In [11]:
df[df['length'] == 7].reset_index().loc[0, 'content']

' \r\n \r\n '

In [12]:
# Split
splitter = RecursiveCharacterTextSplitter(
    separators=['\r\n', '. ', ', ', ' ', ''],
    chunk_size=100,
    chunk_overlap=0,
    length_function=len,
)
statis_splits = splitter.split_documents(statis_documents)

print(f'> Text Splitter 적용 전 문서 개수: {len(statis_documents)}\n> Text Splitter 적용 후 문서 개수: {len(statis_splits)}')
print(f'\n\nText Splitter 적용 전 page_content:\n{statis_documents[0].page_content}\n\nText Splitter 적용 후 page_content:\n{statis_splits[0].page_content}')

> Text Splitter 적용 전 문서 개수: 8519
> Text Splitter 적용 후 문서 개수: 8373


Text Splitter 적용 전 page_content:
곡물(쌀 제외), 콩류, 종실유 재배업
종자 및 묘목 생산업
종자

Text Splitter 적용 후 page_content:
곡물(쌀 제외), 콩류, 종실유 재배업
종자 및 묘목 생산업
종자


## customs

In [13]:
# content 길이 확인
length = []
content = []
for idx in range(0, len(customs_documents)):
    cond = customs_documents[idx].page_content
    a = len(cond)
    b = cond
    length.append(a)
    content.append(b)

df = pd.DataFrame({
    'length': length,
    'content': content
})

In [14]:
df.head()

Unnamed: 0,length,content
0,28,농가 사육용\r\nFor farm breeding\r\n말
1,12,기타\r\nOther\r\n말
2,25,경주말\r\nHorses for racing\r\n말
3,12,기타\r\nOther\r\n말
4,19,당나귀\r\nAsses\r\n기타 산 동물


In [15]:
df.describe()

Unnamed: 0,length
count,12422.0
mean,47.491547
std,49.001142
min,10.0
25%,19.0
50%,32.0
75%,56.0
max,869.0


In [16]:
df[df['length'] >= 800].reset_index().loc[0, 'content']

'틸라피아[오레오크로미스(Oreochromis)속], 메기[판가시우스(Pangasius)속ㆍ실루러스(Silurus)속ㆍ클라리아스(Clarias)속ㆍ익타루러스(Ictalurus)속], 잉어[사이프리너스(Cyprinus)속ㆍ카라시우스(Carassius)속ㆍ크테노파린고돈 이델루스(Ctenopharyngodon idellus)ㆍ하이포프탈미크티스(Hypophthalmichthys)속ㆍ시리누스(Cirrhinus)속ㆍ마일로파린고돈 피세우스(Mylopharyngodon piceus)ㆍ카틀라 카틀라(Catla catla)ㆍ라베오(Labeo)속ㆍ오스테오킬루스 하셀티(Osteochilus hasselti)ㆍ렙토바르부스 호에베니(Leptobarbus hoeveni)ㆍ메갈로브라마(Megalobrama)속], 뱀장어[앙귈라(Anguilla)속], 나일 퍼치[라테스 니로티쿠스(Lates niloticus)], 가물치[카나(Channa)속]\r\nTilapias (Oreochromis spp.), catfish (Pangasius spp., Silurus spp., Clarias spp., Ictalurus spp.), carp (Cyprinus spp., Carassius spp., Ctenopharyngodon idellus, Hypophthalmichthys spp., Cirrhinus spp., Mylopharyngodon piceus, Catla catla, Labeo spp., Osteochilus hasselti, Leptobarbus hoeveni, Megalobrama spp.), eels (Anguilla spp.), Nile perch (Lates niloticus) and snakeheads (Channa spp.)\r\n기타 어류(훈제)'

In [17]:
# Split
splitter = RecursiveCharacterTextSplitter(
    separators=['\r\n', '. ', ', ', ' ', ''],
    chunk_size=70,
    chunk_overlap=0,
    length_function=len,
)
customs_splits = splitter.split_documents(customs_documents)

print(f'> Text Splitter 적용 전 문서 개수: {len(customs_documents)}\n> Text Splitter 적용 후 문서 개수: {len(customs_splits)}')
print(f'\n\nText Splitter 적용 전 page_content:\n{customs_documents[0].page_content}\n\nText Splitter 적용 후 page_content:\n{customs_splits[0].page_content}')

> Text Splitter 적용 전 문서 개수: 12422
> Text Splitter 적용 후 문서 개수: 17076


Text Splitter 적용 전 page_content:
농가 사육용
For farm breeding
말

Text Splitter 적용 후 page_content:
농가 사육용
For farm breeding
말


# 벡터스토어 생성

> 통계청, 관세청만 해당함 (텍스트는 인풋 값이어서 벡터스토어에 안 넣음)

In [37]:
# Embedding
embeddings = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY,
    model='text-embedding-ada-002'
)
# text-embedding-3-large, text-embedding-3-small, text-embedding-ada-002

## statis

In [38]:
name = 'statis'
folder_path = f'./vectorstore/EXP03/{name}'
if not os.path.exists(folder_path):
    print(f'> "{folder_path}" 생성 중')
    statis_vectorstore = FAISS.from_documents(
        documents=statis_splits,
        embedding=embeddings,
    )
    statis_vectorstore.save_local(folder_path=folder_path)
    print(f'> "{folder_path}" 생성 및 로컬 저장 완료')
else:
    statis_vectorstore = FAISS.load_local(
        folder_path=folder_path, 
        embeddings=embeddings, 
        allow_dangerous_deserialization=True
    )
    print(f'> "{folder_path}" 로컬에서 불러옴')

> "./faiss_statis_EXP03" 로컬에서 불러옴


In [39]:
statis_faiss_retriever = statis_vectorstore.as_retriever(k=5)

## customs

In [40]:
name = 'customs'
folder_path = f'./vectorstore/EXP03/{name}'
if not os.path.exists(folder_path):
    print(f'> "{folder_path}" 생성 중')
    customs_vectorstore = FAISS.from_documents(
        documents=customs_splits,
        embedding=embeddings,
    )
    customs_vectorstore.save_local(folder_path=folder_path)
    print(f'> "{folder_path}" 생성 및 로컬 저장 완료')
else:
    customs_vectorstore = FAISS.load_local(
        folder_path=folder_path, 
        embeddings=embeddings, 
        allow_dangerous_deserialization=True
    )
    print(f'> "{folder_path}" 로컬에서 불러옴')

> "./faiss_customs_EXP03" 로컬에서 불러옴


In [41]:
customs_faiss_retriever = customs_vectorstore.as_retriever(k=5)

# bm25

In [42]:
statis_bm25_retriever = BM25Retriever.from_documents(
    documents=statis_splits
)
statis_bm25_retriever.k = 5

In [43]:
customs_bm25_retriever = BM25Retriever.from_documents(
    documents=customs_splits
)
customs_bm25_retriever.k = 5

# 적절한 HS CODE 찾는 프로세스

> 텍스트의 jsonl 한 줄 들어옴

> 텍스트의 ISIC4와 통계청의 ISIC4 같은거 찾기 (metadata 끼리 비교)<br>근데 ISIC4_CODE 결측치 존재함.<br>텍스트가 답인 것 같음! 유사도 검색 수행도 해서 비교하기

> 조건 거친 통계청의 page_content와(topk(아마 k=5 예상)) 텍스트의 page_content를 컨텍스트로 주고, 관세청의 page_content와 비교

> 관세청 HS_CODE topk(k >= 10) 추출

> 위 과정에서 레퍼런스 잘 챙기기 

## Emsemble: 통계청

In [59]:
query = text_documents[1].page_content
print(query)
print(text_documents[1].metadata['CODE'])

general farms, primarily animals, nsk  derives 50 percent or more of its total value  of sales of agricultural products from livestock and animal specialties and their products, but less than 50 percent from products of any single three-digit industry group. crop and animal production general farms, primarily animals all other animal production mixed farming raising of other animals mixed farmingother livestock farming not elsewhere classified
0149


In [67]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[statis_bm25_retriever, statis_faiss_retriever],
    weights=[0.3, 0.7],
)

In [68]:
ensemble_result = ensemble_retriever.invoke(query)
bm25_result = statis_bm25_retriever.invoke(query)
faiss_result = statis_faiss_retriever.invoke(query)

In [69]:
print("[Ensemble Retriever]")
pprint(ensemble_result)
print('\n')
print("[BM25 Retriever]")
pprint(bm25_result)
print('\n')
print("[FAISS Retriever]")
pprint(faiss_result)

[Ensemble Retriever]
[Document(page_content='낙타과 동물 사육업\r\n그 외 기타 축산업\r\n기타', metadata={'ISIC4_CODE': '0143', 'KSIC10_CODE': '01299', 'HS2017_CODE': '510219', 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl', 'seq_num': 367}),
 Document(page_content='기타 축산업\r\n기타 가금류 및 조류 사육업\r\n맹금류', metadata={'ISIC4_CODE': '0149', 'KSIC10_CODE': '01239', 'HS2017_CODE': '010631', 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl', 'seq_num': 325}),
 Document(page_content='기타 축산업\r\n그 외 기타 축산업\r\n따로 분류되지 않은 식용인 동물성 생산품', metadata={'ISIC4_CODE': '0149', 'KSIC10_CODE': '01299', 'HS2017_CODE': '041000', 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl', 'seq_num': 362}),
 Document(page_content='가금류 사육업\r\n기타 가금류 및 조류 사육업\r\n기타', metadata={'ISIC4_CODE': '0146', 'KSIC10_CODE': '01239', 'HS2017_CODE': '040729', 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl', 'seq_num': 349}),
 Document(page_content='표백하지 않은 것', metadat

## Emsemble: 통계청

In [70]:
query = text_documents[1].page_content
print(query)
print(text_documents[1].metadata['CODE'])

general farms, primarily animals, nsk  derives 50 percent or more of its total value  of sales of agricultural products from livestock and animal specialties and their products, but less than 50 percent from products of any single three-digit industry group. crop and animal production general farms, primarily animals all other animal production mixed farming raising of other animals mixed farmingother livestock farming not elsewhere classified
0149


In [71]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[customs_bm25_retriever, customs_faiss_retriever],
    weights=[0.3, 0.7],
)

In [72]:
ensemble_result = ensemble_retriever.invoke(query)
bm25_result = customs_bm25_retriever.invoke(query)
faiss_result = customs_faiss_retriever.invoke(query)

In [74]:
print("[Ensemble Retriever]")
for res in ensemble_result:
    print(res.page_content)
print('\n')
print("[BM25 Retriever]")
for res in bm25_result:
    print(res.page_content)
print('\n')
print("[FAISS Retriever]")
for res in faiss_result:
    print(res.page_content)

[Ensemble Retriever]
Fertilisers produced by the mixing or chemical treatment of animal
agricultural purposes
Agricultural fertilizers or for manufacturing agricultural
Of machinery for the extraction or preparation of animal or fixed
Dead animals, other than products of dead animals of Chapter 3
Posts and beams other than products of subheadings 4418.81 to
Cobalt mattes and other intermediate products of cobalt metallurgy
, tubs and other coopers' products and parts thereof, of wood
diffusion and oxidation furnaces for production of semiconductor


[BM25 Retriever]
Dead animals, other than products of dead animals of Chapter 3
Posts and beams other than products of subheadings 4418.81 to
Cobalt mattes and other intermediate products of cobalt metallurgy
, tubs and other coopers' products and parts thereof, of wood
diffusion and oxidation furnaces for production of semiconductor


[FAISS Retriever]
Fertilisers produced by the mixing or chemical treatment of animal
agricultural purposes