In [66]:
from dotenv import load_dotenv
load_dotenv()

import os
import re
import csv
import json
import time
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm

from openai import OpenAI

from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_upstage import ChatUpstage
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [13]:
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
OPENAI_ORG_KEY = os.environ.get('OPENAI_ORG_KEY')
UPSTAGE_API_KEY = os.environ.get('UPSTAGE_API_KEY')
LANGCHAIN_API_KEY = os.environ.get('LANGCHAIN_API_KEY')
# os.environ['LANGCHAIN_PROJECT'] = 'matching_model_EXP06' # 프로젝트명 수정
# os.environ['LANGCHAIN_TRACING_V2'] = 'true'
LANGCHAIN_PROJECT = os.environ.get('LANGCHAIN_PROJECT')

print(f'> LangSmith Project: {LANGCHAIN_PROJECT}')

> LangSmith Project: LangSmith에_표기할_프로젝트명


# 데이터 구성

> 전처리, csv to jsonl

In [3]:
# 데이터 로드

text = pd.read_excel('../data/비식별된 해외기업별 영문 텍스트데이터.xlsx')
statis = pd.read_excel('../data/통계청 국제표준산업분류 HSCODE 6단위 매핑.xlsx')
customs = pd.read_excel('../data/관세청_HS부호_240101.xlsx')

text_copy = text.copy()
statis_copy = statis.copy()
customs_copy = customs.copy()

print('> 데이터 로드 완료')


# 데이터 전처리

def zero_input(num, x):
    if pd.isna(x):
        return np.nan
    else:
        cnt = num - len(x)
        return '0' * cnt + x
    
def re_sub(x):
    if pd.isna(x):
        return np.nan
    else:
        return re.sub(r'^\((.*?)\)$', r'\1', x)

text_copy['ID'] = text_copy['ID'].astype(str)
text_copy['CODE'] = text_copy['CODE'].astype(str)
text_copy['CODE'] = text_copy['CODE'].apply(lambda x: zero_input(4, x))

statis_copy.columns = [
    'ISIC4_CODE', # ISIC4_국제표준산업분류
    'ISIC4_NAME', # ISIC4_분류명
    'KSIC10_CODE', # KSIC10_한국표준산업분류
    'KSIC10_NAME', # KSIC10_분류명
    'HS2017_CODE', # HS2017_관세통계통합품목분류
    'HS2017_NAME' # HS2017_분류명
]

statis_copy['ISIC4_CODE'] = statis_copy['ISIC4_CODE'].astype(str)
statis_copy['ISIC4_CODE'] = statis_copy['ISIC4_CODE'].replace('nan', np.nan)
statis_copy['ISIC4_CODE'] = statis_copy['ISIC4_CODE'].str.replace('.0', '', regex=False)
statis_copy['ISIC4_CODE'] = statis_copy['ISIC4_CODE'].apply(lambda x: zero_input(4, x))

statis_copy['HS2017_CODE'] = statis_copy['HS2017_CODE'].astype(str)
statis_copy['HS2017_CODE'] = statis_copy['HS2017_CODE'].replace('nan', np.nan)
statis_copy['HS2017_CODE'] = statis_copy['HS2017_CODE'].str.replace('.0', '', regex=False)
statis_copy['HS2017_CODE'] = statis_copy['HS2017_CODE'].apply(lambda x: zero_input(6, x))

customs_copy.columns = [
    'HS_CODE', # HS부호
    'KOR_NAME', # 한글품목명
    'ENG_NAME', # 영문품목명
    'INT_CODE', # 성질통합분류코드
    'INT_NAME' # 성질통합분류명
]

customs_copy['HS_CODE'] = customs_copy['HS_CODE'].astype(str)
customs_copy['HS_CODE'] = customs_copy['HS_CODE'].apply(lambda x: zero_input(10, x))

customs_copy['INT_CODE'] = customs_copy['INT_CODE'].astype(str)
customs_copy['INT_CODE'] = customs_copy['INT_CODE'].replace('nan', np.nan)
customs_copy['INT_CODE'] = customs_copy['INT_CODE'].str.replace('.0', '', regex=False)

customs_copy['INT_NAME'] = customs_copy['INT_NAME'].apply(lambda x: re_sub(x))

text_copy = text_copy.fillna(' ')
statis_copy = statis_copy.fillna(' ')
customs_copy = customs_copy.fillna(' ')

print('> 데이터 전처리 완료')
print('> 데이터 결측치 확인')
print('-----' * 5)
print(text_copy.isnull().sum())
print(statis_copy.isnull().sum())
print(customs_copy.isnull().sum())
print('-----' * 5)


# 데이터 저장 및 로드

text_copy.to_csv('../data/prepro_text.csv', index=False, encoding='utf-8')
statis_copy.to_csv('../data/prepro_statis.csv', index=False, encoding='utf-8')
customs_copy.to_csv('../data/prepro_customs.csv', index=False, encoding='utf-8')

text_prepro = pd.read_csv('../data/prepro_text.csv', dtype=str)
statis_prepro = pd.read_csv('../data/prepro_statis.csv', dtype=str)
customs_prepro = pd.read_csv('../data/prepro_customs.csv', dtype=str)


# csv to jsonl

def csv_to_jsonl(csv_file_path, jsonl_file_path):
    with open(csv_file_path, mode='r', encoding='utf-8') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        
        with open(jsonl_file_path, mode='w', encoding='utf-8') as jsonl_file:
            for row in csv_reader:
                jsonl_file.write(json.dumps(row, ensure_ascii=False) + '\n')

csv_to_jsonl('../data/prepro_text.csv', '../data/jsonl_prepro_text.jsonl')
csv_to_jsonl('../data/prepro_statis.csv', '../data/jsonl_prepro_statis.jsonl')
csv_to_jsonl('../data/prepro_customs.csv', '../data/jsonl_prepro_customs.jsonl')
print('> csv to jsonl 완료')

> 데이터 로드 완료
> 데이터 전처리 완료
> 데이터 결측치 확인
-------------------------
ID      0
CODE    0
DSC     0
dtype: int64
ISIC4_CODE     0
ISIC4_NAME     0
KSIC10_CODE    0
KSIC10_NAME    0
HS2017_CODE    0
HS2017_NAME    0
dtype: int64
HS_CODE     0
KOR_NAME    0
ENG_NAME    0
INT_CODE    0
INT_NAME    0
dtype: int64
-------------------------
> csv to jsonl 완료


# Document 구성

## text

In [4]:
file_path = '../data/jsonl_prepro_text.jsonl'
temp = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        temp.append(json.loads(line.strip()))

seq_num = 1
text_documents = []
for data in temp:
    doc = Document(
        page_content=data['DSC'], 
        metadata={
            'ID': data['ID'],
            'CODE': data['CODE'],
            'source': '/root/contest-matching-model/data/jsonl_prepro_text.jsonl',
            'seq_num': seq_num,
        }
    )
    text_documents.append(doc)
    seq_num += 1

pprint(text_documents[0])
print(text_documents[0].page_content)
pprint(text_documents[0].metadata)

Document(page_content='automotive repair shops, nec  specialized automotive repair, not elsewhere classified, such as fuel service carburetor repair, brake relining, front-end and wheel alignment, and radiator repair. motor vehicle repair and maintenance auto brake lining, installation other automotive mechanical and electrical repair and maintenance maintenance and repair of motor vehicles maintenance and repair of motor vehicles maintenance and repair of motor vehiclesother automotive repair and maintenance', metadata={'ID': '1', 'CODE': '4520', 'source': '/root/contest-matching-model/data/jsonl_prepro_text.jsonl', 'seq_num': 1})
automotive repair shops, nec  specialized automotive repair, not elsewhere classified, such as fuel service carburetor repair, brake relining, front-end and wheel alignment, and radiator repair. motor vehicle repair and maintenance auto brake lining, installation other automotive mechanical and electrical repair and maintenance maintenance and repair of moto

## statis

In [5]:
file_path = '../data/jsonl_prepro_statis.jsonl'
temp = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        temp.append(json.loads(line.strip()))

seq_num = 1
statis_documents = []
for data in temp:
    doc = Document(
        page_content=f"{data['ISIC4_NAME']}\r\n{data['KSIC10_NAME']}\r\n{data['HS2017_NAME']}", # ISIC4, KSIC10, HS2017 순으로 작성됨
        metadata={
            'ISIC4_CODE': data['ISIC4_CODE'],
            'KSIC10_CODE': data['KSIC10_CODE'],
            'HS2017_CODE': data['HS2017_CODE'],
            'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl',
            'seq_num': seq_num,
        }
    )
    statis_documents.append(doc)
    seq_num += 1

pprint(statis_documents[0])
print(statis_documents[0].page_content)
pprint(statis_documents[0].metadata)

Document(page_content='곡물(쌀 제외), 콩류, 종실유 재배업\r\n종자 및 묘목 생산업\r\n종자', metadata={'ISIC4_CODE': '0111', 'KSIC10_CODE': '01123', 'HS2017_CODE': '100111', 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl', 'seq_num': 1})
곡물(쌀 제외), 콩류, 종실유 재배업
종자 및 묘목 생산업
종자
{'HS2017_CODE': '100111',
 'ISIC4_CODE': '0111',
 'KSIC10_CODE': '01123',
 'seq_num': 1,
 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl'}


## customs

In [6]:
file_path = '../data/jsonl_prepro_customs.jsonl'
temp = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        temp.append(json.loads(line.strip()))

seq_num = 1
customs_documents = []
for data in temp:
    doc = Document(
        page_content=f"{data['KOR_NAME']}\r\n{data['ENG_NAME']}\r\n{data['INT_NAME']}", # 한글품목명, 영어품목명, 성질 통합 분류명 순으로 작성됨
        metadata={
            'HS_CODE': data['HS_CODE'],
            'INT_CODE': data['INT_CODE'],
            'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl',
            'seq_num': seq_num,
        }
    )
    customs_documents.append(doc)
    seq_num += 1

pprint(customs_documents[0])
print(customs_documents[0].page_content)
pprint(customs_documents[0].metadata)

Document(page_content='농가 사육용\r\nFor farm breeding\r\n말', metadata={'HS_CODE': '0101211000', 'INT_CODE': '11020101', 'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl', 'seq_num': 1})
농가 사육용
For farm breeding
말
{'HS_CODE': '0101211000',
 'INT_CODE': '11020101',
 'seq_num': 1,
 'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl'}


# OpenAI Summarization + Translation

In [41]:
text_dup = text_prepro.drop_duplicates(subset='DSC').reset_index(drop=True)
text_dup

Unnamed: 0,ID,CODE,DSC
0,1,4520,"automotive repair shops, nec specialized auto..."
1,2,0149,"general farms, primarily animals, nsk derives..."
2,3,4630,fish and seafoods the wholesale distribution ...
3,4,4510,"new and used car dealers, nsk manufactures a ..."
4,4,2930,"automotive stampings, nsk manufacturing autom..."
...,...,...,...
1713,10634,4669,"printing and writing paper, nsk the wholesale..."
1714,10635,1410,leather and sheep-lined clothing manufacturin...
1715,10636,4652,"electronic parts and equipment, nec, nsk the ..."
1716,10642,1104,"bottled and canned soft drinks, nsk manufactu..."


In [23]:
def summary_prompt(text):
    prompt = (
        "Please summarize the following English text, ensuring that the summary captures the main points and key information from the original text:\n\n"
        "{text}\n\n"
        "Then translate the summary into Korean.\n\n"
        "Desired output format:\n"
        "Summary: <English summary>\n"
        "Korean: <Korean translation>"
    )
    return prompt.format(text=text)

def get_summary(text):
    prompt = summary_prompt(text=text)
    completion = client.chat.completions.create(
        model='gpt-3.5-turbo-0125',
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    return completion.choices[0].message.content

client = OpenAI(
    api_key=OPENAI_API_KEY,
    organization=OPENAI_ORG_KEY
)

file_path = '../data/jsonl_summary_text.jsonl'
with open(file_path, 'a', encoding='utf-8') as file:
    for idx in range(text_dup.shape[0]):
        text = text_dup.loc[idx, 'DSC']
        response = get_summary(text=text)

        output = {
            "ID": text_dup.loc[idx, 'ID'], 
            "CODE": text_dup.loc[idx, 'CODE'], 
            "DSC": response
        }
        file.write(f'{json.dumps(output, ensure_ascii=False)}\n')

        print('-----' * 7)
        print(f'{idx}.\nOrginal: {text}\n{response}')

-----------------------------------
0.
Orginal: automotive repair shops, nec  specialized automotive repair, not elsewhere classified, such as fuel service carburetor repair, brake relining, front-end and wheel alignment, and radiator repair. motor vehicle repair and maintenance auto brake lining, installation other automotive mechanical and electrical repair and maintenance maintenance and repair of motor vehicles maintenance and repair of motor vehicles maintenance and repair of motor vehiclesother automotive repair and maintenance
Summary: The text describes various specialized automotive repair services such as fuel service, carburetor repair, brake relining, wheel alignment, and radiator repair, as well as general motor vehicle repair and maintenance.

Korean: 이 텍스트는 연료 서비스, 기화기 수리, 브레이크 재설치, 휠 정렬 및 라디에이터 수리와 같은 다양한 특수 자동차 수리 서비스와 일반 자동차 수리 및 유지보수에 대해 설명하고 있습니다.
-----------------------------------
1.
Orginal: general farms, primarily animals, nsk  derives 50 percent or more of its

In [43]:
text_dup_sum_enko = text_dup.copy()

data = []
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        data.append(json.loads(line.strip()))
df = pd.DataFrame(data)

cond = df['DSC'].apply(lambda x: x.split('\n\nKorean: '))
for idx in range(len(cond)):
    text_dup_sum_enko.loc[idx, 'DSC_summary'] = cond[idx][0].strip('Summary: ')
    text_dup_sum_enko.loc[idx, 'DSC_enko'] = cond[idx][1]
text_dup_sum_enko.head()

Unnamed: 0,ID,CODE,DSC,DSC_summary,DSC_enko
0,1,4520,"automotive repair shops, nec specialized auto...",The text describes various specialized automot...,"이 텍스트는 연료 서비스, 기화기 수리, 브레이크 재설치, 휠 정렬 및 라디에이터 ..."
1,2,149,"general farms, primarily animals, nsk derives...",The text describes general farms that primaril...,"이 텍스트는 주로 가축 생산에 중점을 둔 일반 농장을 설명하며, 이러한 농장들이 가..."
2,3,4630,fish and seafoods the wholesale distribution ...,The text discusses the wholesale distribution ...,"이 텍스트는 살아 있는, 처리된 또는 냉동된 생선과 해산물의 도매 유통에 대해 다루..."
3,4,4510,"new and used car dealers, nsk manufactures a ...",The company manufactures vehicles under Fiat a...,회사는 미국과 멕시코의 다수 공장에서 피아트와 크라이슬러 브랜드를 통해 자동차를 생...
4,4,2930,"automotive stampings, nsk manufacturing autom...",The text discusses the manufacturing of automo...,"요약: 이 텍스트는 차량 부품인 바디 부품, 허브 및 트림을 비롯한 자동차 스탬핑의..."


In [44]:
text_sum_enko = text_prepro.copy()

for i in tqdm(range(text_sum_enko.shape[0])):
    for j in range(text_dup_sum_enko.shape[0]):
        if text_sum_enko.loc[i, 'DSC'] == text_dup_sum_enko.loc[j, 'DSC']:
            text_sum_enko.loc[i, 'DSC_summary'] = text_dup_sum_enko.loc[j, 'DSC_summary']
            text_sum_enko.loc[i, 'DSC_enko'] = text_dup_sum_enko.loc[j, 'DSC_enko']
text_sum_enko.head()

100%|██████████| 10000/10000 [02:55<00:00, 56.83it/s]


Unnamed: 0,ID,CODE,DSC,DSC_summary,DSC_enko
0,1,4520,"automotive repair shops, nec specialized auto...",The text describes various specialized automot...,"이 텍스트는 연료 서비스, 기화기 수리, 브레이크 재설치, 휠 정렬 및 라디에이터 ..."
1,2,149,"general farms, primarily animals, nsk derives...",The text describes general farms that primaril...,"이 텍스트는 주로 가축 생산에 중점을 둔 일반 농장을 설명하며, 이러한 농장들이 가..."
2,3,4630,fish and seafoods the wholesale distribution ...,The text discusses the wholesale distribution ...,"이 텍스트는 살아 있는, 처리된 또는 냉동된 생선과 해산물의 도매 유통에 대해 다루..."
3,4,4510,"new and used car dealers, nsk manufactures a ...",The company manufactures vehicles under Fiat a...,회사는 미국과 멕시코의 다수 공장에서 피아트와 크라이슬러 브랜드를 통해 자동차를 생...
4,4,2930,"automotive stampings, nsk manufacturing autom...",The text discusses the manufacturing of automo...,"요약: 이 텍스트는 차량 부품인 바디 부품, 허브 및 트림을 비롯한 자동차 스탬핑의..."


# Document 구성

## text

In [59]:
seq_num = 1
text_documents = []
for idx in range(text_sum_enko.shape[0]):
    doc = Document(
        page_content=f"{text_sum_enko.loc[idx, 'DSC_summary']}\n{text_sum_enko.loc[idx, 'DSC_enko']}", 
        metadata={
            'ID': text_sum_enko.loc[idx, 'ID'],
            'CODE': text_sum_enko.loc[idx, 'CODE'],
            'source': '/root/contest-matching-model/data/jsonl_summary_text.jsonl',
            'seq_num': seq_num,
        }
    )
    text_documents.append(doc)
    seq_num += 1

pprint(text_documents[0])
print(text_documents[0].page_content)
pprint(text_documents[0].metadata)

Document(page_content='The text describes various specialized automotive repair services such as fuel service, carburetor repair, brake relining, wheel alignment, and radiator repair, as well as general motor vehicle repair and maintenance.\n이 텍스트는 연료 서비스, 기화기 수리, 브레이크 재설치, 휠 정렬 및 라디에이터 수리와 같은 다양한 특수 자동차 수리 서비스와 일반 자동차 수리 및 유지보수에 대해 설명하고 있습니다.', metadata={'ID': '1', 'CODE': '4520', 'source': '/root/contest-matching-model/data/jsonl_summary_text.jsonl', 'seq_num': 1})
The text describes various specialized automotive repair services such as fuel service, carburetor repair, brake relining, wheel alignment, and radiator repair, as well as general motor vehicle repair and maintenance.
이 텍스트는 연료 서비스, 기화기 수리, 브레이크 재설치, 휠 정렬 및 라디에이터 수리와 같은 다양한 특수 자동차 수리 서비스와 일반 자동차 수리 및 유지보수에 대해 설명하고 있습니다.
{'CODE': '4520',
 'ID': '1',
 'seq_num': 1,
 'source': '/root/contest-matching-model/data/jsonl_summary_text.jsonl'}


## statis

In [53]:
file_path = '../data/jsonl_prepro_statis.jsonl'
temp = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        temp.append(json.loads(line.strip()))

seq_num = 1
statis_documents = []
for data in temp:
    doc = Document(
        page_content=f"{data['ISIC4_NAME']}\r\n{data['KSIC10_NAME']}\r\n{data['HS2017_NAME']}", # ISIC4, KSIC10, HS2017 순으로 작성됨
        metadata={
            'ISIC4_CODE': data['ISIC4_CODE'],
            'KSIC10_CODE': data['KSIC10_CODE'],
            'HS2017_CODE': data['HS2017_CODE'],
            'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl',
            'seq_num': seq_num,
        }
    )
    statis_documents.append(doc)
    seq_num += 1

pprint(statis_documents[0])
print(statis_documents[0].page_content)
pprint(statis_documents[0].metadata)

Document(page_content='곡물(쌀 제외), 콩류, 종실유 재배업\r\n종자 및 묘목 생산업\r\n종자', metadata={'ISIC4_CODE': '0111', 'KSIC10_CODE': '01123', 'HS2017_CODE': '100111', 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl', 'seq_num': 1})
곡물(쌀 제외), 콩류, 종실유 재배업
종자 및 묘목 생산업
종자
{'HS2017_CODE': '100111',
 'ISIC4_CODE': '0111',
 'KSIC10_CODE': '01123',
 'seq_num': 1,
 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl'}


## customs

In [54]:
file_path = '../data/jsonl_prepro_customs.jsonl'
temp = []
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        temp.append(json.loads(line.strip()))

seq_num = 1
customs_documents = []
for data in temp:
    doc = Document(
        page_content=f"{data['KOR_NAME']}\r\n{data['ENG_NAME']}\r\n{data['INT_NAME']}", # 한글품목명, 영어품목명, 성질 통합 분류명 순으로 작성됨
        metadata={
            'HS_CODE': data['HS_CODE'],
            'INT_CODE': data['INT_CODE'],
            'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl',
            'seq_num': seq_num,
        }
    )
    customs_documents.append(doc)
    seq_num += 1

pprint(customs_documents[0])
print(customs_documents[0].page_content)
pprint(customs_documents[0].metadata)

Document(page_content='농가 사육용\r\nFor farm breeding\r\n말', metadata={'HS_CODE': '0101211000', 'INT_CODE': '11020101', 'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl', 'seq_num': 1})
농가 사육용
For farm breeding
말
{'HS_CODE': '0101211000',
 'INT_CODE': '11020101',
 'seq_num': 1,
 'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl'}


# Text Splitter

## statis

In [55]:
# content 길이 확인
length = []
content = []
for idx in range(0, len(statis_documents)):
    cond = statis_documents[idx].page_content
    a = len(cond)
    b = cond
    length.append(a)
    content.append(b)

df = pd.DataFrame({
    'length': length,
    'content': content
})

In [56]:
df.head()

Unnamed: 0,length,content
0,38,"곡물(쌀 제외), 콩류, 종실유 재배업\r\n종자 및 묘목 생산업\r\n종자"
1,8,\r\n \r\n종자
2,43,"곡물(쌀 제외), 콩류, 종실유 재배업\r\n곡물 및 기타 식량작물 재배업\r\n기타"
3,8,\r\n \r\n기타
4,39,"곡물(쌀 제외), 콩류, 종실유 재배업\r\n종자 및 묘목 생산업\r\n종자용"


In [57]:
df.describe()

Unnamed: 0,length
count,8519.0
mean,31.357554
std,28.675302
min,7.0
25%,13.0
50%,24.0
75%,40.0
max,299.0


In [58]:
df[df['length'] == 7].reset_index().loc[0, 'content']

' \r\n \r\n '

In [60]:
# # Split
# splitter = RecursiveCharacterTextSplitter(
#     separators=['\r\n', '. ', ', ', ' ', ''],
#     chunk_size=100,
#     chunk_overlap=0,
#     length_function=len,
# )
# statis_splits = splitter.split_documents(statis_documents)

# print(f'> Text Splitter 적용 전 문서 개수: {len(statis_documents)}\n> Text Splitter 적용 후 문서 개수: {len(statis_splits)}')
# print(f'\n\nText Splitter 적용 전 page_content:\n{statis_documents[0].page_content}\n\nText Splitter 적용 후 page_content:\n{statis_splits[0].page_content}')

## customs

In [61]:
# content 길이 확인
length = []
content = []
for idx in range(0, len(customs_documents)):
    cond = customs_documents[idx].page_content
    a = len(cond)
    b = cond
    length.append(a)
    content.append(b)

df = pd.DataFrame({
    'length': length,
    'content': content
})

In [62]:
df.head()

Unnamed: 0,length,content
0,28,농가 사육용\r\nFor farm breeding\r\n말
1,12,기타\r\nOther\r\n말
2,25,경주말\r\nHorses for racing\r\n말
3,12,기타\r\nOther\r\n말
4,19,당나귀\r\nAsses\r\n기타 산 동물


In [63]:
df.describe()

Unnamed: 0,length
count,12422.0
mean,47.491547
std,49.001142
min,10.0
25%,19.0
50%,32.0
75%,56.0
max,869.0


In [64]:
df[df['length'] >= 800].reset_index().loc[0, 'content']

'틸라피아[오레오크로미스(Oreochromis)속], 메기[판가시우스(Pangasius)속ㆍ실루러스(Silurus)속ㆍ클라리아스(Clarias)속ㆍ익타루러스(Ictalurus)속], 잉어[사이프리너스(Cyprinus)속ㆍ카라시우스(Carassius)속ㆍ크테노파린고돈 이델루스(Ctenopharyngodon idellus)ㆍ하이포프탈미크티스(Hypophthalmichthys)속ㆍ시리누스(Cirrhinus)속ㆍ마일로파린고돈 피세우스(Mylopharyngodon piceus)ㆍ카틀라 카틀라(Catla catla)ㆍ라베오(Labeo)속ㆍ오스테오킬루스 하셀티(Osteochilus hasselti)ㆍ렙토바르부스 호에베니(Leptobarbus hoeveni)ㆍ메갈로브라마(Megalobrama)속], 뱀장어[앙귈라(Anguilla)속], 나일 퍼치[라테스 니로티쿠스(Lates niloticus)], 가물치[카나(Channa)속]\r\nTilapias (Oreochromis spp.), catfish (Pangasius spp., Silurus spp., Clarias spp., Ictalurus spp.), carp (Cyprinus spp., Carassius spp., Ctenopharyngodon idellus, Hypophthalmichthys spp., Cirrhinus spp., Mylopharyngodon piceus, Catla catla, Labeo spp., Osteochilus hasselti, Leptobarbus hoeveni, Megalobrama spp.), eels (Anguilla spp.), Nile perch (Lates niloticus) and snakeheads (Channa spp.)\r\n기타 어류(훈제)'

In [65]:
# # Split
# splitter = RecursiveCharacterTextSplitter(
#     separators=['\r\n', '. ', ', ', ' ', ''],
#     chunk_size=70,
#     chunk_overlap=0,
#     length_function=len,
# )
# customs_splits = splitter.split_documents(customs_documents)

# print(f'> Text Splitter 적용 전 문서 개수: {len(customs_documents)}\n> Text Splitter 적용 후 문서 개수: {len(customs_splits)}')
# print(f'\n\nText Splitter 적용 전 page_content:\n{customs_documents[0].page_content}\n\nText Splitter 적용 후 page_content:\n{customs_splits[0].page_content}')

# 벡터스토어 생성

> 통계청, 관세청만 해당함 (텍스트는 인풋 값이어서 벡터스토어에 안 넣음)

In [67]:
# Embedding
embeddings = OpenAIEmbeddings(
    api_key=OPENAI_API_KEY,
    model='text-embedding-3-large'
)

## statis

In [68]:
name = 'statis'
folder_path = f'./vectorstore/EXP05/{name}'
if not os.path.exists(folder_path):
    print(f'> "{folder_path}" 생성 중')
    statis_vectorstore = FAISS.from_documents(
        documents=statis_documents,
        embedding=embeddings,
    )
    statis_vectorstore.save_local(folder_path=folder_path)
    print(f'> "{folder_path}" 생성 및 로컬 저장 완료')
else:
    statis_vectorstore = FAISS.load_local(
        folder_path=folder_path, 
        embeddings=embeddings, 
        allow_dangerous_deserialization=True
    )
    print(f'> "{folder_path}" 로컬에서 불러옴')

> "./vectorstore/EXP05/statis" 생성 중
> "./vectorstore/EXP05/statis" 생성 및 로컬 저장 완료


In [87]:
statis_retriever = statis_vectorstore.as_retriever(search_kwargs={'k': 3})

## customs

In [88]:
name = 'customs'
folder_path = f'./vectorstore/EXP05/{name}'
if not os.path.exists(folder_path):
    print(f'> "{folder_path}" 생성 중')
    customs_vectorstore = FAISS.from_documents(
        documents=customs_documents,
        embedding=embeddings,
    )
    customs_vectorstore.save_local(folder_path=folder_path)
    print(f'> "{folder_path}" 생성 및 로컬 저장 완료')
else:
    customs_vectorstore = FAISS.load_local(
        folder_path=folder_path, 
        embeddings=embeddings, 
        allow_dangerous_deserialization=True
    )
    print(f'> "{folder_path}" 로컬에서 불러옴')

> "./vectorstore/EXP05/customs" 로컬에서 불러옴


In [89]:
customs_retriever = customs_vectorstore.as_retriever(search_kwargs={'k': 3})

# 적절한 HS CODE 찾는 프로세스

> 텍스트의 jsonl 한 줄 들어옴

> 텍스트의 ISIC4와 통계청의 ISIC4 같은거 찾기 (metadata 끼리 비교)<br>근데 ISIC4_CODE 결측치 존재함.<br>텍스트가 답인 것 같음! 유사도 검색 수행도 해서 비교하기

> 조건 거친 통계청의 page_content와(topk(아마 k=5 예상)) 텍스트의 page_content를 컨텍스트로 주고, 관세청의 page_content와 비교

> 관세청 HS_CODE topk(k >= 10) 추출

> 위 과정에서 레퍼런스 잘 챙기기 

## statis query = text

In [90]:
query = text_documents[0].page_content

statis_results = statis_retriever.invoke(query)

i = 1
statis_query = ''
print('[statis]')
for res in statis_results:
    print('-----' * 7)
    print(f'{i}.')
    print(res.page_content)
    statis_query += f'\n{res.page_content}'
    i += 1
print('-----' * 7)
print(statis_query)

[statis]
-----------------------------------
1.
자동차 정비 및 수리업
자동차 전문 수리업
 
-----------------------------------
2.
자동차 정비 및 수리업
자동차 종합 수리업
 
-----------------------------------
3.
자동차 정비 및 수리업
자동차 종합 수리업
 
-----------------------------------

자동차 정비 및 수리업
자동차 전문 수리업
 
자동차 정비 및 수리업
자동차 종합 수리업
 
자동차 정비 및 수리업
자동차 종합 수리업
 


## customs query = text

In [91]:
customs_results = customs_retriever.invoke(query)

i = 1
print('[customs]')
for res in customs_results:
    print('-----' * 7)
    print(f'{i}.')
    print(res.page_content)
    i += 1

[customs]
-----------------------------------
1.
자동차용
For automobiles
자동차 부품
-----------------------------------
2.
승용자동차용[스테이션왜건(station wagon)과 경주 자동차용을 포함한다]
Of a kind used on motor cars (including station wagons and racing cars)
고무타이어 및 타이어튜브
-----------------------------------
3.
승용자동차용[스테이션왜건(station wagon)과 경주 자동차용을 포함한다]
Of a kind used on motor cars (including station wagons and racing cars)
고무타이어 및 타이어튜브


## customs query = statis topk

In [92]:
customs_results = customs_retriever.invoke(statis_query)

i = 1
print('[customs]')
for res in customs_results:
    print('-----' * 7)
    print(f'{i}.')
    print(res.page_content)
    i += 1

[customs]
-----------------------------------
1.
자동차용
For automobiles
자동차 부품
-----------------------------------
2.
부분품
Parts
자동차 부품
-----------------------------------
3.
부분품
Parts
자동차 부품


## 전체

In [104]:
# vectorstore as retriever
statis_retriever = statis_vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 3})
customs_retriever = customs_vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 3})

# Create directory
statis_file_path = '../submit/EXP05/statis.jsonl'
os.makedirs(os.path.dirname(statis_file_path), exist_ok=True)
customs_file_path = '../submit/EXP05/customs.jsonl'
os.makedirs(os.path.dirname(customs_file_path), exist_ok=True)

# IR
for text_document in tqdm(text_documents):
    # query
    query = text_document.page_content

    # statis retriever
    statis_results = statis_retriever.invoke(query)

    # Save References and Create new query
    idx = 0
    statis_query = ''
    with open(statis_file_path, 'a', encoding='utf-8') as ref:
        statis_references = {
            "ISIC4_CODE": "", 
            "KSIC10_CODE": "", 
            "HS2017_CODE": "", 
            "result": ""
        }        
        isic4 = [reference.metadata['ISIC4_CODE'] for reference in statis_results]
        ksic10 = [reference.metadata['KSIC10_CODE'] for reference in statis_results]
        hs2017 = [reference.metadata['HS2017_CODE'] for reference in statis_results]
        result = [reference.page_content for reference in statis_results]
        statis_references['ISIC4_CODE'] = isic4
        statis_references['KSIC10_CODE'] = ksic10
        statis_references['HS2017_CODE'] = hs2017
        statis_references['result'] = result

        ref.write(f'{json.dumps(statis_references, ensure_ascii=False)}\n')

        for res in statis_results:
            statis_query += f'\n{res.page_content}'
        
        # customs retriever
        customs_results = customs_retriever.invoke(statis_query)

        # Save References
        with open(customs_file_path, 'a', encoding='utf-8') as ref:
            customs_references = {
                "HS_CODE": "", 
                "INT_CODE": "", 
                "result": ""
            }        
            hscode = [reference.metadata['HS_CODE'] for reference in customs_results]
            intcode = [reference.metadata['INT_CODE'] for reference in customs_results]
            result = [reference.page_content for reference in customs_results]
            customs_references['HS_CODE'] = hscode
            customs_references['INT_CODE'] = intcode
            customs_references['result'] = result

            ref.write(f'{json.dumps(customs_references, ensure_ascii=False)}\n')

100%|██████████| 10000/10000 [2:45:10<00:00,  1.01it/s]  
