In [13]:
from dotenv import load_dotenv
load_dotenv()

import os
import re
import csv
import json
import time
import numpy as np
import pandas as pd
from pprint import pprint
from tqdm import tqdm

from langchain import hub
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.vectorstores import FAISS
from langchain_upstage import ChatUpstage
from langchain.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [3]:
# API credentials, read from the process environment (None when a variable is unset).
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
UPSTAGE_API_KEY = os.getenv('UPSTAGE_API_KEY')
LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY')

# Optional in-notebook overrides for LangSmith tracing; currently disabled.
# os.environ['LANGCHAIN_PROJECT'] = 'matching_model_EXP06'  # edit the project name here
# os.environ['LANGCHAIN_TRACING_V2'] = 'true'
LANGCHAIN_PROJECT = os.getenv('LANGCHAIN_PROJECT')

print(f'> LangSmith Project: {LANGCHAIN_PROJECT}')

> LangSmith Project: matching_model_EXP06


# 데이터 구성

> 전처리, csv to jsonl

In [4]:
# Load the three raw Excel workbooks (company text, statistics-office mapping, customs HS codes).
text, statis, customs = (
    pd.read_excel(excel_path)
    for excel_path in (
        '../data/비식별된 해외기업별 영문 텍스트데이터.xlsx',
        '../data/통계청 국제표준산업분류 HSCODE 6단위 매핑.xlsx',
        '../data/관세청_HS부호_240101.xlsx',
    )
)

# All preprocessing below works on copies so the frames as loaded stay untouched.
text_copy, statis_copy, customs_copy = text.copy(), statis.copy(), customs.copy()

print('> 데이터 로드 완료')


# 데이터 전처리

def zero_input(num, x):
    """Left-pad the string ``x`` with '0' characters up to a width of ``num``.

    Args:
        num: Target width of the returned string.
        x: String to pad, or a missing value.

    Returns:
        ``np.nan`` when ``x`` is missing (per ``pd.isna``); otherwise ``x``
        prefixed with as many '0' characters as needed to reach ``num``
        characters. Strings already ``num`` characters or longer are returned
        unchanged.
    """
    return np.nan if pd.isna(x) else x.rjust(num, '0')
    
def re_sub(x):
    """Strip one pair of parentheses that wraps the whole string ``x``.

    Args:
        x: String to clean, or a missing value.

    Returns:
        ``np.nan`` when ``x`` is missing (per ``pd.isna``). Otherwise, if the
        string starts with '(' and ends with ')', the text between them; any
        other string is returned unchanged.
    """
    return np.nan if pd.isna(x) else re.sub(r'^\((.*?)\)$', r'\1', x)

# IDs are kept as plain strings.
text_copy['ID'] = text_copy['ID'].astype(str)

# CODE becomes a zero-padded 4-character string.
# astype(str) turns a missing value into the literal 'nan' and renders a
# float-typed code as e.g. '4520.0'. Map 'nan' back to NaN (so zero_input
# passes it through instead of producing '0nan') and drop a trailing '.0'
# before padding, as is done for the statis/customs code columns.
text_copy['CODE'] = (
    text_copy['CODE']
    .astype(str)
    .replace('nan', np.nan)
    .str.replace(r'\.0$', '', regex=True)
    .apply(lambda x: zero_input(4, x))
)

# Rename the statistics-office mapping columns to short ASCII identifiers.
statis_copy.columns = [
    'ISIC4_CODE',   # ISIC4: international standard industrial classification code
    'ISIC4_NAME',   # ISIC4: classification name
    'KSIC10_CODE',  # KSIC10: Korean standard industrial classification code
    'KSIC10_NAME',  # KSIC10: classification name
    'HS2017_CODE',  # HS2017: HS item classification code
    'HS2017_NAME',  # HS2017: classification name
]

# Normalise the code columns to zero-padded strings of fixed width.
# astype(str) renders missing cells as 'nan' and float cells as e.g. '111.0':
#   - 'nan' is mapped back to NaN so zero_input passes it through untouched;
#   - only a TRAILING '.0' is removed (anchored regex), so a '.0' that
#     happens to occur elsewhere in a value is never stripped.
for code_column, code_width in (('ISIC4_CODE', 4), ('HS2017_CODE', 6)):
    statis_copy[code_column] = (
        statis_copy[code_column]
        .astype(str)
        .replace('nan', np.nan)
        .str.replace(r'\.0$', '', regex=True)
        .apply(lambda x, width=code_width: zero_input(width, x))
    )

# Rename the customs-service HS table columns to short ASCII identifiers.
customs_copy.columns = [
    'HS_CODE',   # HS code
    'KOR_NAME',  # Korean item name
    'ENG_NAME',  # English item name
    'INT_CODE',  # integrated nature-classification code
    'INT_NAME',  # integrated nature-classification name
]

# HS codes become zero-padded 10-character strings.
customs_copy['HS_CODE'] = (
    customs_copy['HS_CODE']
    .astype(str)
    .apply(lambda x: zero_input(10, x))
)

# INT_CODE: astype(str) renders missing cells as 'nan' and float cells as
# e.g. '11020101.0'. Restore NaN, then remove only a TRAILING '.0'
# (anchored regex) so a '.0' elsewhere in a value is never stripped.
customs_copy['INT_CODE'] = (
    customs_copy['INT_CODE']
    .astype(str)
    .replace('nan', np.nan)
    .str.replace(r'\.0$', '', regex=True)
)

# INT_NAME: drop a pair of parentheses wrapping the whole name.
customs_copy['INT_NAME'] = customs_copy['INT_NAME'].apply(re_sub)

# Replace every remaining missing value with a single space in all three frames.
text_copy, statis_copy, customs_copy = (
    frame.fillna(' ') for frame in (text_copy, statis_copy, customs_copy)
)

print('> 데이터 전처리 완료')
print('> 데이터 결측치 확인')

# The counts are taken after the fill above, so they confirm that no missing
# values are left rather than reporting how many there were in the raw data.
separator = '-----' * 5
print(separator)
for frame in (text_copy, statis_copy, customs_copy):
    print(frame.isna().sum())
print(separator)


# 데이터 저장 및 로드

# Write each preprocessed frame to CSV (UTF-8, without the index column) ...
for frame, csv_path in (
    (text_copy, '../data/prepro_text.csv'),
    (statis_copy, '../data/prepro_statis.csv'),
    (customs_copy, '../data/prepro_customs.csv'),
):
    frame.to_csv(csv_path, index=False, encoding='utf-8')

# ... and read them back with every column as str, so zero-padded codes keep
# their leading zeros.
text_prepro, statis_prepro, customs_prepro = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (
        '../data/prepro_text.csv',
        '../data/prepro_statis.csv',
        '../data/prepro_customs.csv',
    )
)


# csv to jsonl

def csv_to_jsonl(csv_file_path, jsonl_file_path):
    """Convert a UTF-8 CSV file with a header row into a JSON Lines file.

    Each CSV data row becomes one JSON object per output line, keyed by the
    header names, with all values as strings. Non-ASCII characters are written
    as-is (``ensure_ascii=False``).

    Args:
        csv_file_path: Path of the CSV file to read.
        jsonl_file_path: Path of the JSONL file to write; an existing file is
            overwritten.
    """
    # newline='' is what the csv module requires of its input file: without
    # it, universal-newline translation rewrites '\r\n' / '\r' embedded in
    # quoted fields to '\n' before the reader sees them.
    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csv_file, \
            open(jsonl_file_path, mode='w', encoding='utf-8') as jsonl_file:
        for row in csv.DictReader(csv_file):
            jsonl_file.write(json.dumps(row, ensure_ascii=False) + '\n')

# Export each preprocessed CSV as a JSONL file next to it.
for csv_path, jsonl_path in (
    ('../data/prepro_text.csv', '../data/jsonl_prepro_text.jsonl'),
    ('../data/prepro_statis.csv', '../data/jsonl_prepro_statis.jsonl'),
    ('../data/prepro_customs.csv', '../data/jsonl_prepro_customs.jsonl'),
):
    csv_to_jsonl(csv_path, jsonl_path)
print('> csv to jsonl 완료')

> 데이터 로드 완료
> 데이터 전처리 완료
> 데이터 결측치 확인
-------------------------
ID      0
CODE    0
DSC     0
dtype: int64
ISIC4_CODE     0
ISIC4_NAME     0
KSIC10_CODE    0
KSIC10_NAME    0
HS2017_CODE    0
HS2017_NAME    0
dtype: int64
HS_CODE     0
KOR_NAME    0
ENG_NAME    0
INT_CODE    0
INT_NAME    0
dtype: int64
-------------------------
> csv to jsonl 완료


# Document 구성

## text

In [5]:
file_path = '../data/jsonl_prepro_text.jsonl'

# Parse the JSONL export: one JSON object per line.
with open(file_path, 'r', encoding='utf-8') as file:
    temp = [json.loads(line.strip()) for line in file]

# Wrap each record in a Document: the description is the content, ID/CODE go
# into the metadata, and seq_num is the record's 1-based position in the file.
text_documents = []
for seq_num, data in enumerate(temp, start=1):
    text_documents.append(
        Document(
            page_content=data['DSC'],
            metadata={
                'ID': data['ID'],
                'CODE': data['CODE'],
                'source': '/root/contest-matching-model/data/jsonl_prepro_text.jsonl',
                'seq_num': seq_num,
            },
        )
    )

# Show the first document as a sanity check.
first_doc = text_documents[0]
pprint(first_doc)
print(first_doc.page_content)
pprint(first_doc.metadata)

Document(page_content='automotive repair shops, nec  specialized automotive repair, not elsewhere classified, such as fuel service carburetor repair, brake relining, front-end and wheel alignment, and radiator repair. motor vehicle repair and maintenance auto brake lining, installation other automotive mechanical and electrical repair and maintenance maintenance and repair of motor vehicles maintenance and repair of motor vehicles maintenance and repair of motor vehiclesother automotive repair and maintenance', metadata={'ID': '1', 'CODE': '4520', 'source': '/root/contest-matching-model/data/jsonl_prepro_text.jsonl', 'seq_num': 1})
automotive repair shops, nec  specialized automotive repair, not elsewhere classified, such as fuel service carburetor repair, brake relining, front-end and wheel alignment, and radiator repair. motor vehicle repair and maintenance auto brake lining, installation other automotive mechanical and electrical repair and maintenance maintenance and repair of moto

## statis

In [6]:
file_path = '../data/jsonl_prepro_statis.jsonl'

# Parse the JSONL export: one JSON object per line.
with open(file_path, 'r', encoding='utf-8') as file:
    temp = [json.loads(line.strip()) for line in file]

# Wrap each record in a Document. The content is the three classification
# names joined by CRLF, in ISIC4, KSIC10, HS2017 order; the matching codes go
# into the metadata, and seq_num is the record's 1-based position in the file.
statis_documents = []
for seq_num, data in enumerate(temp, start=1):
    statis_documents.append(
        Document(
            page_content=f"{data['ISIC4_NAME']}\r\n{data['KSIC10_NAME']}\r\n{data['HS2017_NAME']}",
            metadata={
                'ISIC4_CODE': data['ISIC4_CODE'],
                'KSIC10_CODE': data['KSIC10_CODE'],
                'HS2017_CODE': data['HS2017_CODE'],
                'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl',
                'seq_num': seq_num,
            },
        )
    )

# Show the first document as a sanity check.
first_doc = statis_documents[0]
pprint(first_doc)
print(first_doc.page_content)
pprint(first_doc.metadata)

Document(page_content='곡물(쌀 제외), 콩류, 종실유 재배업\r\n종자 및 묘목 생산업\r\n종자', metadata={'ISIC4_CODE': '0111', 'KSIC10_CODE': '01123', 'HS2017_CODE': '100111', 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl', 'seq_num': 1})
곡물(쌀 제외), 콩류, 종실유 재배업
종자 및 묘목 생산업
종자
{'HS2017_CODE': '100111',
 'ISIC4_CODE': '0111',
 'KSIC10_CODE': '01123',
 'seq_num': 1,
 'source': '/root/contest-matching-model/data/jsonl_prepro_statis.jsonl'}


## customs

In [7]:
file_path = '../data/jsonl_prepro_customs.jsonl'

# Parse the JSONL export: one JSON object per line.
with open(file_path, 'r', encoding='utf-8') as file:
    temp = [json.loads(line.strip()) for line in file]

# Wrap each record in a Document. The content is the Korean item name, the
# English item name and the integrated classification name joined by CRLF, in
# that order; the codes go into the metadata, and seq_num is the record's
# 1-based position in the file.
customs_documents = []
for seq_num, data in enumerate(temp, start=1):
    customs_documents.append(
        Document(
            page_content=f"{data['KOR_NAME']}\r\n{data['ENG_NAME']}\r\n{data['INT_NAME']}",
            metadata={
                'HS_CODE': data['HS_CODE'],
                'INT_CODE': data['INT_CODE'],
                'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl',
                'seq_num': seq_num,
            },
        )
    )

# Show the first document as a sanity check.
first_doc = customs_documents[0]
pprint(first_doc)
print(first_doc.page_content)
pprint(first_doc.metadata)

Document(page_content='농가 사육용\r\nFor farm breeding\r\n말', metadata={'HS_CODE': '0101211000', 'INT_CODE': '11020101', 'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl', 'seq_num': 1})
농가 사육용
For farm breeding
말
{'HS_CODE': '0101211000',
 'INT_CODE': '11020101',
 'seq_num': 1,
 'source': '/root/contest-matching-model/data/jsonl_prepro_customs.jsonl'}


# English-to-Korean translation

In [28]:
# Keep the first row for each distinct description and renumber the rows 0..n-1.
text_dup = text_prepro.drop_duplicates(subset='DSC', ignore_index=True)
text_dup

Unnamed: 0,ID,CODE,DSC
0,1,4520,"automotive repair shops, nec specialized auto..."
1,2,0149,"general farms, primarily animals, nsk derives..."
2,3,4630,fish and seafoods the wholesale distribution ...
3,4,4510,"new and used car dealers, nsk manufactures a ..."
4,4,2930,"automotive stampings, nsk manufacturing autom..."
...,...,...,...
1713,10634,4669,"printing and writing paper, nsk the wholesale..."
1714,10635,1410,leather and sheep-lined clothing manufacturin...
1715,10636,4652,"electronic parts and equipment, nec, nsk the ..."
1716,10642,1104,"bottled and canned soft drinks, nsk manufactu..."


In [36]:
# Upstage chat model used for English -> Korean translation.
chat = ChatUpstage(
    api_key=UPSTAGE_API_KEY,
    model="solar-1-mini-translate-enko"
)

file_path = '../data/jsonl_enko_text.jsonl'

# The output file is opened in append mode, so a plain re-run would translate
# every row again and append duplicate records. Count the records already on
# disk (one per line) and continue from there; with no existing file this
# starts at row 0 exactly as before.
done = 0
if os.path.exists(file_path):
    with open(file_path, 'rb') as existing:
        done = sum(1 for _ in existing)

# encoding='utf-8' is explicit because the records contain Korean text written
# with ensure_ascii=False; the platform default encoding is not guaranteed to
# be UTF-8, and every other file in this notebook is read/written as UTF-8.
with open(file_path, 'a', encoding='utf-8') as file:
    # text_dup was built with a fresh 0..n-1 index, so .loc[idx] addresses row idx.
    for idx in range(done, text_dup.shape[0]):
        message = text_dup.loc[idx, 'DSC']
        response = chat.invoke(message)

        output = {
            "ID": text_dup.loc[idx, 'ID'],
            "CODE": text_dup.loc[idx, 'CODE'],
            "DSC": response.content
        }
        file.write(f'{json.dumps(output, ensure_ascii=False)}\n')
        # Push each record to disk right away so an interrupted run keeps
        # everything translated so far.
        file.flush()

        print('-----' * 7)
        print(f'ENG: {message}\nKOR: {response.content}')

-----------------------------------
ENG: automotive repair shops, nec  specialized automotive repair, not elsewhere classified, such as fuel service carburetor repair, brake relining, front-end and wheel alignment, and radiator repair. motor vehicle repair and maintenance auto brake lining, installation other automotive mechanical and electrical repair and maintenance maintenance and repair of motor vehicles maintenance and repair of motor vehicles maintenance and repair of motor vehiclesother automotive repair and maintenance
KOR: 자동차 수리점, nec  연료 서비스 기화기 수리, 브레이크 리닝, 프론트 엔드 및 휠 정렬, 라디에이터 수리 등 기타 자동차 수리 및 유지 보수 자동차 수리 및 유지 보수 자동차 브레이크 라이닝, 설치 기타 자동차 기계 및 전기 수리 및 유지 보수 자동차 유지 보수 및 수리 자동차 유지 보수 및 수리 자동차 기타 자동차 수리 및 유지 보수
-----------------------------------
ENG: general farms, primarily animals, nsk  derives 50 percent or more of its total value  of sales of agricultural products from livestock and animal specialties and their products, but less than 50 percent from products of any singl