# **Solar API로 번역하기**
> **해당 데이터를 학습 데이터로 활용했을 때 오히려 점수가 떨어졌다는 점 참고 바랍니다.**  
> 아래 코드는 **Upstage AI Lab**의 일상 대화 요약 대회에서 **Solar API를 활용**해 **[SamSum](https://huggingface.co/datasets/samsum)** 데이터셋을 번역했던 코드입니다.   
> **DialogSum** 데이터셋과 유사한 **SamSum** 데이터셋을 번역하였으며, **DialogSum 데이터셋과 최대한 비슷해지도록** 하는 전처리 과정을 포함하고 있습니다.

In [33]:
import pandas as pd
import os
import json
import glob

import random
import re

## 1 Data Load

In [34]:
folder_path = "../data/samsum/"


def _read_json(file_name):
    """Parse one UTF-8 JSON file located under ``folder_path``."""
    with open(os.path.join(folder_path, file_name), "r", encoding='UTF8') as handle:
        return json.load(handle)


# Load the three SamSum splits.
train_data = _read_json("train.json")
test_data = _read_json("test.json")
val_data = _read_json("val.json")

In [35]:
# Build one DataFrame per split; training rows whose dialogue is empty are discarded.
df_train = pd.DataFrame(train_data)
has_dialogue = df_train['dialogue'] != ""
df_train = df_train[has_dialogue].reset_index(drop=True)

df_val = pd.DataFrame(val_data)
df_test = pd.DataFrame(test_data)

# The raw JSON payloads are not needed once the frames exist.
del train_data, test_data, val_data

In [36]:
# Display the training frame as loaded.
df_train

Unnamed: 0,id,summary,dialogue
0,13818513,Amanda baked cookies and will bring Jerry some...,Amanda: I baked cookies. Do you want some?\r\...
1,13728867,Olivia and Olivier are voting for liberals in ...,Olivia: Who are you voting for in this electio...
2,13681000,Kim may try the pomodoro technique recommended...,"Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa..."
3,13730747,Edward thinks he is in love with Bella. Rachel...,"Edward: Rachel, I think I'm in ove with Bella...."
4,13728094,"Sam is confused, because he overheard Rick com...",Sam: hey overheard rick say something\r\nSam:...
...,...,...,...
14726,13863028,Romeo is trying to get Greta to add him to her...,Romeo: You are on my ‘People you may know’ lis...
14727,13828570,Theresa is at work. She gets free food and fre...,Theresa: <file_photo>\r\nTheresa: <file_photo>...
14728,13819050,Japan is going to hunt whales again. Island an...,John: Every day some bad news. Japan will hunt...
14729,13828395,Celia couldn't make it to the afternoon with t...,Jennifer: Dear Celia! How are you doing?\r\nJe...


## 2 Data Process

In [37]:
# Substitute every match of a regular-expression pattern inside a string.
def remove_extra_spc(x, pattern, replace_text):
    """Return ``x`` with each match of ``pattern`` replaced by ``replace_text``.

    Args:
        x: Input string.
        pattern: Regular-expression pattern to look for.
        replace_text: Replacement inserted for every match.
    """
    compiled = re.compile(pattern)
    return compiled.sub(replace_text, x)

In [38]:
# Survey the emoji / special characters that fall outside the allowed set.
# NOTE(review): inside the character class, `+-=` is parsed as a range from
# `+` to `=`, so `;` and `<` are treated as allowed too — confirm whether
# that was intended.
pattern = r"[^a-zA-Z0-9\s!$%&*_+-=~'\"\|:\\.,/?]"
special_char_re = re.compile(pattern)

# Collect into a set so duplicates are dropped as they arrive, instead of
# rebuilding a de-duplicated list after every dialogue.
unique_special_chars = set()
for data in df_train['dialogue']:
    unique_special_chars.update(special_char_re.findall(data))

sp_list = list(unique_special_chars)

sp_list, len(sp_list)

(['🥪',
  '▽',
  '🍾',
  '🙀',
  '😇',
  'é',
  '😀',
  '💦',
  '🙈',
  '🦌',
  '皿',
  '☜',
  '@',
  '💥',
  '❣',
  '🦄',
  '예',
  '😽',
  '😠',
  'ヽ',
  '😫',
  '̀',
  '🤼',
  '😸',
  '⌣',
  '#',
  '👼',
  'ă',
  '—',
  '⚪',
  '🥗',
  '🦊',
  'ł',
  'Σ',
  '🇮',
  '🥃',
  '🍑',
  '🤡',
  '”',
  'ಥ',
  '☔',
  '☠',
  '🤮',
  '🍺',
  '🥞',
  '🤝',
  'ゞ',
  '😮',
  '🏋',
  '😁',
  'ʘ',
  '🙂',
  '＾',
  '🥳',
  '🔑',
  'ゝ',
  '≧',
  '╬',
  'ェ',
  'ý',
  '‼',
  '／',
  'ロ',
  '🗺',
  '😖',
  '🌟',
  '🎃',
  '😺',
  '－',
  '😍',
  '‑',
  '✿',
  '😎',
  '😘',
  '😩',
  '✨',
  '🎆',
  '🙌',
  '●',
  '😢',
  '🔴',
  '👽',
  'ﾐ',
  '￥',
  '¡',
  '✅',
  '🧘',
  '🤢',
  '🙄',
  'ツ',
  'ｏ',
  '👏',
  '😑',
  'î',
  '😥',
  '🌠',
  'ń',
  '🍸',
  'Θ',
  '～',
  '🤣',
  '🙎',
  '🥂',
  ')',
  '🚱',
  '⬛',
  'ヘ',
  '･',
  '€',
  '┛',
  'θ',
  '☝',
  'コ',
  '丶',
  'ل',
  '🥵',
  '🚣',
  '🏴',
  '📺',
  '😃',
  '🌴',
  '🥕',
  '❌',
  '＊',
  '💓',
  '屮',
  '💜',
  '′',
  '🤟',
  '`',
  '🍫',
  '🐷',
  '͜',
  '💤',
  'ã',
  '🐻',
  '〆',
  '⌒',
  '彡',
  '＼',
  '🧡',
  '🍄',
  '🐈',

In [39]:
# Drop the empty strings left over after splitting a dialogue on newlines.
def remove_empty(x):
    """Return a new list containing the non-empty items of ``x``.

    The previous implementation popped from a copy using indices of the
    original list; after the first removal those indices no longer lined up,
    so it could delete the wrong element or raise ``IndexError``.

    Args:
        x: List of utterance strings.

    Returns:
        A new list with every ``""`` entry removed. ``x`` is not modified.
        ``"empty!"`` is printed once per removed entry.
    """
    kept = []
    for item in x:
        if item == "":
            print("empty!")
        else:
            kept.append(item)
    return kept

In [40]:
# Terminate an utterance with a full stop when it lacks closing punctuation.
def add_fullstop(x):
    """Append ``"."`` to ``x`` unless it already ends with ``!``, ``?`` or ``.``.

    An empty string is returned unchanged because there is no final character
    to test.
    """
    lacks_terminator = re.search(r"[^!?.]$", x) is not None
    return x + "." if lacks_terminator else x

In [41]:
# Merge consecutive utterances spoken by the same person into one utterance
# (to resemble the original training data more closely).
def continuous_talking_paltten(data):
    """Join back-to-back utterances of the same speaker.

    Each utterance is expected to start with ``"<speaker>:"``. When the
    speaker prefix equals that of the previous utterance, the prefix is
    removed and the remaining text is appended to the previous entry,
    separated by a single space.

    Fixes over the previous version:
      * a dialogue with a single utterance is returned as-is instead of ``[]``;
      * an empty list returns ``[]`` instead of raising ``IndexError``;
      * utterances without a speaker prefix are detected explicitly rather
        than through a swallowed exception.

    Args:
        data: List of utterance strings in dialogue order.

    Returns:
        List of merged utterances. Any utterance with no recognisable speaker
        prefix is emitted as ``"#Error_Text#" + utterance`` so callers can
        find and drop the dialogue by searching for ``"#Error_Text#"``.
    """
    speaker_re = re.compile(r"[a-zA-Z'\s\-,\._]+:")

    speak_list = []
    now_person = None
    for utterance in data:
        found = speaker_re.match(utterance)
        if found is None:
            # No speaker prefix: flag the line and break the current run so
            # later text is never glued onto a flagged entry.
            speak_list.append("#Error_Text#" + utterance)
            now_person = None
        elif speak_list and found.group(0) == now_person:
            # Same speaker as the previous entry: append the text that
            # follows the "<speaker>:" prefix.
            speak_list[-1] = speak_list[-1] + " " + utterance[found.end():]
        else:
            now_person = found.group(0)
            speak_list.append(utterance)

    return speak_list

In [42]:
# Record the order in which speakers talk (used for inspection).
def get_talking_sequence(x):
    """Return the speaker name of every utterance in ``x``.

    The speaker is the first run of letters (Latin or Hangul), whitespace and
    ``' - , . _`` that is followed by a colon; the colon itself is dropped.
    The pattern is a raw string so ``\\s``, ``\\-`` and ``\\.`` are not
    treated as invalid string escapes.

    Args:
        x: List of utterance strings.

    Returns:
        List of speaker names, one per utterance.

    Raises:
        AttributeError: If an utterance contains no speaker prefix
            (``re.search`` returns ``None``).
    """
    speaker_re = re.compile(r"[a-zA-Z'\s\-,\._가-힣]+:")
    # [:-1] removes the trailing colon from the matched prefix.
    return [speaker_re.search(talk).group(0)[:-1] for talk in x]

In [43]:
# Strip every character outside the allowed set (emoji and other special characters).
# NOTE(review): inside the character class, `+-=` is parsed as a range from `+`
# to `=`, so `;` and `<` are kept as well — confirm whether that was intended.
pattern = r"[^a-zA-Z0-9\s!$%&*_+-=~'\"\|:\\.,/?]"
df_train['dialogue'] = df_train['dialogue'].apply(lambda x: remove_extra_spc(x, pattern, ""))

# Remove carriage returns and tabs; newlines stay because they separate utterances.
df_train['dialogue'] = df_train['dialogue'].apply(lambda x: re.sub(r"[\r\t]", '', x))

# Split each dialogue into a list of utterances.
df_train['dialogue_list'] = df_train['dialogue'].apply(lambda x: x.split("\n"))

# Drop empty utterances.
df_train['dialogue_list'] = df_train['dialogue_list'].apply(remove_empty)

# Trim leading/trailing whitespace of every utterance.
df_train['dialogue_list'] = df_train['dialogue_list'].apply(lambda x: [item.strip() for item in x])

# Append a full stop where an utterance has no closing punctuation.
df_train['dialogue_list'] = df_train['dialogue_list'].apply(lambda x: [add_fullstop(item) for item in x])

# Merge consecutive utterances of the same speaker.
df_train['dialogue_list'] = df_train['dialogue_list'].apply(continuous_talking_paltten)

# Re-join the utterances and drop dialogues flagged with the "#Error_Text#" marker.
df_train['dialogue2'] = df_train['dialogue_list'].apply(lambda x: "\n".join(x))
df_train = df_train[~df_train['dialogue2'].str.contains('#Error_Text#')].reset_index(drop=True)

# Store the speaker order of each dialogue.
df_train['talking_sequence'] = df_train['dialogue_list'].apply(get_talking_sequence)

empty!
empty!


In [44]:
# Display the training frame after preprocessing.
df_train

Unnamed: 0,id,summary,dialogue,dialogue_list,dialogue2,talking_sequence
0,13818513,Amanda baked cookies and will bring Jerry some...,Amanda: I baked cookies. Do you want some?\nJ...,"[Amanda: I baked cookies. Do you want some?, ...",Amanda: I baked cookies. Do you want some?\nJ...,"[Amanda, Jerry, Amanda]"
1,13728867,Olivia and Olivier are voting for liberals in ...,Olivia: Who are you voting for in this electio...,[Olivia: Who are you voting for in this electi...,Olivia: Who are you voting for in this electio...,"[Olivia, Oliver, Olivia, Oliver]"
2,13681000,Kim may try the pomodoro technique recommended...,"Tim: Hi, what's up?\nKim: Bad mood tbh, I was ...","[Tim: Hi, what's up?, Kim: Bad mood tbh, I was...","Tim: Hi, what's up?\nKim: Bad mood tbh, I was ...","[Tim, Kim, Tim, Kim, Tim, Kim, Tim]"
3,13730747,Edward thinks he is in love with Bella. Rachel...,"Edward: Rachel, I think I'm in ove with Bella....","[Edward: Rachel, I think I'm in ove with Bella...","Edward: Rachel, I think I'm in ove with Bella....","[Edward, rachel, Edward, rachel]"
4,13728094,"Sam is confused, because he overheard Rick com...",Sam: hey overheard rick say something\nSam: i...,[Sam: hey overheard rick say something. i do...,Sam: hey overheard rick say something. i don...,"[Sam, Naomi, Sam, Naomi, Sam, Naomi, Sam, Naom..."
...,...,...,...,...,...,...
14724,13863028,Romeo is trying to get Greta to add him to her...,Romeo: You are on my People you may know list....,[Romeo: You are on my People you may know list...,Romeo: You are on my People you may know list....,"[Romeo, Greta, Romeo, Greta, Romeo]"
14725,13828570,Theresa is at work. She gets free food and fre...,Theresa: <file_photo\nTheresa: <file_photo\nTh...,[Theresa: <file_photo. <file_photo. Hey Loui...,Theresa: <file_photo. <file_photo. Hey Louis...,"[Theresa, Louise, Theresa, Louise, Theresa, Lo..."
14726,13819050,Japan is going to hunt whales again. Island an...,John: Every day some bad news. Japan will hunt...,[John: Every day some bad news. Japan will hun...,John: Every day some bad news. Japan will hunt...,"[John, Erica, John, Faith, Erica, Faith, John,..."
14727,13828395,Celia couldn't make it to the afternoon with t...,Jennifer: Dear Celia! How are you doing?\nJenn...,[Jennifer: Dear Celia! How are you doing? The...,Jennifer: Dear Celia! How are you doing? The ...,"[Jennifer, Celia, Jennifer, Celia, Jennifer, C..."


## 3 Solar API

### 3.1 필요한 라이브러리 설치 및 Import

In [45]:
# !pip install httpx==0.23.2
# !pip install openai==1.2.0

In [46]:
from openai import OpenAI
import time

### 3.2 API 호출 함수 정의

In [55]:
# Read the Solar API key from solar_api.txt.
# Surrounding whitespace (typically the trailing newline an editor adds) is
# stripped so it does not become part of the key sent to the API.
with open("solar_api.txt", "r", encoding="utf-8") as f:
    solar_api = f.read().strip()

In [59]:
def translate_enko(text_en, temperature=0, print_result=False):
    """Translate English text to Korean through the Solar translation endpoint.

    The request is sent in streaming mode and the streamed text pieces are
    concatenated into one string.

    Args:
        text_en: English text to translate; sent as the single user message.
        temperature: Sampling temperature forwarded to the API.
        print_result: When true, print the source text and the translation.

    Returns:
        The translated text assembled from the streamed chunks.
    """
    client = OpenAI(
        api_key=solar_api,  # key read from the module-level ``solar_api``
        base_url="https://api.upstage.ai/v1/solar"
    )

    stream = client.chat.completions.create(
        model="solar-1-mini-translate-enko",  # English -> Korean translation model
        messages=[
            {
                "role": "user",
                "content": text_en,  # text to translate
            }
        ],
        temperature=temperature,
        stream=True,
    )

    pieces = []
    for chunk in stream:
        # A streamed chunk may carry no choices; skip it rather than index
        # into an empty list.
        if not chunk.choices:
            continue
        delta_text = chunk.choices[0].delta.content
        if delta_text is not None:
            pieces.append(delta_text)

    trans_str = "".join(pieces)

    if print_result:
        print(f"### Source_text\n{text_en} \n\n### Target_text\n{trans_str}")

    return trans_str

In [60]:
# Show the preprocessed dialogue text of row 10.
df_train.loc[10, 'dialogue2']

'Lucas: Hey! How was your day?\nDemi: Hey there!  It was pretty fine, actually, thank you!  I just got promoted! :D.\nLucas: Whoa! Great news!  Congratulations!  Such a success has to be celebrated.\nDemi: I agree! :D.  Tonight at Death & Co.?\nLucas: Sure!  See you there at 10pm?\nDemi: Yeah! See you there! :D.'

In [61]:
# Translate the dialogue of row 10 and print both the source and the translated text.
translate_enko(df_train.loc[10, 'dialogue2'], print_result=True)

### Source_text
Lucas: Hey! How was your day?
Demi: Hey there!  It was pretty fine, actually, thank you!  I just got promoted! :D.
Lucas: Whoa! Great news!  Congratulations!  Such a success has to be celebrated.
Demi: I agree! :D.  Tonight at Death & Co.?
Lucas: Sure!  See you there at 10pm?
Demi: Yeah! See you there! :D. 

### Target_text

Lucas: 안녕! 오늘 하루 어땠어?
Demi: 안녕! 사실 꽤 괜찮았어, 고마워! 방금 승진했어! :D.
Lucas: 와! 좋은 소식이야! 축하해! 이런 성공은 축하해야지.
Demi: 동의해! :D. 오늘 밤 Death & Co.에서?
Lucas: 좋아! 10시에 거기서 만나자?
Demi: 그래! 거기서 봐! :D.



'\nLucas: 안녕! 오늘 하루 어땠어?\nDemi: 안녕! 사실 꽤 괜찮았어, 고마워! 방금 승진했어! :D.\nLucas: 와! 좋은 소식이야! 축하해! 이런 성공은 축하해야지.\nDemi: 동의해! :D. 오늘 밤 Death & Co.에서?\nLucas: 좋아! 10시에 거기서 만나자?\nDemi: 그래! 거기서 봐! :D.\n'

### 3.2 API 호출 자동화 및 저장

In [None]:
bundle_num = 5  # rows per saved file (limits the loss if an error or kernel crash happens after the API was already billed)
data_point = 10900 // bundle_num    # bundle index to start from (set this to resume after an interruption)

output_dir = "../data/samsum/ko_samsum"
# Create the output folder up front so a missing directory cannot make
# to_csv fail after the translation calls have already been made.
os.makedirs(output_dir, exist_ok=True)

for i in range(data_point, df_train.shape[0] // bundle_num + 1):
    start = i*bundle_num
    end = i*bundle_num + bundle_num-1
    print(start, end)

    # Positional slice of the current bundle (the end position is inclusive).
    temp_df = df_train.iloc[start:end + 1].copy()
    if temp_df.empty:
        # Reached when the row count is an exact multiple of bundle_num:
        # nothing to translate, so write no file and do not wait.
        continue

    temp_df['ko_dialogue'] = temp_df['dialogue2'].apply(translate_enko)     # translate each dialogue
    temp_df['ko_summary'] = temp_df['summary'].apply(translate_enko)        # translate each summary

    trans_df = temp_df[['id', 'ko_summary', 'ko_dialogue']]

    # Save the bundle right after translating it.
    trans_df.to_csv(os.path.join(output_dir, f"train{start}.csv"), index=False)

    # Too many calls in a short time make the API answer "Too many request",
    # so wait one minute between bundles.
    print("time sleep")
    time.sleep(60)

### 3.3 번역한 데이터 합치기

In [None]:
# Collect the paths of all CSV files directly inside a folder.
def get_csv_files(folder_path):
    """Return the list of ``*.csv`` paths found in ``folder_path``."""
    csv_glob = os.path.join(folder_path, '*.csv')
    return glob.glob(csv_glob)

In [None]:
# Load every translated CSV bundle and concatenate them in a single call.
csv_list= get_csv_files("../data/samsum/ko_samsum")
translated_frames = [pd.read_csv(csv_path) for csv_path in csv_list]
if translated_frames:
    sam_train = pd.concat(translated_frames, axis=0, ignore_index=True)
else:
    # No bundle on disk yet: keep an empty frame with the expected columns.
    sam_train = pd.DataFrame(columns=['id', 'ko_summary', 'ko_dialogue'])

# Restore the row order of the original SamSum data.
sam_train.dropna(inplace=True)
sam_train['id'] = sam_train['id'].astype(str)
# Remove only a trailing ".0" (left behind when an id was stored as a float).
# The pattern is escaped and anchored so digits inside the id are never touched.
sam_train['id'] = sam_train['id'].str.replace(r"\.0$", "", regex=True)

df_train['id'] = df_train['id'].astype(str)
df_train['index'] = df_train.index
temp_df = df_train[['id', 'index', 'dialogue2', 'summary', 'talking_sequence']]

# Attach the original text and position by id, then sort by that position.
sam_train = sam_train.merge(temp_df, how='left', on='id')
sam_train = sam_train.sort_values(by='index').reset_index(drop=True)
del sam_train['index']

In [None]:
# Preview the first merged row.
sam_train.head(1)

Unnamed: 0,id,ko_summary,ko_dialogue,dialogue2,summary,talking_sequence
0,13818513,Amanda는 쿠키를 구웠고 내일 Jerry에게 가져다 줄 것입니다.,\nAmanda: 쿠키를 구웠어. 먹을래?\nJerry: 좋아!\nAmanda: 내...,Amanda: I baked cookies. Do you want some?\nJ...,Amanda baked cookies and will bring Jerry some...,"[Amanda, Jerry, Amanda]"


## 4 Translated Data Process

In [None]:
# Trim surrounding whitespace from each translated dialogue,
# then split it into a list of utterances on newlines.
sam_train['ko_dialogue'] = sam_train['ko_dialogue'].str.strip()
sam_train['ko_dialogue_list'] = sam_train['ko_dialogue'].str.split("\n")

### 4.1 번역된 요약문 중 구어체로 번역된 요약문 제거

In [None]:
ko_summary_list = sam_train['ko_summary'].tolist()

# Sentence endings accepted for a summary. "입니다" and "습니다" are covered by
# "니다". Every ending must sit at the end of the summary, optionally followed
# by a full stop; previously the last alternative ("랐다") had no end anchor
# and therefore matched anywhere in the text.
formal_endings = ["니다", "이다", "있다", "느낀다", "졌다", "렸다", "났다", "한다", "않다", "했다", "진다", "랐다"]
pattern = r"(?:" + "|".join(formal_endings) + r")\.?$"
formal_ending_re = re.compile(pattern)

# Positions of the summaries that end with one of the accepted endings.
matched_idxs = [
    idx for idx, summary in enumerate(ko_summary_list)
    if formal_ending_re.search(summary)
]

print(len(matched_idxs))

# matched_idxs holds positions in ko_summary_list, so select by position.
sam_train = sam_train.iloc[matched_idxs]
sam_train = sam_train.reset_index(drop=True)

5234


### 4.2 번역문 중 발화자가 제대로 표시되지 않는 혹은 비어있는 값 제거

In [None]:
def get_talking_sequence2(x):
    """Return the speaker of every utterance, marking unparseable ones.

    Works like a non-raising speaker extraction: the speaker is the first run
    of letters (Latin or Hangul), whitespace and ``' - , . _`` followed by a
    colon, with the colon removed. The pattern is a raw string so ``\\s``,
    ``\\-`` and ``\\.`` are not treated as invalid string escapes.

    Args:
        x: List of utterance strings.

    Returns:
        List with one entry per utterance: the speaker name, or
        ``"#ErrorMatch#"`` when no speaker prefix is found.
    """
    speaker_re = re.compile(r"[a-zA-Z'\s\-,\._가-힣]+:")
    talking_sequence = []
    for talk in x:
        matched = speaker_re.search(talk)
        # [:-1] drops the trailing colon of a real match.
        person = matched.group(0)[:-1] if matched else "#ErrorMatch#"
        talking_sequence.append(person)

    return talking_sequence

In [None]:
# Store the speaker order extracted from each translated dialogue.
sam_train['ko_talking_sequence'] = sam_train['ko_dialogue_list'].apply(get_talking_sequence2)

In [None]:
# Drop the rows in which at least one utterance had no detectable speaker.
talk_list_list = sam_train['ko_talking_sequence'].tolist()
idx_list = [
    idx
    for idx, talk_list in enumerate(talk_list_list)
    if "#ErrorMatch#" in talk_list
]

sam_train = sam_train.drop(idx_list).reset_index(drop=True)

### 4.3 DialogSum 데이터셋과 비슷하게 발화자 마스킹

In [None]:
from collections import OrderedDict

# Mask speakers the way DialogSum does (#Person1#, #Person2#, ...).
ko_dialogue_list = sam_train['ko_dialogue_list'].tolist()   # utterance list per dialogue
ko_talking_sequence = sam_train['ko_talking_sequence'].tolist()     # speaker order per dialogue
ko_summary = sam_train['ko_summary'].tolist() # summary per dialogue

new_dialogue_list = []  # masked dialogues
new_summary_list = []   # masked summaries

for utterances, speakers, summary in zip(ko_dialogue_list, ko_talking_sequence, ko_summary):
    # De-duplicate speakers while keeping first-appearance order, then number them.
    person_list = list(OrderedDict.fromkeys(speakers))
    person_dict = {person: f"#Person{idx+1}#" for idx, person in enumerate(person_list)}

    # Replace the leading "<speaker>:" of every utterance with its mask.
    # The name is escaped so characters such as "." are matched literally.
    temp_dialogue_list = []
    for utterance, speaker in zip(utterances, speakers):
        pattern = f"^{re.escape(speaker)}:"
        new_str = re.sub(pattern, person_dict[speaker]+":", utterance, count=1)
        temp_dialogue_list.append(new_str)

    # Replace speaker names inside the summary in one pass. Longer names come
    # first in the alternation so a short name cannot overwrite part of a
    # longer one (e.g. "Ann" inside "Anna"), and text inserted as a mask is
    # never scanned again.
    new_summary = summary
    if person_dict:
        names_longest_first = sorted(person_dict, key=len, reverse=True)
        name_re = re.compile("|".join(re.escape(name) for name in names_longest_first))
        new_summary = name_re.sub(lambda found: person_dict[found.group(0)], new_summary)

    new_dialogue_list.append(temp_dialogue_list)
    new_summary_list.append(new_summary)

### 4.4 데이터 재구성 및 저장

In [None]:
# Attach the masked utterance lists, their newline-joined text, and the masked summaries.
sam_train['processed_dialogue_list'] = new_dialogue_list
sam_train['processed_dialogue'] = sam_train['processed_dialogue_list'].apply(lambda x: "\n".join(x))
sam_train['processed_summary'] = new_summary_list

In [None]:
# Keep the id together with the translated and the masked dialogue/summary columns.
new_samsum = sam_train[['id', 'ko_summary', 'ko_dialogue', 'processed_dialogue', 'processed_summary']]

In [None]:
# Preview the first three rows of the final frame.
new_samsum.head(3)

Unnamed: 0,id,ko_summary,ko_dialogue,processed_dialogue,processed_summary
0,13818513,Amanda는 쿠키를 구웠고 내일 Jerry에게 가져다 줄 것입니다.,Amanda: 쿠키를 구웠어. 먹을래?\nJerry: 좋아!\nAmanda: 내일 ...,#Person1#: 쿠키를 구웠어. 먹을래?\n#Person2#: 좋아!\n#Per...,#Person1#는 쿠키를 구웠고 내일 #Person2#에게 가져다 줄 것입니다.
1,13728867,올리비아와 올리비에는 이번 선거에서 자유당 후보에게 투표할 예정입니다.,올리비아: 이번 선거에서 누구를 뽑을 거야?\n올리버: 난 항상 자유당.\n올리비아...,#Person1#: 이번 선거에서 누구를 뽑을 거야?\n#Person2#: 난 항상...,#Person1#와 올리비에는 이번 선거에서 자유당 후보에게 투표할 예정입니다.
2,13681000,김씨는 Tim이 추천한 pomodoro 기법을 시도해 볼 수 있습니다,"Tim: 안녕, 어떻게 지내?\nKim: 사실 기분이 안 좋아, 많은 일을 하려고 ...","#Person1#: 안녕, 어떻게 지내?\n#Person2#: 사실 기분이 안 좋아...",김씨는 #Person1#이 추천한 pomodoro 기법을 시도해 볼 수 있습니다


In [None]:
# Write the final dataset to CSV without the index column.
new_samsum.to_csv("../data/ko_samsum.csv", index=False)