In [None]:
# Data generation: build translation pairs from the dataset at https://huggingface.co/datasets/junelee/sharegpt_deepl_ko

In [4]:
import json
import concurrent.futures

def extract_values(json_data_1, json_data_2):
    """Pair up conversation turns from two parallel record lists.

    The two lists are walked in lockstep. For each pair of records the
    ``"conversations"`` lists are compared; records whose conversation counts
    differ are skipped entirely because their turns cannot be aligned by
    position. Within aligned records, every turn pair whose two ``"value"``
    fields are both truthy becomes one training example.

    Args:
        json_data_1: Sequence of dicts supplying the ``'input'`` text
            (each may carry a ``"conversations"`` list of dicts).
        json_data_2: Sequence of dicts supplying the ``'output'`` text,
            positionally parallel to ``json_data_1``.

    Returns:
        list[dict]: Dicts with the keys ``'instruction'``, ``'input'`` and
        ``'output'`` (in that order), one per usable turn pair.
    """
    pairs = []

    for source_item, target_item in zip(json_data_1, json_data_2):
        source_turns = source_item.get("conversations", [])
        target_turns = target_item.get("conversations", [])

        # Only records with the same number of turns can be aligned turn by turn.
        if len(source_turns) == len(target_turns):
            for source_turn, target_turn in zip(source_turns, target_turns):
                source_text = source_turn.get("value")
                target_text = target_turn.get("value")

                # Drop pairs where either side is missing or empty.
                if not (source_text and target_text):
                    continue

                pairs.append({
                    'instruction': "translate the following into korean",
                    'input': source_text,
                    'output': target_text,
                })

    return pairs

def process_json_files(file_path_1, file_path_2, chunk_size, output_filepath):
    """Build the translation dataset from two parallel JSON files.

    Both files are loaded, cut into chunks of ``chunk_size`` records, and each
    pair of chunks is handed to ``extract_values``. The per-chunk results are
    concatenated in input order and written to ``output_filepath`` as
    indented UTF-8 JSON.

    Args:
        file_path_1: Path to the JSON file providing the ``'input'`` side.
        file_path_2: Path to the JSON file providing the ``'output'`` side.
        chunk_size: Number of records per chunk; must be at least 1.
        output_filepath: Destination path for the generated JSON list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A zero step makes range() raise and a negative one silently produces no
    # chunks (and therefore an empty output file); reject both up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Read explicitly as UTF-8: the platform default encoding (e.g. cp949 or
    # cp1252 on Windows) would fail on, or corrupt, the Korean text, and the
    # output below is written as UTF-8 as well.
    with open(file_path_1, encoding="utf-8") as file:
        data_1 = json.load(file)

    with open(file_path_2, encoding="utf-8") as file:
        data_2 = json.load(file)

    data_1_chunks = [data_1[i:i+chunk_size] for i in range(0, len(data_1), chunk_size)]
    data_2_chunks = [data_2[i:i+chunk_size] for i in range(0, len(data_2), chunk_size)]

    values = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map yields results in submission order, so the output file
        # is deterministic (as_completed would interleave chunks by finish
        # time). Like zip, it stops at the shorter of the two chunk lists.
        for chunk_values in executor.map(extract_values, data_1_chunks, data_2_chunks):
            values.extend(chunk_values)

    with open(output_filepath, "w", encoding="utf-8") as file:
        json.dump(values, file, ensure_ascii=False,  indent=4, separators=(',', ':'))


In [5]:
# Inputs for the generation step. NOTE(review): these are absolute paths on one
# specific Windows machine; the cell only runs where these exact files exist.
# file_path_1 supplies the 'input' side of each pair, file_path_2 the 'output' side.
file_path_1 = r"C:\Users\parkm\Desktop\llm\ko-alpaca-lingo\data\dbdu\ShareGPT-74k-ko\original_shargpt.json"
file_path_2 = r"C:\Users\parkm\Desktop\llm\ko-alpaca-lingo\data\dbdu\ShareGPT-74k-ko\ko_sharegpt.json"
# Records per chunk. If the input files hold fewer records than this, all of
# them end up in a single chunk.
chunk_size = 9000000
# Written relative to the notebook's working directory.
output_filepath = "ko_shargpt_deepl_translate_v1.json"
process_json_files(file_path_1, file_path_2, chunk_size, output_filepath)


In [None]:
# Data cleansing

In [21]:
import time
from utilfunction import find_path
import json
import concurrent.futures
import re


class DataProcessor:
    """Cleaning pipeline for translation records stored as a JSON list.

    Every record is a dict that carries at least the string fields ``'input'``
    (source text) and ``'output'`` (translated text). The pipeline is driven
    by a list of step names; each name selects one transform:

    ========  =====================================
    step1     ``remove_specific_words``
    step2     ``remove_short_fields``
    step3     ``replace_sure_translation``
    step4     ``delete_error_korean_prefix``
    step5     ``do_not_translate_code_snippet``
    step6     ``remove_duplicates``
    step7     ``replace_output_prefix`` (moves matches to the excluded list)
    step8     ``remove_deletion_and_addition``
    ========  =====================================

    Several transforms edit the record dicts in place.
    """

    # Literal prefixes that mark a broken translation (dangling particles,
    # stray punctuation, ...). Checked with str.startswith.
    _BAD_OUTPUT_PREFIXES = ("을 ", "를 ", "이 ", "가 ", "h", "은 ", "는 ", "에 ", "으 ", "의", "예, ", ".", ",")

    # A single Latin letter, Hangul jamo/syllable or digit followed by a space.
    # These were previously passed to str.startswith as the literal strings
    # "^[A-Za-z] " etc., which can never match; they are real patterns now.
    _BAD_OUTPUT_PREFIX_RE = re.compile(r'^[A-Za-z0-9ㄱ-ㅎㅏ-ㅣ가-힣] ')

    # One fenced snippet: opening fence, body (may span lines), closing fence.
    _CODE_FENCE_RE = re.compile(r'```(.*?)```', re.DOTALL)

    def __init__(self, input_file_path):
        """Load the records to clean.

        Args:
            input_file_path: Path to a UTF-8 JSON file containing the records.
        """
        # Explicit UTF-8: the platform default encoding may not decode Korean.
        with open(input_file_path, encoding="utf-8") as f:
            self.data = json.load(f)
        self.excluded_data = []

    # Plain function on purpose: it is applied as a decorator while the class
    # body is being executed, before any instance exists.
    def step_completion(func):
        """Decorator that runs ``func`` and prints its wall-clock duration."""
        import functools  # local import keeps the file's import cell untouched

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start_time = time.time()
            result = func(*args, **kwargs)
            end_time = time.time()
            print(f"{func.__name__} completed successfully in {end_time - start_time:.2f} seconds.")
            return result
        return wrapper

    @staticmethod
    @step_completion
    def remove_specific_words(data):
        """Drop records whose ``'input'`` contains an assistant filler phrase.

        Args:
            data: List of record dicts.

        Returns:
            list: Records whose input contains none of the filler phrases.
        """
        word_set = {'sure!', 'great!', 'Certainly!', "Sure!", "Great!", "Great, ", "great, ",
                    "sure, ", "Sure, ", "Sure! "}
        return [d for d in data if not any(word in d['input'] for word in word_set)]

    @staticmethod
    @step_completion
    def remove_short_fields(data):
        """Keep only records whose input and output are both longer than 4 characters.

        Args:
            data: List of record dicts.

        Returns:
            list: The filtered records, or ``data`` unchanged if filtering
            raised (the error is printed).
        """
        try:
            return [d for d in data if len(d['input']) > 4 and len(d['output']) > 4]
        except Exception as e:
            print(f"Error at remove_short_fields {e} \n Finished unsuccessfully.")
            return data

    @staticmethod
    @step_completion
    def replace_sure_translation(data):
        """Rewrite stiff translations of "Sure," / "Certainly," / "Yes," in ``'output'``.

        ``물론,`` becomes ``물론이죠.``, ``확실히,`` is removed together with
        the whitespace after it, and ``예,`` becomes ``네.``. Records are
        edited in place.

        Args:
            data: List of record dicts.

        Returns:
            list: The same list object.
        """
        for d in data:
            try:
                text = d['output']
                # No trailing \b: a comma followed by a space is not a word
                # boundary, so the old r'\b물론,\b' never matched "물론, ...".
                text = re.sub(r'\b물론,', '물론이죠.', text)
                text = re.sub(r'\b확실히,\s*', '', text)
                text = re.sub(r'\b예,', '네.', text)
                d['output'] = text
            except Exception as e:
                print(f"Error at replace_sure_translation {e}")
        return data

    @staticmethod
    @step_completion
    def delete_error_korean_prefix(data):
        """Strip a dangling ``은 `` / ``는 `` particle from the start of ``'output'``.

        Records are edited in place.

        Args:
            data: List of record dicts.

        Returns:
            list: The same list object.
        """
        for d in data:
            try:
                d['output'] = re.sub(r'^(은 |는 )', '', d['output'])
            except Exception as e:
                print(f"Error at delete_error_korean_prefix {e}")
        return data

    @staticmethod
    @step_completion
    def replace_output_prefix(data):
        """Collect records whose ``'output'`` starts with a suspicious prefix.

        Despite its name this method does not replace anything: it returns the
        records that should be excluded and leaves ``data`` untouched. A record
        matches when its output starts with one of the literal prefixes or with
        a single Latin letter, Hangul character or digit followed by a space.

        Args:
            data: List of record dicts.

        Returns:
            list: The matching record objects, in input order.
        """
        exclued_data = []

        for d in data:
            try:
                output_text = d['output']
                if (output_text.startswith(DataProcessor._BAD_OUTPUT_PREFIXES)
                        or DataProcessor._BAD_OUTPUT_PREFIX_RE.match(output_text)):
                    exclued_data.append(d)
            except Exception as e:
                print(f"Error at replace_output_prefix {e}")

        return exclued_data

    @staticmethod
    @step_completion
    def do_not_translate_code_snippet(data):
        """Restore fenced code snippets in ``'output'`` from ``'input'``.

        Code inside triple-backtick fences should not be translated. When the
        input and the output contain the same (non-zero) number of complete
        fenced snippets, each snippet in the output is replaced by the snippet
        at the same position in the input. Records whose snippet counts differ
        are left unchanged because the snippets cannot be aligned safely.
        Records are edited in place.

        Args:
            data: List of record dicts.

        Returns:
            list: The same list object.
        """
        fence_re = DataProcessor._CODE_FENCE_RE
        for d in data:
            try:
                input_text = d['input']
                output_text = d['output']

                source_snippets = fence_re.findall(input_text)
                if not source_snippets:
                    continue
                if len(source_snippets) != len(fence_re.findall(output_text)):
                    continue

                # A callable replacement avoids backslash/group interpretation
                # of the snippet text and consumes the snippets left to right.
                remaining = iter(source_snippets)
                d['output'] = fence_re.sub(lambda m: f"```{next(remaining)}```", output_text)
            except Exception as e:
                print(f"Error at do_not_translate_code_snippet {e}")

        return data

    @staticmethod
    @step_completion
    def remove_duplicates(data):
        """Drop repeated (input, output) pairs and untranslated records.

        The first occurrence of each pair is kept; records whose output equals
        their input are dropped.

        Args:
            data: List of record dicts.

        Returns:
            list: A new list with the surviving records in input order.
        """
        unique_data = []
        seen_pairs = set()
        for d in data:
            pair = (d["input"], d["output"])
            if pair in seen_pairs or pair[0] == pair[1]:
                continue
            seen_pairs.add(pair)
            unique_data.append(d)
        return unique_data

    @staticmethod
    @step_completion
    def remove_deletion_and_addition(data):
        """Repair the first word of ``'output'`` when it was mangled.

        Three fixes are applied, in this order, to the first
        whitespace-delimited word of the output:

        1. If it looks like the first input word with its leading character
           cut off (second/third input characters equal first/second output
           characters, and fewer than two output characters are absent from
           the input word), it is replaced by the input's first word.
        2. ``물론,`` is replaced by ``물론이죠.``.
        3. A lone ASCII letter other than ``a``/``A`` is removed.

        Only the first word is touched; the rest of the output, including
        newlines, is preserved. Records are edited in place.

        Args:
            data: List of record dicts.

        Returns:
            list: The same list object.
        """
        for d in data:
            input_words = d["input"].split()
            stripped_output = d["output"].lstrip()
            if not stripped_output:
                # Empty or whitespace-only output: there is no first word.
                continue

            first_word = stripped_output.split(None, 1)[0]
            remainder = stripped_output[len(first_word):]
            fixed_word = first_word

            if input_words and len(first_word) > 1 and len(input_words[0]) > 2:
                source_word = input_words[0]
                if len(set(first_word.lower()) - set(source_word.lower())) < 2 and \
                        source_word[1].lower() == first_word[0].lower() and \
                        source_word[2].lower() == first_word[1].lower():
                    fixed_word = source_word

            if fixed_word == "물론,":
                fixed_word = "물론이죠."

            if len(fixed_word) == 1 and fixed_word.isalpha() and fixed_word.isascii() and fixed_word.lower() != "a":
                fixed_word = ""

            if fixed_word != first_word:
                # Dropping the word also drops the separator that followed it.
                d["output"] = fixed_word + remainder if fixed_word else remainder.lstrip()

        return data

    @staticmethod
    def flatten_list(data):
        """Recursively flatten nested lists into one flat list.

        Args:
            data: A list whose elements may themselves be lists.

        Returns:
            list: All non-list elements in depth-first order.
        """
        flattened_list = []
        for sublist in data:
            if isinstance(sublist, list):
                flattened_list.extend(DataProcessor.flatten_list(sublist))
            else:
                flattened_list.append(sublist)
        return flattened_list

    @staticmethod
    @step_completion
    def write_to_file(data, file_path):
        """Write ``data`` to ``file_path`` as indented UTF-8 JSON."""
        with open(file_path, "w", encoding="utf-8") as file:
            json.dump(data, file, ensure_ascii=False, indent=4, separators=(',', ':'))

    @staticmethod
    def _run_steps(data, steps, excluded_data):
        """Apply the requested steps to ``data`` and return the results.

        Shared by ``process_json_file`` and ``static_process_json_file``.
        The transforms are called directly: each one consumes the whole list,
        so a thread pool mapping over a one-element list added no parallelism.

        Args:
            data: List of record dicts.
            steps: Iterable of step-name strings. A transform runs when its
                key (``'step1'`` .. ``'step8'``) is a substring of the name,
                so a name such as ``'step10'`` also triggers ``'step1'``.
                Names matching no key are ignored.
            excluded_data: Excluded records so far; replaced whenever step7 runs.

        Returns:
            tuple: ``(cleaned_records, excluded_records)``.
        """
        transforms = (
            ('step1', DataProcessor.remove_specific_words),
            ('step2', DataProcessor.remove_short_fields),
            ('step3', DataProcessor.replace_sure_translation),
            ('step4', DataProcessor.delete_error_korean_prefix),
            ('step5', DataProcessor.do_not_translate_code_snippet),
            ('step6', DataProcessor.remove_duplicates),
            ('step7', DataProcessor.replace_output_prefix),
            ('step8', DataProcessor.remove_deletion_and_addition),
        )

        for step in steps:
            for key, transform in transforms:
                if key not in step:
                    continue
                if key == 'step7':
                    excluded_data = transform(data)
                    # The excluded records are the very objects held in
                    # ``data``; filtering by identity is linear, whereas
                    # ``d not in excluded_data`` compares dicts pairwise.
                    excluded_ids = {id(d) for d in excluded_data}
                    data = [d for d in data if id(d) not in excluded_ids]
                else:
                    data = transform(data)

        return DataProcessor.flatten_list(data), excluded_data

    @step_completion
    def process_json_file(self, steps, output_file_path, dummy_file_path):
        """Run ``steps`` on the loaded records and write both result files.

        Args:
            steps: List of step names such as ``['step1', 'step8']``.
            output_file_path: Destination for the cleaned records.
            dummy_file_path: Destination for the records excluded by step7
                (an empty list if step7 never ran).
        """
        self.data, self.excluded_data = DataProcessor._run_steps(
            self.data, steps, self.excluded_data)
        self.write_to_file(self.data, output_file_path)
        self.write_to_file(self.excluded_data, dummy_file_path)

    @staticmethod
    def static_process_json_file(steps, input_file_path, output_file_path, dummy_file_path):
        """Load, clean and write records without creating an instance.

        Args:
            steps: List of step names such as ``['step1', 'step8']``.
            input_file_path: Path to the UTF-8 JSON file with the records.
            output_file_path: Destination for the cleaned records.
            dummy_file_path: Destination for the records excluded by step7
                (an empty list if step7 never ran).
        """
        print(f'Qued steps: {steps}')
        # Explicit UTF-8: the platform default encoding may not decode Korean.
        with open(input_file_path, encoding="utf-8") as f:
            data = json.load(f)

        data, excluded_data = DataProcessor._run_steps(data, steps, [])
        DataProcessor.write_to_file(data, output_file_path)
        DataProcessor.write_to_file(excluded_data, dummy_file_path)

In [23]:
# Cleaning run. NOTE(review): the input file name differs from the file the
# generation cell writes ("ko_shargpt_deepl_translate_v1.json"); no cell shown
# here produces "..._cleaned_v1.json", so it presumably comes from an earlier
# cleaning pass — confirm before re-running from a fresh kernel.
dp = DataProcessor('ko_shargpt_deepl_translate_cleaned_v1.json')
output_file_path = 'ko_shargpt_deepl_translate_cleaned_v2.json'
# Receives the records excluded by step7; an empty list when step7 is not run.
dummy_file_path = 'ko_shargpt_deepl_cleaned_dummy.json'
# Full pipeline, kept for reference. range(9) yields step0..step8; 'step0'
# matches no transform and is ignored.
# steps = ['step'+str(i) for i in range(9)]
# Only step8 (remove_deletion_and_addition) is applied in this run.
steps = ['step8']
dp.process_json_file(steps, output_file_path, dummy_file_path)

remove_deletion_and_addition completed successfully in 8.89 seconds.
write_to_file completed successfully in 14.55 seconds.
write_to_file completed successfully in 0.00 seconds.
process_json_file completed successfully in 23.67 seconds.
