## 1. Add speech ID and filter text
### Read data

In [None]:
import json 
# Load the raw SFT annotations and print their count and first entry as a
# quick sanity check.
sft_data_path = 'mgm_instruction_clear.json'
# A context manager closes the handle deterministically; UTF-8 is given
# explicitly so reading does not depend on the platform's default encoding
# (the JSON written later in this notebook is UTF-8 with ensure_ascii=False).
with open(sft_data_path, 'r', encoding='utf-8') as json_file:
    sft_data = json.load(json_file)
print(len(sft_data))
print(sft_data[0])

### Add unique speech id

In [None]:
import copy
# Build an independent copy of every annotation, tagged with a speech id
# derived from its position in the list.
new_sft_data_list = [
    {**copy.deepcopy(entry), "speech_id": "mgm_instruction_clear_" + str(idx)}
    for idx, entry in enumerate(sft_data)
]

print(len(new_sft_data_list))
print(new_sft_data_list[0])

# Persist the tagged annotations as indented JSON without ASCII escaping.
save_path = 'mgm_instruction_clear_with_speechid.json'
with open(save_path, 'w', encoding='utf-8') as out_file:
    json.dump(new_sft_data_list, out_file, ensure_ascii=False, indent=4)

### Extract text to convert

In [None]:
import re
def is_en_str(string):
    """Return True when ``string`` consists solely of allowed characters.

    Allowed characters are ASCII letters, digits, whitespace, and the
    punctuation ``. , ! ? ' " ( ) -``. The empty string is accepted because
    the pattern permits zero repetitions.
    """
    allowed = r'[a-zA-Z0-9\s.,!?\'"()-]*'
    return re.fullmatch(allowed, string) is not None

def replace_choice(string):
    """Rewrite option markers A-D that directly follow a newline.

    A newline followed by ``A.`` becomes a newline followed by
    ``Option A is `` (and likewise for B, C and D). Markers that are not
    preceded by a newline, and letters other than A-D, are left untouched.
    """
    rewritten = string
    for letter in "ABCD":
        rewritten = rewritten.replace(
            "\n" + letter + ".", "\nOption " + letter + " is "
        )
    return rewritten

In [None]:
# Extract the question text to convert: the second-to-last turn of each
# conversation, with image placeholders removed and options A-D rewritten.
new_text_list = []
one_round = 0  # kept conversations that consist of exactly two turns
# Iterate the id-tagged copies: 'speech_id' is only added to the entries of
# new_sft_data_list, so reading it from the raw sft_data entries would fail.
for item in new_sft_data_list:
    conversations = item['conversations']
    # Skip items too short to have a second-to-last turn, and items whose
    # second-to-last turn does not come from the human side.
    if len(conversations) < 2 or conversations[-2]['from'] != 'human':
        continue
    if len(conversations) == 2:
        one_round += 1

    # Strip every form of the <image> placeholder so only text remains.
    question = conversations[-2]['value'].replace('<image>\n', '').replace('\n<image>', '').replace('<image>', '')
    question = replace_choice(question)
    new_item = {
        'speech_id': item['speech_id'],
        'question': question,
        'conv_length': len(conversations)
    }
    new_text_list.append(new_item)

print(one_round)
print(len(new_text_list))
print(new_text_list[0])

# json to file
save_path = 'sft_ques_total_with_modified_options.json'
with open(save_path, 'w', encoding='utf-8') as json_file:
    json.dump(new_text_list, json_file, ensure_ascii=False, indent=4)

### Special process for OCR data

In [None]:
# Keep only the questions that contain the OCR reference marker, each trimmed
# to the text that precedes the first occurrence of the marker.
ocr_marker = "\nReference OCR"
ocr_ques_list = []
for entry in new_text_list:
    head, marker_found, _tail = entry["question"].partition(ocr_marker)
    if not marker_found:
        continue
    trimmed = copy.deepcopy(entry)
    trimmed["question"] = head
    ocr_ques_list.append(trimmed)
print(len(ocr_ques_list), ocr_ques_list[:2])

# Write the trimmed OCR questions out as indented, non-ASCII-escaped JSON.
save_path = 'sft_ques_textOCR.json'
with open(save_path, 'w', encoding='utf-8') as out_file:
    json.dump(ocr_ques_list, out_file, ensure_ascii=False, indent=4)

## 2. Reorganize SFT annotation data
### Tag all successfully generated speeches

In [None]:
import os

# Map the stem (text before the first '.') of every file found in the speech
# directory to 1; the dict is used purely for membership tests on speech ids.
mgm_sft_speech_path = "mgm_sft_speech/Lyra_speeches"
mgm_sft_speech_ids = {
    file_name.split('.')[0]: 1
    for file_name in os.listdir(mgm_sft_speech_path)
}
    

In [None]:
# Rebuild the annotations: every item whose speech id has a generated file
# gets its second-to-last turn replaced by a <speech> placeholder, plus the
# path of the speech file and (for long enough text) the spoken text itself.
short_path = "mgm_sft_speech/Lyra_speeches"
sft_data_path = 'mgm_instruction_clear_with_speechid.json'
# Close the handle deterministically and read as UTF-8, matching how this
# file is written earlier in the notebook (utf-8, ensure_ascii=False).
with open(sft_data_path, 'r', encoding='utf-8') as json_file:
    sft_data = json.load(json_file)

ocr_marker = "\nReference OCR"
new_text_list = []
for item in sft_data:
    new_item = copy.deepcopy(item)
    if new_item['speech_id'] in mgm_sft_speech_ids:
        question_turn = new_item['conversations'][-2]
        text_question = question_turn['value']

        # Only two-turn items that carry an image keep the <image> tag in
        # front of the speech placeholder; everything else is speech only.
        if len(new_item['conversations']) == 2 and 'image' in new_item:
            question_turn['value'] = "<image>\n<speech>"
        else:
            question_turn['value'] = "<speech>"

        # Deal with OCR data: the OCR reference stays in the turn as text and
        # is removed from the spoken question. Partitioning on the first
        # marker keeps the whole tail even if the marker occurs again.
        text_question, marker, ocr_tail = text_question.partition(ocr_marker)
        if marker:
            question_turn['value'] += marker + ocr_tail

        text_question = replace_choice(text_question)

        # The transcript is attached only when the text has at least 10
        # characters; the speech path is attached for every matched item.
        if len(text_question) >= 10:
            new_item['speech_asr'] = text_question.replace('<image>\n', '').replace('\n<image>', '').replace('<image>', '')
        new_item['speech'] = os.path.join(short_path, "{}.mp3".format(new_item['speech_id']))
    new_text_list.append(new_item)

print(len(new_text_list))
print(new_text_list[0])

# json to file
save_path = 'mgm_instruction_clear_with_speech.json'
with open(save_path, 'w', encoding='utf-8') as json_file:
    json.dump(new_text_list, json_file, ensure_ascii=False, indent=4)