In [1]:
import pandas as pd
import glob
from konlpy.tag import Kkma
from konlpy.utils import pprint
from tqdm import tqdm


kkma = Kkma()


existing_xlsx_list = glob.glob("./existing/*.xlsx")
emerging_xlsx_list = glob.glob("./emerging/*.xlsx")
#print (xlsx_list)

existing_dataset = {}
emerging_dataset = {}

def parse_sentence_from_dataset(dataset_path, max_len):
    intent = dataset_path.split(" ")[1].split("(")[0]
    dataset = pd.read_excel(dataset_path)
    data = dataset[["SENTENCEID", "SENTENCE", "QA"]]
    sentences = []
    cur_sentence = ""
    sentence_id = 0
    for idx, datum in data.iterrows():
        if int(datum["SENTENCEID"]) > sentence_id:
            sentence_id = int(datum["SENTENCEID"])
            next_sentence = cur_sentence + " " + str(datum["SENTENCE"])
            if len(next_sentence.split(" ")) > max_len:
                # return cur_sentence as sentence and begin new sentence with current datum
                sentences.append(cur_sentence)
                sentence_id = int(datum["SENTENCEID"])
                cur_sentence = str(datum["SENTENCE"])
            else:
                cur_sentence = next_sentence
        else:
            sentences.append(cur_sentence)
            sentence_id = int(datum["SENTENCEID"])
            cur_sentence = str(datum["SENTENCE"])
    return intent, sentences
print (f"glob.glob('./exisitng/') : {existing_xlsx_list}")

for existing_xlsx in existing_xlsx_list:
    intent, sentences = parse_sentence_from_dataset(existing_xlsx, 20)
    existing_dataset[intent] = sentences
    
for emerging_xlsx in emerging_xlsx_list:
    intent, sentences = parse_sentence_from_dataset(emerging_xlsx, 20)
    emerging_dataset[intent] = sentences

#print (existing_dataset, emerging_dataset)


glob.glob('./exisitng/') : ['./existing/E 생활서비스(11,087).xlsx', './existing/H 관광여가오락(4,949).xlsx', './existing/C 학원(4,773).xlsx', './existing/D 소매점(14,949).xlsx', './existing/B 의류(15,826).xlsx', './existing/A 음식점(15,726).xlsx', './existing/F 카페(7,859).xlsx', './existing/G 숙박업(7,113).xlsx']


In [2]:
def intent2morpheme(intent):
    result_intent = intent
    if intent == "생활서비스":
        result_intent = "생활 서비스"
    elif intent == "숙박업":
        result_intent = "숙박 업"
    elif intent == "관광여가오락":
        result_intent = "관광 여가 오락"
    elif intent == "떡집":
        result_intent = "떡 집"
    elif intent == "미술학원":
        result_intent = "미술 학원"
    elif intent == "배달음식점":
        result_intent = "배달 음식점"
    return result_intent

existing_text_file = open("./existing.txt", "w", encoding="utf-8")
emerging_text_file = open("./emerging.txt", "w", encoding="utf-8")

# collect existing intent and emerging intents

existing_text_list = []
emerging_text_list = []

for existing_intent in tqdm(existing_dataset):
    existing_contents = existing_dataset[existing_intent]
    for existing_content in tqdm(existing_contents):
        processed_existing_content = " ".join(kkma.nouns(existing_content)).strip()
        #processed_existing_content = " ".join(list(map(lambda x : x[0], kkma.pos(existing_content)))).strip()
        
        if processed_existing_content == "" or (len(processed_existing_content.split()) <= 3):
            continue
        existing_text_string = intent2morpheme(existing_intent)+ "\t" + processed_existing_content + "\n"
        existing_text_list.append(existing_text_string)
        
for emerging_intent in tqdm(emerging_dataset):
    emerging_contents = emerging_dataset[emerging_intent]
    for emerging_content in tqdm(emerging_contents):
        
        processed_emerging_content = " ".join(kkma.nouns(emerging_content)).strip()
        #processed_emerging_content = " ".join(list(map(lambda x : x[0], kkma.pos(emerging_content)))).strip()
        if processed_emerging_content == "" or (len(processed_emerging_content.split()) <= 3):
            continue
        emerging_text_string = intent2morpheme(emerging_intent)+ "\t" + processed_emerging_content + "\n"
        emerging_text_list.append(emerging_text_string)

  0%|          | 0/8 [00:00<?, ?it/s]
  0%|          | 0/3470 [00:00<?, ?it/s][A
  0%|          | 1/3470 [00:03<3:06:36,  3.23s/it][A
  0%|          | 4/3470 [00:03<2:11:35,  2.28s/it][A
  0%|          | 7/3470 [00:03<1:32:40,  1.61s/it][A
  0%|          | 11/3470 [00:03<1:05:19,  1.13s/it][A
  0%|          | 17/3470 [00:03<46:22,  1.24it/s]  [A
  1%|          | 23/3470 [00:04<32:41,  1.76it/s][A
  1%|          | 28/3470 [00:04<23:12,  2.47it/s][A
  1%|          | 32/3470 [00:04<16:42,  3.43it/s][A
  1%|          | 37/3470 [00:04<12:02,  4.75it/s][A
  1%|▏         | 44/3470 [00:04<08:43,  6.54it/s][A
  1%|▏         | 51/3470 [00:04<06:26,  8.84it/s][A
  2%|▏         | 57/3470 [00:04<04:49, 11.80it/s][A
  2%|▏         | 62/3470 [00:04<03:45, 15.12it/s][A
  2%|▏         | 67/3470 [00:04<02:58, 19.03it/s][A
  2%|▏         | 72/3470 [00:05<02:34, 21.97it/s][A
  2%|▏         | 80/3470 [00:05<02:03, 27.41it/s][A
  3%|▎         | 87/3470 [00:05<01:41, 33.29it/s][A
  3%|▎    

 30%|███       | 1057/3470 [00:21<00:40, 59.75it/s][A
 31%|███       | 1067/3470 [00:21<00:38, 62.32it/s][A
 31%|███       | 1075/3470 [00:21<00:36, 66.07it/s][A
 31%|███       | 1083/3470 [00:22<00:34, 69.35it/s][A
 31%|███▏      | 1091/3470 [00:22<00:34, 69.83it/s][A
 32%|███▏      | 1099/3470 [00:22<00:34, 69.28it/s][A
 32%|███▏      | 1109/3470 [00:22<00:31, 76.09it/s][A
 32%|███▏      | 1117/3470 [00:22<00:30, 77.13it/s][A
 32%|███▏      | 1125/3470 [00:22<00:33, 70.95it/s][A
 33%|███▎      | 1133/3470 [00:22<00:35, 66.44it/s][A
 33%|███▎      | 1144/3470 [00:22<00:32, 70.97it/s][A
 33%|███▎      | 1154/3470 [00:22<00:29, 77.54it/s][A
 34%|███▎      | 1163/3470 [00:23<00:32, 70.44it/s][A
 34%|███▎      | 1171/3470 [00:23<00:33, 68.09it/s][A
 34%|███▍      | 1179/3470 [00:23<00:38, 58.86it/s][A
 34%|███▍      | 1188/3470 [00:23<00:35, 64.74it/s][A
 34%|███▍      | 1195/3470 [00:23<00:36, 62.67it/s][A
 35%|███▍      | 1202/3470 [00:23<00:37, 59.92it/s][A
 35%|███▍ 

 62%|██████▏   | 2161/3470 [00:39<00:27, 47.07it/s][A
 63%|██████▎   | 2169/3470 [00:40<00:25, 50.10it/s][A
 63%|██████▎   | 2176/3470 [00:40<00:23, 54.38it/s][A
 63%|██████▎   | 2182/3470 [00:40<00:24, 51.91it/s][A
 63%|██████▎   | 2188/3470 [00:40<00:25, 51.26it/s][A
 63%|██████▎   | 2196/3470 [00:40<00:23, 53.77it/s][A
 64%|██████▎   | 2205/3470 [00:40<00:20, 60.43it/s][A
 64%|██████▎   | 2212/3470 [00:40<00:20, 61.65it/s][A
 64%|██████▍   | 2219/3470 [00:40<00:20, 59.78it/s][A
 64%|██████▍   | 2227/3470 [00:41<00:20, 60.62it/s][A
 64%|██████▍   | 2236/3470 [00:41<00:18, 66.99it/s][A
 65%|██████▍   | 2244/3470 [00:41<00:19, 61.63it/s][A
 65%|██████▍   | 2251/3470 [00:41<00:20, 59.79it/s][A
 65%|██████▌   | 2258/3470 [00:41<00:21, 56.85it/s][A
 65%|██████▌   | 2265/3470 [00:41<00:20, 57.80it/s][A
 66%|██████▌   | 2274/3470 [00:41<00:18, 64.12it/s][A
 66%|██████▌   | 2281/3470 [00:41<00:19, 61.43it/s][A
 66%|██████▌   | 2288/3470 [00:42<00:19, 61.91it/s][A
 66%|█████

 93%|█████████▎| 3235/3470 [00:58<00:04, 52.25it/s][A
 93%|█████████▎| 3241/3470 [00:58<00:04, 51.09it/s][A
 94%|█████████▎| 3247/3470 [00:58<00:04, 53.46it/s][A
 94%|█████████▍| 3257/3470 [00:58<00:03, 61.37it/s][A
 94%|█████████▍| 3264/3470 [00:58<00:03, 61.13it/s][A
 94%|█████████▍| 3271/3470 [00:58<00:03, 57.50it/s][A
 94%|█████████▍| 3278/3470 [00:58<00:03, 58.63it/s][A
 95%|█████████▍| 3286/3470 [00:58<00:03, 60.36it/s][A
 95%|█████████▍| 3294/3470 [00:58<00:02, 64.67it/s][A
 95%|█████████▌| 3301/3470 [00:59<00:02, 61.52it/s][A
 95%|█████████▌| 3308/3470 [00:59<00:02, 59.99it/s][A
 96%|█████████▌| 3315/3470 [00:59<00:02, 61.87it/s][A
 96%|█████████▌| 3324/3470 [00:59<00:02, 63.22it/s][A
 96%|█████████▌| 3332/3470 [00:59<00:02, 66.45it/s][A
 96%|█████████▌| 3339/3470 [00:59<00:02, 65.14it/s][A
 96%|█████████▋| 3346/3470 [00:59<00:02, 60.58it/s][A
 97%|█████████▋| 3353/3470 [00:59<00:01, 60.93it/s][A
 97%|█████████▋| 3362/3470 [01:00<00:01, 61.87it/s][A
 97%|█████

 57%|█████▋    | 729/1270 [00:14<00:08, 63.26it/s][A
 58%|█████▊    | 737/1270 [00:14<00:08, 61.58it/s][A
 59%|█████▉    | 749/1270 [00:14<00:07, 71.87it/s][A
 60%|█████▉    | 757/1270 [00:14<00:07, 67.19it/s][A
 60%|██████    | 765/1270 [00:14<00:07, 64.93it/s][A
 61%|██████    | 772/1270 [00:14<00:08, 61.91it/s][A
 61%|██████▏   | 779/1270 [00:15<00:07, 62.62it/s][A
 62%|██████▏   | 786/1270 [00:15<00:07, 61.52it/s][A
 63%|██████▎   | 795/1270 [00:15<00:07, 67.66it/s][A
 63%|██████▎   | 803/1270 [00:15<00:07, 63.34it/s][A
 64%|██████▍   | 810/1270 [00:15<00:07, 62.22it/s][A
 64%|██████▍   | 817/1270 [00:15<00:07, 58.28it/s][A
 65%|██████▍   | 824/1270 [00:15<00:08, 55.27it/s][A
 65%|██████▌   | 831/1270 [00:15<00:07, 56.17it/s][A
 66%|██████▌   | 837/1270 [00:16<00:08, 50.62it/s][A
 67%|██████▋   | 845/1270 [00:16<00:08, 51.47it/s][A
 67%|██████▋   | 851/1270 [00:16<00:07, 53.48it/s][A
 67%|██████▋   | 857/1270 [00:16<00:07, 52.40it/s][A
 68%|██████▊   | 863/1270 [0

 29%|██▉       | 521/1781 [00:08<00:22, 56.91it/s][A
 30%|██▉       | 529/1781 [00:08<00:20, 61.87it/s][A
 30%|███       | 536/1781 [00:08<00:20, 60.81it/s][A
 30%|███       | 543/1781 [00:08<00:21, 56.81it/s][A
 31%|███       | 549/1781 [00:08<00:21, 56.88it/s][A
 31%|███       | 556/1781 [00:08<00:22, 55.33it/s][A
 32%|███▏      | 564/1781 [00:09<00:21, 55.60it/s][A
 32%|███▏      | 570/1781 [00:09<00:21, 55.64it/s][A
 32%|███▏      | 576/1781 [00:09<00:22, 52.85it/s][A
 33%|███▎      | 582/1781 [00:09<00:23, 50.76it/s][A
 33%|███▎      | 588/1781 [00:09<00:23, 51.44it/s][A
 33%|███▎      | 596/1781 [00:09<00:22, 53.61it/s][A
 34%|███▍      | 604/1781 [00:09<00:20, 58.56it/s][A
 34%|███▍      | 611/1781 [00:09<00:19, 59.21it/s][A
 35%|███▍      | 618/1781 [00:10<00:21, 54.80it/s][A
 35%|███▌      | 624/1781 [00:10<00:22, 52.11it/s][A
 35%|███▌      | 630/1781 [00:10<00:21, 53.36it/s][A
 36%|███▌      | 637/1781 [00:10<00:22, 51.33it/s][A
 36%|███▌      | 645/1781 [0

  5%|▍         | 198/4223 [00:03<01:29, 45.05it/s][A
  5%|▍         | 203/4223 [00:03<01:29, 44.83it/s][A
  5%|▍         | 209/4223 [00:03<01:26, 46.15it/s][A
  5%|▌         | 216/4223 [00:04<01:18, 51.29it/s][A
  5%|▌         | 222/4223 [00:04<01:21, 48.92it/s][A
  5%|▌         | 228/4223 [00:04<01:19, 50.38it/s][A
  6%|▌         | 234/4223 [00:04<01:21, 48.65it/s][A
  6%|▌         | 240/4223 [00:04<01:24, 46.99it/s][A
  6%|▌         | 250/4223 [00:04<01:16, 51.96it/s][A
  6%|▌         | 258/4223 [00:04<01:09, 57.45it/s][A
  6%|▋         | 265/4223 [00:04<01:10, 56.37it/s][A
  6%|▋         | 271/4223 [00:05<01:13, 53.51it/s][A
  7%|▋         | 277/4223 [00:05<01:13, 53.74it/s][A
  7%|▋         | 283/4223 [00:05<01:16, 51.18it/s][A
  7%|▋         | 289/4223 [00:05<01:14, 52.76it/s][A
  7%|▋         | 296/4223 [00:05<01:11, 54.66it/s][A
  7%|▋         | 302/4223 [00:05<01:12, 54.06it/s][A
  7%|▋         | 310/4223 [00:05<01:06, 58.90it/s][A
  8%|▊         | 317/4223 [0

 32%|███▏      | 1334/4223 [00:22<00:48, 60.16it/s][A
 32%|███▏      | 1342/4223 [00:22<00:45, 63.70it/s][A
 32%|███▏      | 1349/4223 [00:22<00:46, 61.31it/s][A
 32%|███▏      | 1356/4223 [00:22<00:49, 58.33it/s][A
 32%|███▏      | 1362/4223 [00:22<00:48, 58.45it/s][A
 32%|███▏      | 1368/4223 [00:22<00:48, 58.42it/s][A
 33%|███▎      | 1375/4223 [00:22<00:46, 61.20it/s][A
 33%|███▎      | 1382/4223 [00:23<00:44, 63.32it/s][A
 33%|███▎      | 1390/4223 [00:23<00:43, 65.73it/s][A
 33%|███▎      | 1397/4223 [00:23<00:45, 62.77it/s][A
 33%|███▎      | 1404/4223 [00:23<00:48, 57.76it/s][A
 33%|███▎      | 1410/4223 [00:23<00:49, 57.30it/s][A
 34%|███▎      | 1416/4223 [00:23<00:52, 53.66it/s][A
 34%|███▎      | 1423/4223 [00:23<00:51, 54.05it/s][A
 34%|███▍      | 1434/4223 [00:23<00:44, 63.00it/s][A
 34%|███▍      | 1441/4223 [00:24<00:47, 58.17it/s][A
 34%|███▍      | 1448/4223 [00:24<00:50, 55.17it/s][A
 34%|███▍      | 1454/4223 [00:24<00:49, 56.36it/s][A
 35%|███▍ 

 59%|█████▉    | 2498/4223 [00:40<00:26, 64.18it/s][A
 59%|█████▉    | 2505/4223 [00:40<00:26, 65.11it/s][A
 59%|█████▉    | 2512/4223 [00:40<00:27, 63.25it/s][A
 60%|█████▉    | 2520/4223 [00:40<00:26, 64.47it/s][A
 60%|█████▉    | 2529/4223 [00:40<00:24, 69.41it/s][A
 60%|██████    | 2537/4223 [00:40<00:26, 64.27it/s][A
 60%|██████    | 2544/4223 [00:40<00:26, 62.64it/s][A
 60%|██████    | 2551/4223 [00:41<00:27, 61.12it/s][A
 61%|██████    | 2558/4223 [00:41<00:27, 59.50it/s][A
 61%|██████    | 2568/4223 [00:41<00:25, 64.23it/s][A
 61%|██████    | 2577/4223 [00:41<00:25, 64.72it/s][A
 61%|██████▏   | 2588/4223 [00:41<00:24, 67.70it/s][A
 62%|██████▏   | 2599/4223 [00:41<00:21, 76.10it/s][A
 62%|██████▏   | 2608/4223 [00:41<00:22, 71.76it/s][A
 62%|██████▏   | 2616/4223 [00:41<00:22, 71.99it/s][A
 62%|██████▏   | 2625/4223 [00:42<00:21, 73.09it/s][A
 62%|██████▏   | 2633/4223 [00:42<00:22, 70.59it/s][A
 63%|██████▎   | 2641/4223 [00:42<00:22, 69.28it/s][A
 63%|█████

 87%|████████▋ | 3681/4223 [00:58<00:09, 59.06it/s][A
 87%|████████▋ | 3688/4223 [00:58<00:09, 56.70it/s][A
 87%|████████▋ | 3694/4223 [00:58<00:09, 56.49it/s][A
 88%|████████▊ | 3703/4223 [00:58<00:08, 59.11it/s][A
 88%|████████▊ | 3710/4223 [00:58<00:08, 59.97it/s][A
 88%|████████▊ | 3717/4223 [00:58<00:08, 57.04it/s][A
 88%|████████▊ | 3724/4223 [00:58<00:08, 59.72it/s][A
 88%|████████▊ | 3731/4223 [00:59<00:08, 59.31it/s][A
 89%|████████▊ | 3740/4223 [00:59<00:07, 62.03it/s][A
 89%|████████▉ | 3748/4223 [00:59<00:07, 60.22it/s][A
 89%|████████▉ | 3759/4223 [00:59<00:06, 67.39it/s][A
 89%|████████▉ | 3767/4223 [00:59<00:06, 67.61it/s][A
 89%|████████▉ | 3774/4223 [00:59<00:07, 62.27it/s][A
 90%|████████▉ | 3781/4223 [00:59<00:07, 59.76it/s][A
 90%|████████▉ | 3788/4223 [00:59<00:07, 59.12it/s][A
 90%|████████▉ | 3797/4223 [01:00<00:06, 63.96it/s][A
 90%|█████████ | 3807/4223 [01:00<00:05, 70.12it/s][A
 90%|█████████ | 3815/4223 [01:00<00:06, 64.09it/s][A
 91%|█████

 15%|█▍        | 583/3955 [00:08<00:44, 75.22it/s][A
 15%|█▍        | 591/3955 [00:09<00:45, 73.36it/s][A
 15%|█▌        | 599/3955 [00:09<00:47, 71.35it/s][A
 15%|█▌        | 610/3955 [00:09<00:42, 79.61it/s][A
 16%|█▌        | 619/3955 [00:09<00:45, 73.54it/s][A
 16%|█▌        | 627/3955 [00:09<00:46, 71.63it/s][A
 16%|█▌        | 635/3955 [00:09<00:47, 70.31it/s][A
 16%|█▋        | 644/3955 [00:09<00:46, 70.91it/s][A
 16%|█▋        | 652/3955 [00:09<00:45, 73.03it/s][A
 17%|█▋        | 660/3955 [00:10<00:48, 67.26it/s][A
 17%|█▋        | 667/3955 [00:10<00:49, 66.47it/s][A
 17%|█▋        | 675/3955 [00:10<00:47, 69.44it/s][A
 17%|█▋        | 683/3955 [00:10<00:46, 69.90it/s][A
 17%|█▋        | 691/3955 [00:10<00:47, 68.51it/s][A
 18%|█▊        | 698/3955 [00:10<00:47, 68.79it/s][A
 18%|█▊        | 709/3955 [00:10<00:44, 72.63it/s][A
 18%|█▊        | 719/3955 [00:10<00:41, 78.47it/s][A
 18%|█▊        | 728/3955 [00:10<00:42, 76.51it/s][A
 19%|█▊        | 736/3955 [0

 44%|████▎     | 1722/3955 [00:27<00:46, 47.74it/s][A
 44%|████▎     | 1727/3955 [00:27<00:47, 46.73it/s][A
 44%|████▍     | 1732/3955 [00:27<00:49, 45.20it/s][A
 44%|████▍     | 1737/3955 [00:27<00:49, 44.80it/s][A
 44%|████▍     | 1743/3955 [00:27<00:47, 46.45it/s][A
 44%|████▍     | 1748/3955 [00:27<00:47, 46.13it/s][A
 44%|████▍     | 1753/3955 [00:27<00:47, 46.59it/s][A
 44%|████▍     | 1759/3955 [00:27<00:47, 46.41it/s][A
 45%|████▍     | 1769/3955 [00:28<00:42, 51.46it/s][A
 45%|████▍     | 1775/3955 [00:28<00:41, 52.17it/s][A
 45%|████▌     | 1781/3955 [00:28<00:43, 50.51it/s][A
 45%|████▌     | 1787/3955 [00:28<00:42, 50.85it/s][A
 45%|████▌     | 1793/3955 [00:28<00:42, 51.12it/s][A
 46%|████▌     | 1800/3955 [00:28<00:39, 54.40it/s][A
 46%|████▌     | 1807/3955 [00:28<00:40, 53.04it/s][A
 46%|████▌     | 1814/3955 [00:28<00:38, 55.37it/s][A
 46%|████▌     | 1820/3955 [00:29<00:39, 53.77it/s][A
 46%|████▌     | 1826/3955 [00:29<00:40, 53.05it/s][A
 46%|████▋

 69%|██████▉   | 2723/3955 [00:45<00:20, 60.24it/s][A
 69%|██████▉   | 2730/3955 [00:45<00:20, 58.69it/s][A
 69%|██████▉   | 2737/3955 [00:45<00:21, 55.55it/s][A
 69%|██████▉   | 2744/3955 [00:45<00:21, 55.99it/s][A
 70%|██████▉   | 2751/3955 [00:45<00:22, 53.66it/s][A
 70%|██████▉   | 2758/3955 [00:46<00:21, 56.21it/s][A
 70%|██████▉   | 2764/3955 [00:46<00:21, 56.07it/s][A
 70%|███████   | 2770/3955 [00:46<00:22, 53.31it/s][A
 70%|███████   | 2776/3955 [00:46<00:22, 52.76it/s][A
 70%|███████   | 2786/3955 [00:46<00:20, 56.69it/s][A
 71%|███████   | 2794/3955 [00:46<00:19, 60.08it/s][A
 71%|███████   | 2801/3955 [00:46<00:19, 60.21it/s][A
 71%|███████   | 2808/3955 [00:46<00:18, 62.19it/s][A
 71%|███████   | 2815/3955 [00:47<00:19, 59.86it/s][A
 71%|███████▏  | 2825/3955 [00:47<00:18, 62.31it/s][A
 72%|███████▏  | 2832/3955 [00:47<00:17, 64.39it/s][A
 72%|███████▏  | 2839/3955 [00:47<00:18, 61.32it/s][A
 72%|███████▏  | 2846/3955 [00:47<00:18, 59.05it/s][A
 72%|█████

 97%|█████████▋| 3854/3955 [01:03<00:01, 54.59it/s][A
 98%|█████████▊| 3860/3955 [01:03<00:01, 53.51it/s][A
 98%|█████████▊| 3866/3955 [01:03<00:01, 52.30it/s][A
 98%|█████████▊| 3872/3955 [01:03<00:01, 53.01it/s][A
 98%|█████████▊| 3880/3955 [01:03<00:01, 57.39it/s][A
 98%|█████████▊| 3886/3955 [01:04<00:01, 51.75it/s][A
 98%|█████████▊| 3892/3955 [01:04<00:01, 53.29it/s][A
 99%|█████████▊| 3898/3955 [01:04<00:01, 54.50it/s][A
 99%|█████████▊| 3905/3955 [01:04<00:00, 57.77it/s][A
 99%|█████████▉| 3911/3955 [01:04<00:00, 53.67it/s][A
 99%|█████████▉| 3917/3955 [01:04<00:00, 50.31it/s][A
 99%|█████████▉| 3924/3955 [01:04<00:00, 54.11it/s][A
 99%|█████████▉| 3932/3955 [01:04<00:00, 59.51it/s][A
100%|█████████▉| 3939/3955 [01:05<00:00, 49.06it/s][A
100%|█████████▉| 3947/3955 [01:05<00:00, 54.58it/s][A
100%|██████████| 3955/3955 [01:05<00:00, 60.53it/s][A
 62%|██████▎   | 5/8 [04:01<02:43, 54.38s/it]
  0%|          | 0/3767 [00:00<?, ?it/s][A
  0%|          | 5/3767 [00:00

 27%|██▋       | 1015/3767 [00:16<00:46, 59.56it/s][A
 27%|██▋       | 1022/3767 [00:16<00:50, 54.74it/s][A
 27%|██▋       | 1029/3767 [00:16<00:51, 52.67it/s][A
 27%|██▋       | 1035/3767 [00:16<00:50, 53.84it/s][A
 28%|██▊       | 1041/3767 [00:16<00:52, 51.78it/s][A
 28%|██▊       | 1047/3767 [00:16<00:51, 53.08it/s][A
 28%|██▊       | 1057/3767 [00:17<00:47, 57.62it/s][A
 28%|██▊       | 1063/3767 [00:17<00:48, 56.06it/s][A
 28%|██▊       | 1069/3767 [00:17<00:50, 53.62it/s][A
 29%|██▊       | 1075/3767 [00:17<00:49, 54.56it/s][A
 29%|██▊       | 1083/3767 [00:17<00:46, 57.48it/s][A
 29%|██▉       | 1089/3767 [00:17<00:47, 56.48it/s][A
 29%|██▉       | 1095/3767 [00:17<00:49, 54.02it/s][A
 29%|██▉       | 1101/3767 [00:17<00:49, 53.44it/s][A
 29%|██▉       | 1107/3767 [00:18<00:51, 51.86it/s][A
 30%|██▉       | 1115/3767 [00:18<00:46, 57.41it/s][A
 30%|██▉       | 1121/3767 [00:18<00:50, 51.92it/s][A
 30%|██▉       | 1127/3767 [00:18<00:51, 51.31it/s][A
 30%|███  

 57%|█████▋    | 2142/3767 [00:34<00:25, 62.85it/s][A
 57%|█████▋    | 2149/3767 [00:34<00:26, 61.62it/s][A
 57%|█████▋    | 2156/3767 [00:34<00:27, 59.63it/s][A
 57%|█████▋    | 2164/3767 [00:34<00:26, 61.20it/s][A
 58%|█████▊    | 2172/3767 [00:34<00:24, 65.32it/s][A
 58%|█████▊    | 2179/3767 [00:34<00:25, 63.04it/s][A
 58%|█████▊    | 2186/3767 [00:34<00:26, 60.15it/s][A
 58%|█████▊    | 2193/3767 [00:35<00:25, 61.73it/s][A
 58%|█████▊    | 2201/3767 [00:35<00:23, 65.68it/s][A
 59%|█████▊    | 2208/3767 [00:35<00:24, 63.40it/s][A
 59%|█████▉    | 2215/3767 [00:35<00:25, 60.54it/s][A
 59%|█████▉    | 2222/3767 [00:35<00:26, 58.71it/s][A
 59%|█████▉    | 2230/3767 [00:35<00:25, 60.35it/s][A
 59%|█████▉    | 2238/3767 [00:35<00:23, 64.38it/s][A
 60%|█████▉    | 2246/3767 [00:35<00:23, 65.97it/s][A
 60%|█████▉    | 2253/3767 [00:36<00:23, 63.76it/s][A
 60%|█████▉    | 2260/3767 [00:36<00:25, 58.26it/s][A
 60%|██████    | 2268/3767 [00:36<00:25, 59.59it/s][A
 60%|█████

 82%|████████▏ | 3078/3767 [00:52<00:17, 39.76it/s][A
 82%|████████▏ | 3083/3767 [00:52<00:17, 39.79it/s][A
 82%|████████▏ | 3088/3767 [00:52<00:16, 41.13it/s][A
 82%|████████▏ | 3094/3767 [00:52<00:14, 45.37it/s][A
 82%|████████▏ | 3099/3767 [00:52<00:14, 45.10it/s][A
 82%|████████▏ | 3104/3767 [00:53<00:14, 45.94it/s][A
 83%|████████▎ | 3113/3767 [00:53<00:13, 48.89it/s][A
 83%|████████▎ | 3121/3767 [00:53<00:12, 51.24it/s][A
 83%|████████▎ | 3128/3767 [00:53<00:12, 51.31it/s][A
 83%|████████▎ | 3134/3767 [00:53<00:12, 48.78it/s][A
 83%|████████▎ | 3141/3767 [00:53<00:12, 48.94it/s][A
 84%|████████▎ | 3147/3767 [00:53<00:12, 48.19it/s][A
 84%|████████▎ | 3153/3767 [00:54<00:12, 50.85it/s][A
 84%|████████▍ | 3159/3767 [00:54<00:12, 48.84it/s][A
 84%|████████▍ | 3164/3767 [00:54<00:13, 43.73it/s][A
 84%|████████▍ | 3169/3767 [00:54<00:14, 42.33it/s][A
 84%|████████▍ | 3174/3767 [00:54<00:15, 38.43it/s][A
 84%|████████▍ | 3178/3767 [00:54<00:16, 36.71it/s][A
 85%|█████

 12%|█▏        | 229/1982 [00:04<00:36, 48.12it/s][A
 12%|█▏        | 235/1982 [00:04<00:37, 46.08it/s][A
 12%|█▏        | 240/1982 [00:04<00:38, 45.67it/s][A
 12%|█▏        | 245/1982 [00:04<00:37, 45.79it/s][A
 13%|█▎        | 250/1982 [00:05<00:38, 44.77it/s][A
 13%|█▎        | 255/1982 [00:05<00:39, 43.55it/s][A
 13%|█▎        | 261/1982 [00:05<00:39, 44.04it/s][A
 14%|█▎        | 268/1982 [00:05<00:37, 45.42it/s][A
 14%|█▍        | 276/1982 [00:05<00:35, 47.50it/s][A
 14%|█▍        | 282/1982 [00:05<00:33, 50.14it/s][A
 15%|█▍        | 290/1982 [00:05<00:30, 55.17it/s][A
 15%|█▍        | 296/1982 [00:05<00:31, 53.13it/s][A
 15%|█▌        | 302/1982 [00:06<00:32, 51.04it/s][A
 16%|█▌        | 309/1982 [00:06<00:31, 52.31it/s][A
 16%|█▌        | 315/1982 [00:06<00:31, 53.07it/s][A
 16%|█▌        | 321/1982 [00:06<00:30, 54.20it/s][A
 17%|█▋        | 328/1982 [00:06<00:28, 57.10it/s][A
 17%|█▋        | 334/1982 [00:06<00:30, 53.17it/s][A
 17%|█▋        | 341/1982 [0

 59%|█████▉    | 1170/1982 [00:23<00:21, 37.34it/s][A
 59%|█████▉    | 1175/1982 [00:23<00:20, 39.33it/s][A
 60%|█████▉    | 1180/1982 [00:23<00:20, 39.96it/s][A
 60%|█████▉    | 1185/1982 [00:23<00:18, 41.97it/s][A
 60%|██████    | 1190/1982 [00:23<00:19, 40.36it/s][A
 60%|██████    | 1196/1982 [00:23<00:19, 40.92it/s][A
 61%|██████    | 1202/1982 [00:23<00:17, 44.51it/s][A
 61%|██████    | 1207/1982 [00:24<00:18, 42.20it/s][A
 61%|██████    | 1213/1982 [00:24<00:16, 45.99it/s][A
 62%|██████▏   | 1219/1982 [00:24<00:16, 46.85it/s][A
 62%|██████▏   | 1226/1982 [00:24<00:16, 46.88it/s][A
 62%|██████▏   | 1236/1982 [00:24<00:13, 54.49it/s][A
 63%|██████▎   | 1242/1982 [00:24<00:13, 55.71it/s][A
 63%|██████▎   | 1249/1982 [00:24<00:12, 59.00it/s][A
 63%|██████▎   | 1256/1982 [00:24<00:13, 53.71it/s][A
 64%|██████▍   | 1265/1982 [00:25<00:12, 56.11it/s][A
 64%|██████▍   | 1274/1982 [00:25<00:11, 62.76it/s][A
 65%|██████▍   | 1281/1982 [00:25<00:11, 62.35it/s][A
 65%|█████

  7%|▋         | 142/2086 [00:02<00:36, 52.80it/s][A
  7%|▋         | 148/2086 [00:02<00:37, 51.49it/s][A
  7%|▋         | 156/2086 [00:03<00:35, 53.77it/s][A
  8%|▊         | 165/2086 [00:03<00:32, 58.86it/s][A
  8%|▊         | 172/2086 [00:03<00:32, 58.15it/s][A
  9%|▊         | 178/2086 [00:03<00:34, 55.90it/s][A
  9%|▉         | 185/2086 [00:03<00:32, 58.80it/s][A
  9%|▉         | 192/2086 [00:03<00:31, 60.21it/s][A
 10%|▉         | 200/2086 [00:03<00:29, 64.84it/s][A
 10%|▉         | 207/2086 [00:03<00:29, 64.23it/s][A
 10%|█         | 214/2086 [00:03<00:30, 62.07it/s][A
 11%|█         | 221/2086 [00:04<00:31, 58.33it/s][A
 11%|█         | 229/2086 [00:04<00:31, 58.71it/s][A
 11%|█▏        | 237/2086 [00:04<00:32, 56.81it/s][A
 12%|█▏        | 244/2086 [00:04<00:31, 58.89it/s][A
 12%|█▏        | 250/2086 [00:04<00:32, 56.81it/s][A
 12%|█▏        | 256/2086 [00:04<00:31, 57.36it/s][A
 13%|█▎        | 262/2086 [00:04<00:34, 53.08it/s][A
 13%|█▎        | 269/2086 [0

 57%|█████▋    | 1192/2086 [00:21<00:13, 64.93it/s][A
 57%|█████▋    | 1199/2086 [00:21<00:14, 63.02it/s][A
 58%|█████▊    | 1206/2086 [00:21<00:14, 61.38it/s][A
 58%|█████▊    | 1213/2086 [00:21<00:15, 56.90it/s][A
 59%|█████▊    | 1222/2086 [00:21<00:14, 60.84it/s][A
 59%|█████▉    | 1229/2086 [00:21<00:13, 63.30it/s][A
 59%|█████▉    | 1236/2086 [00:21<00:14, 60.10it/s][A
 60%|█████▉    | 1243/2086 [00:21<00:14, 59.39it/s][A
 60%|█████▉    | 1250/2086 [00:22<00:14, 57.61it/s][A
 60%|██████    | 1256/2086 [00:22<00:16, 51.77it/s][A
 61%|██████    | 1264/2086 [00:22<00:15, 53.79it/s][A
 61%|██████    | 1272/2086 [00:22<00:13, 59.00it/s][A
 61%|██████▏   | 1279/2086 [00:22<00:13, 59.16it/s][A
 62%|██████▏   | 1286/2086 [00:22<00:14, 53.54it/s][A
 62%|██████▏   | 1292/2086 [00:22<00:15, 51.58it/s][A
 62%|██████▏   | 1298/2086 [00:22<00:14, 52.54it/s][A
 63%|██████▎   | 1305/2086 [00:23<00:15, 50.34it/s][A
 63%|██████▎   | 1311/2086 [00:23<00:14, 52.62it/s][A
 63%|█████

 14%|█▍        | 133/928 [00:02<00:12, 65.17it/s][A
 15%|█▌        | 141/928 [00:02<00:11, 68.80it/s][A
 16%|█▌        | 148/928 [00:02<00:11, 65.87it/s][A
 17%|█▋        | 155/928 [00:02<00:12, 63.59it/s][A
 17%|█▋        | 162/928 [00:02<00:12, 61.61it/s][A
 18%|█▊        | 171/928 [00:02<00:11, 67.70it/s][A
 19%|█▉        | 179/928 [00:02<00:11, 63.72it/s][A
 20%|██        | 186/928 [00:02<00:12, 59.98it/s][A
 21%|██        | 193/928 [00:03<00:12, 56.61it/s][A
 22%|██▏       | 201/928 [00:03<00:12, 58.77it/s][A
 23%|██▎       | 209/928 [00:03<00:11, 62.87it/s][A
 23%|██▎       | 216/928 [00:03<00:11, 63.32it/s][A
 24%|██▍       | 223/928 [00:03<00:11, 59.54it/s][A
 25%|██▍       | 230/928 [00:03<00:11, 58.85it/s][A
 26%|██▌       | 238/928 [00:03<00:11, 59.95it/s][A
 27%|██▋       | 247/928 [00:03<00:10, 65.08it/s][A
 27%|██▋       | 254/928 [00:04<00:10, 64.80it/s][A
 28%|██▊       | 261/928 [00:04<00:11, 60.23it/s][A
 29%|██▉       | 268/928 [00:04<00:11, 57.16it

 66%|██████▌   | 263/401 [00:05<00:02, 50.83it/s][A
 67%|██████▋   | 269/401 [00:05<00:03, 43.67it/s][A
 69%|██████▉   | 276/401 [00:05<00:02, 46.71it/s][A
 71%|███████   | 283/401 [00:05<00:02, 51.03it/s][A
 72%|███████▏  | 289/401 [00:06<00:02, 52.92it/s][A
 74%|███████▎  | 295/401 [00:06<00:02, 51.08it/s][A
 75%|███████▌  | 301/401 [00:06<00:01, 52.71it/s][A
 77%|███████▋  | 308/401 [00:06<00:01, 52.14it/s][A
 78%|███████▊  | 314/401 [00:06<00:01, 49.37it/s][A
 80%|███████▉  | 320/401 [00:06<00:01, 47.07it/s][A
 81%|████████▏ | 326/401 [00:06<00:01, 44.55it/s][A
 83%|████████▎ | 333/401 [00:06<00:01, 49.22it/s][A
 85%|████████▍ | 339/401 [00:07<00:01, 51.76it/s][A
 86%|████████▌ | 345/401 [00:07<00:01, 50.47it/s][A
 88%|████████▊ | 351/401 [00:07<00:00, 50.66it/s][A
 89%|████████▉ | 357/401 [00:07<00:00, 47.55it/s][A
 91%|█████████ | 363/401 [00:07<00:00, 50.39it/s][A
 92%|█████████▏| 369/401 [00:07<00:00, 50.49it/s][A
 94%|█████████▎| 375/401 [00:07<00:00, 48.83it

In [3]:
# suffle the emerging text and existing text
import random

random.shuffle(existing_text_list)
random.shuffle(emerging_text_list)

for existing_text in existing_text_list:
    existing_text_file.write(existing_text)

for emerging_text in emerging_text_list:
    emerging_text_file.write(emerging_text)


In [14]:
for k in custom_dataset:
    print (f"key :{k}, {len(custom_dataset[k])}")
    

NameError: name 'custom_dataset' is not defined