In [1]:
import glob
import json
from tqdm import tqdm
import pandas as pd
import os

In [2]:
# Region directory names used under data/.
regions = [
    'chungcheong',
    'jeju',
    'jeonla',
    'kangwon',
    'kyeongsang',
]

In [3]:
# Label archives (*.zip) of the last region in `regions`, in sorted order.
files = sorted(glob.glob('data/' + regions[-1] + '/*.zip'))

In [4]:
# Display the sorted list of archive paths collected in the previous cell.
files

['data/kyeongsang/[라벨]경상도_학습데이터_1.zip',
 'data/kyeongsang/[라벨]경상도_학습데이터_2.zip']

In [1]:
# Commented-out shell commands: unpack label archive 1 into data/kyeongsang/train
# and label archive 2 into data/kyeongsang/test.
#!unzip -d data/kyeongsang/train data/kyeongsang/[라벨]경상도_학습데이터_1.zip
#!unzip -d data/kyeongsang/test data/kyeongsang/[라벨]경상도_학습데이터_2.zip

## Convert JSON label files to DataFrames (saved as TSV)

In [2]:
def get_lst(filename):
    """Extract the utterances of one label file whose dialect form differs.

    Parameters
    ----------
    filename : str
        Path of a JSON label file. It is read as UTF-8 (a leading BOM is
        tolerated) and must contain an 'utterance' list whose items carry
        'standard_form', 'dialect_form' and 'eojeolList'.

    Returns
    -------
    tuple of five lists ``(standard, dialect, mp_d, mp_s, mp_idx)``
        One entry per utterance whose standard and dialect forms differ:

        * standard -- the 'standard_form' text
        * dialect  -- the 'dialect_form' text
        * mp_d     -- 'eojeol' values of the items flagged 'isDialect', joined by '_'
        * mp_s     -- 'standard' values of those same items, joined by '_'
        * mp_idx   -- 'id' values of those same items, joined by ' '

        If the file cannot be opened, decoded or parsed, its name is printed
        and five empty lists are returned so the caller adds nothing for it.
    """
    try:
        with open(filename, 'r', encoding='utf-8-sig') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        # Report the unreadable file and contribute no rows, instead of
        # falling through to the loop below with `data` undefined.
        print(filename)
        return [], [], [], [], []

    standard = []
    dialect = []
    mp_d = []
    mp_s = []
    mp_idx = []
    for d in data['utterance']:
        # Keep only utterances whose dialect form differs from the standard form.
        if d['standard_form'] == d['dialect_form']:
            continue
        standard.append(d['standard_form'])
        dialect.append(d['dialect_form'])
        # Word units that are flagged as dialect within this utterance.
        marked = [el for el in d['eojeolList'] if el['isDialect']]
        mp_d.append('_'.join(el['eojeol'] for el in marked))
        mp_s.append('_'.join(el['standard'] for el in marked))
        mp_idx.append(' '.join(str(el['id']) for el in marked))
    return standard, dialect, mp_d, mp_s, mp_idx

In [3]:
def extract_df(file_path, reg, tp):
    """Merge the utterances of many label files into one TSV file.

    Parameters
    ----------
    file_path : iterable of str
        Paths of the JSON label files; each one is read with ``get_lst``.
    reg : str
        Region name, used as a directory component of the output path.
    tp : str
        Split name, used both as a directory component and as the file
        name prefix of the output.

    The combined rows are written, tab-separated and without the index, to
    'data/<reg>/<tp>/<tp>_data.tsv', and the path is printed. Returns None.
    """
    standard_lst, dial_lst, mp_d_lst, mp_s_lst, mp_idx_lst = [], [], [], [], []
    for fn in tqdm(file_path):
        try:
            standard, dialect, mp_d, mp_s, mp_idx = get_lst(fn)
        except Exception as err:
            # Skip the file entirely. Falling through here would append the
            # values still bound from the previous iteration a second time
            # (or fail with NameError when the very first file is the bad one).
            print('skipped {}: {!r}'.format(fn, err))
            continue
        standard_lst += standard
        dial_lst += dialect
        mp_d_lst += mp_d
        mp_s_lst += mp_s
        mp_idx_lst += mp_idx

    columns = ['standard', 'dialect', 'mp_d', 'mp_s', 'mp_idx']
    df = pd.DataFrame(
        {
            'standard': standard_lst,
            'dialect': dial_lst,
            'mp_d': mp_d_lst,
            'mp_s': mp_s_lst,
            'mp_idx': mp_idx_lst,
        },
        columns=columns,
    )
    save_path = 'data/{}/{}/{}_data.tsv'.format(reg, tp, tp)
    df.to_csv(save_path, index=False, sep='\t')
    print('saved in {}'.format(save_path))
    

In [29]:
# Regions and splits to convert; each pair names a data/<region>/<split>/ directory.
regions = [
    'jeju',
    'jeonla',
    'kangwon',
    'kyeongsang',
    'chungcheong',
]
types = [
    'train',
    'test',
]

In [30]:
# Convert the JSON files of every region/split directory into one TSV each.
for r in regions:
    for t in types:
        # glob returns paths in arbitrary order; sort them so the row order
        # of the saved TSV is the same on every run.
        files = sorted(glob.glob('data/{}/{}/*.json'.format(r,t)))
        print(r,t)
        extract_df(files, r, t)

  0%|          | 2/5042 [00:00<04:49, 17.39it/s]

jeju train


100%|██████████| 5042/5042 [02:16<00:00, 37.05it/s]
  2%|▏         | 9/596 [00:00<00:07, 81.53it/s]

saved in data/jeju/train/train_data.tsv
jeju test


100%|██████████| 596/596 [00:04<00:00, 127.04it/s]
  0%|          | 0/7412 [00:00<?, ?it/s]

saved in data/jeju/test/test_data.tsv
jeonla train


100%|██████████| 7412/7412 [02:00<00:00, 61.28it/s]
  0%|          | 4/994 [00:00<00:29, 33.24it/s]

saved in data/jeonla/train/train_data.tsv
jeonla test


100%|██████████| 994/994 [00:16<00:00, 60.26it/s]
  0%|          | 6/4717 [00:00<01:24, 56.03it/s]

saved in data/jeonla/test/test_data.tsv
kangwon train


100%|██████████| 4717/4717 [01:19<00:00, 59.09it/s]
  0%|          | 4/828 [00:00<00:21, 38.20it/s]

saved in data/kangwon/train/train_data.tsv
kangwon test


100%|██████████| 828/828 [00:17<00:00, 48.60it/s]
  0%|          | 14/7699 [00:00<00:56, 135.10it/s]

saved in data/kangwon/test/test_data.tsv
kyeongsang train


100%|██████████| 7699/7699 [00:56<00:00, 136.51it/s]
  0%|          | 0/843 [00:00<?, ?it/s]

saved in data/kyeongsang/train/train_data.tsv
kyeongsang test


100%|██████████| 843/843 [00:05<00:00, 142.49it/s]
  0%|          | 18/6048 [00:00<00:34, 176.18it/s]

saved in data/kyeongsang/test/test_data.tsv
chungcheong train


 15%|█▌        | 933/6048 [00:05<00:30, 166.72it/s]

data/chungcheong/train/DCNA20000088.json
data/chungcheong/train/DCNA20000274.json


 40%|███▉      | 2397/6048 [00:15<00:24, 148.76it/s]

data/chungcheong/train/DCNA20000292.json


 46%|████▋     | 2800/6048 [00:18<00:26, 122.65it/s]

data/chungcheong/train/DCNA20000040.json


 55%|█████▌    | 3336/6048 [00:22<00:20, 135.41it/s]

data/chungcheong/train/DCNA20000169.json


 58%|█████▊    | 3502/6048 [00:23<00:14, 178.12it/s]

data/chungcheong/train/DCNA20000213.json


 62%|██████▏   | 3764/6048 [00:25<00:17, 130.19it/s]

data/chungcheong/train/DCNA20000094.json


 63%|██████▎   | 3814/6048 [00:25<00:15, 145.83it/s]

data/chungcheong/train/DCNA20000098.json


 64%|██████▍   | 3888/6048 [00:25<00:13, 158.77it/s]

data/chungcheong/train/DCNA20000290.json


 69%|██████▊   | 4147/6048 [00:28<00:16, 117.41it/s]

data/chungcheong/train/DCNA20000346.json
data/chungcheong/train/DCNA20000260.json


 71%|███████   | 4272/6048 [00:28<00:10, 167.66it/s]

data/chungcheong/train/DCNA20000142.json


 85%|████████▍ | 5112/6048 [00:34<00:06, 149.33it/s]

data/chungcheong/train/DCNA20000086.json


 88%|████████▊ | 5345/6048 [00:36<00:05, 132.62it/s]

data/chungcheong/train/DCNA20000145.json


100%|██████████| 6048/6048 [00:41<00:00, 145.47it/s]
  2%|▏         | 13/780 [00:00<00:06, 125.81it/s]

saved in data/chungcheong/train/train_data.tsv
chungcheong test


100%|██████████| 780/780 [00:04<00:00, 169.04it/s]


saved in data/chungcheong/test/test_data.tsv


## Remove the raw JSON and TXT files

In [31]:
import os

In [32]:
# Regions and splits whose raw files are removed below.
regions = [
    'chungcheong',
    'jeju',
    'jeonla',
    'kangwon',
    'kyeongsang',
]
types = [
    'train',
    'test',
]

In [33]:
# Delete the raw .json files, then the .txt files, of every region/split directory.
for r in regions:
    for t in types:
        for pattern in ('data/{}/{}/*.json', 'data/{}/{}/*.txt'):
            for f in glob.glob(pattern.format(r, t)):
                os.remove(f)