In [465]:
import shutil
from os import listdir
import fcntl
from os.path import join
import pandas as pd
import re

# Directory layout used by the cells below (all paths relative to the notebook).
# raw_dr:  input directory read by clean_raw.
raw_dr = './raw_tables/Korean'
# new_raw: output directory of clean_raw; also listed by make_copies and read by correct_anno.
new_raw = './auto_raw_tables'
# dr:      directory holding the annotation templates / hand-annotated files copied by make_copies.
dr = './annotated_tables/Korean'
# newdr:   destination of make_copies and input of correct_anno.
newdr = './pred_annotated_tables'
# cor_ann: output directory of correct_anno.
cor_ann = './auto_annotated_tables'


In [468]:
def make_copies(pos, dr, new_raw, smp, newdr):
    """Seed `newdr` with one annotation file per cleaned table of a given POS.

    Every file in `new_raw` whose name contains both `pos` and 'tsv' receives
    a copy of the template `dr/smp`, stored in `newdr` under the table's own
    file name. The two hard-coded example tables are instead copied from
    their own '.csv' annotation in `dr`.

    Parameters
    ----------
    pos : str
        Substring identifying the part of speech in file names (the notebook
        passes 'ADJ' or 'V').
    dr : str
        Directory containing the template `smp` and the example annotations.
    new_raw : str
        Directory whose file names decide which copies are made.
    smp : str
        File name (inside `dr`) of the template annotation to copy.
    newdr : str
        Destination directory; existing files of the same name are overwritten.
    """
    # Tables that already have their own annotation file in `dr`.
    hand_annotated = ('ADJ_000001_(21,8)_example.tsv', 'ADJ_000001_(30,8)_example.tsv')

    for f in listdir(new_raw):
        if f in hand_annotated:
            shutil.copy(join(dr, f[:-3] + 'csv'), join(newdr, f))
            # Do not fall through: the template copy below would overwrite
            # the hand-made annotation that was just copied.
            continue

        if pos not in f or 'tsv' not in f:
            continue

        # Names containing 'lock' are presumably editor lock markers of the
        # form '.~lock.<name>#' (7-character prefix, 1-character suffix);
        # the slice recovers <name>. No unlocking is attempted: fcntl.flock
        # needs an open file descriptor, not a path, and raised TypeError here.
        target = f[7:-1] if 'lock' in f else f
        shutil.copy(join(dr, smp), join(newdr, target))

                
def clean_raw(raw_dr, new_raw):
    """Extract and clean one conjugation table from every file in `raw_dr`.

    For each input file the function:
      1. reads it with a sniffed delimiter,
      2. asks `check_pos` for the part of speech and the output file name
         (files rejected by `check_pos` are reported and skipped),
      3. locates the first row containing 'Formal non-polite(해라체)' and keeps
         a fixed number of rows from there (34 for 'ADJ', 40 for 'V'),
         restricted to the first six columns,
      4. strips the lowercase-Latin text that follows a Hangul form in every
         text cell,
      5. writes the result to `new_raw` as a header-less, tab-separated file
         with a 'tsv' extension.

    Parameters
    ----------
    raw_dr : str
        Directory of raw input tables.
    new_raw : str
        Output directory for the cleaned tables.
    """
    # Rows to keep, counted from the marker row, per part of speech.
    table_rows = {'ADJ': 34, 'V': 40}

    for f in listdir(raw_dr):
        df = pd.read_csv(join(raw_dr, f), sep=None, engine='python')

        f, ck = check_pos(df, f, new_raw)
        if not ck:
            print(f, '- too many tables')
            continue

        # check_pos stores the detected POS in the first '_'-separated field
        # of the returned name. Reading it from there (rather than from a
        # notebook-global `pos`) keeps the slice length tied to this file.
        pos = f.split('_')[0]

        st = None
        for i, (_, row) in enumerate(df.iterrows()):
            if 'Formal non-polite(해라체)' in row.unique():
                st = i
                break
        if st is None:
            # Without the marker row there is nothing to slice; previously
            # this fell through with undefined or stale bounds.
            print(f, '- no table header row found')
            continue
        en = st + table_rows[pos]

        df = pd.DataFrame([r.values[0:6] for _, r in df.iloc[st:en].iterrows()])

        for i in df.columns:
            # The .str accessor is unavailable on numeric columns (e.g. a
            # column that is entirely empty), so leave those untouched.
            if pd.api.types.is_numeric_dtype(df[i]):
                continue
            # Keep group 1 (the Hangul form, optionally 'form, form') and drop
            # the trailing run of lowercase Latin letters, commas and spaces.
            df[i] = df[i].str.replace(u'(([\u3131-\u3163\uac00-\ud7a3]+, )?[\u3131-\u3163\uac00-\ud7a3]+)(-?[a-z, ]+)', r'\g<1>', regex=True)

        df.to_csv(join(new_raw, f[:-3] + 'tsv'), sep='\t', header=False, index=False)
        

def check_pos(df, fn, new_raw):
    """Decide a table's part of speech from its first column and rename it.

    Parameters
    ----------
    df : pandas.DataFrame
        The raw table; only its first column is inspected.
    fn : str
        Original file name; its first '_'-separated field is replaced by the
        detected part of speech.
    new_raw : str
        Output directory, listed to detect a name collision.

    Returns
    -------
    (str, bool)
        `(fn, False)` with the name unchanged when the first column contains
        'Indicative' more than four times (the caller reports this as
        "too many tables"); otherwise `(new_name, True)`, where the prefix is
        'V' if the first column contains 'Imperative', 'Hortative' or
        'Motive', and 'ADJ' if it contains none of them.
    """
    labels = df[df.columns[0]]

    # Counting with a boolean mask yields 0 when the label is absent, whereas
    # value_counts()['Indicative'] raised KeyError in that case.
    if int((labels == 'Indicative').sum()) > 4:
        return fn, False

    if labels.isin(['Imperative', 'Hortative', 'Motive']).any():
        pos = 'V'
    else:
        pos = 'ADJ'

    parts = fn.split('_')
    parts[0] = pos
    fn = '_'.join(parts)

    # NOTE(review): this compares the '.csv' name against the directory
    # listing, while clean_raw in this file writes its output with a 'tsv'
    # extension, so the check appears to trigger only if `new_raw` also holds
    # same-named '.csv' files. Confirm whether that is intended.
    existing = listdir(new_raw)
    if fn in existing:
        fn = fn[:-4] + '_' + str(len(existing)) + '.csv'
    return fn, True


def correct_anno(newdr, new_raw, cor_ann):
    """Align each annotation table in `newdr` with its raw table in `new_raw`.

    For every file in `newdr` whose name contains 'tsv' or 'csv' (except the
    two hard-coded example tables), the annotation table and the same-named
    raw table are read with a sniffed delimiter. Column by column (using the
    raw table's column names), every non-missing annotation cell is replaced
    by its first ', '-separated item repeated once per ', '-separated item of
    the matching raw cell; missing annotation cells are kept as they are.
    Whichever table has more rows is truncated to the length of the other.
    The result is written to `cor_ann` as a tab-separated file with header
    and a 'tsv' extension.

    Parameters
    ----------
    newdr : str
        Directory with the annotation tables to correct.
    new_raw : str
        Directory with the raw tables, matched by file name.
    cor_ann : str
        Output directory.
    """
    excluded = ('ADJ_000001_(21,8)_example.tsv', 'ADJ_000001_(30,8)_example.tsv')

    def spread(label, raw_cell):
        # Missing annotations (rendered as 'nan') pass through unchanged.
        if str(label) == 'nan':
            return label
        head = str(label).split(', ')[0]
        n_forms = len(str(raw_cell).split(', '))
        return ', '.join([head] * n_forms)

    for fname in listdir(newdr):
        looks_like_table = 'tsv' in fname or 'csv' in fname
        if not looks_like_table or fname in excluded:
            continue

        annotated = pd.read_csv(join(newdr, fname), sep=None, engine='python')
        raw = pd.read_csv(join(new_raw, fname), sep=None, engine='python')

        for col in raw.columns:
            n_raw = len(raw[col])
            n_annotated = len(annotated[col])
            # Surplus raw rows are dropped before the cells are paired up.
            if n_raw > n_annotated:
                raw = raw.drop(range(n_annotated, n_raw), axis=0)

            rebuilt = [
                spread(annotated[col][row_no], raw_cell)
                for row_no, raw_cell in enumerate(raw[col])
            ]

            # Surplus annotation rows are dropped only now, so that the
            # rebuilt column fits the frame it is assigned to.
            n_raw = len(raw[col])
            n_annotated = len(annotated[col])
            if n_annotated > n_raw:
                annotated = annotated.drop(range(n_raw, n_annotated), axis=0)
            annotated[col] = rebuilt

        out_path = join(cor_ann, fname[:-3] + 'tsv')
        annotated.to_csv(out_path, sep='\t', header=True, index=False)

In [469]:
# Clean every table in raw_dr and write the results to new_raw; rejected files are printed below.
clean_raw(raw_dr, new_raw)

V_000001_(137,8)_example.csv - too many tables
ADJ_000001_(137,8)_example.csv - too many tables
V_000001_(101,8)_example.csv - too many tables
V_000002_(80,8)_example.csv - too many tables
V_000002_(99,8)_example.csv - too many tables
ADJ_000008_(94,8)_example.csv - too many tables
ADJ_000001_(93,8)_example.csv - too many tables
V_000001_(108,8)_example.csv - too many tables
V_000008_(94,8)_example.csv - too many tables
ADJ_000001_(99,8)_example.csv - too many tables
V_000002_(95,8)_example.csv - too many tables
V_000001_(93,8)_example.csv - too many tables
ADJ_000006_(92,8)_example.csv - too many tables
V_000006_(92,8)_example.csv - too many tables
ADJ_000002_(95,8)_example.csv - too many tables


In [443]:
# Adjective tables: copy the template annotation `smp` from dr into newdr
# once for every file in new_raw whose name contains 'ADJ' and 'tsv'.
smp = 'ADJ_000001_(40,8)_example.csv'
pos = 'ADJ'
make_copies(pos, dr, new_raw, smp, newdr)

In [444]:
# Verb tables: same procedure as for adjectives, with the verb template
# and the 'V' file-name marker. Note this rebinds the globals smp and pos.
smp = 'verb_paradigm.csv'
pos = 'V'
make_copies(pos, dr, new_raw, smp, newdr)

In [460]:
# Align the copied annotations in newdr with the cleaned tables in new_raw; output goes to cor_ann.
correct_anno(newdr, new_raw, cor_ann)

In [467]:
# The two example tables (whose '.tsv' names correct_anno skips) are taken
# directly from dr: read with a sniffed delimiter and re-written to cor_ann
# as tab-separated files with a 'tsv' extension.
for f in ['ADJ_000001_(21,8)_example.csv', 'ADJ_000001_(30,8)_example.csv']:
    df1 = pd.read_csv(join(dr, f), sep=None, engine='python')
    df1.to_csv(join(cor_ann, f[:-3] + 'tsv'), sep='\t', header=True, index=False)