In [107]:
import shutil
from os import listdir
import fcntl
from os.path import join
import pandas as pd
import re

raw_dr = './raw_tables/Korean'
new_raw = './nnew_auto_raw_tables/'
dr = './annotated_tables/Korean'
newdr = './nnew_pred_annotated_tables'
cor_ann = './nnew_auto_annotated_tables/'
if not os.path.exists(new_raw):
    os.makedirs(new_raw)
if not os.path.exists(newdr):
    os.makedirs(newdr)
if not os.path.exists(cor_ann):
    os.makedirs(cor_ann)

In [108]:
def make_copies(pos, dr, new_raw, smp, newdr):
    
    for f in listdir(new_raw):
        if f in ['ADJ_000001_(21,8)_example.csv', 'ADJ_000001_(30,8)_example.csv']:
            shutil.copy(join(dr, f), join(newdr, f))
        if pos in f and 'csv' in f:
            if 'lock' not in f:
                shutil.copy(join(dr, smp), join(newdr, f))
            else:
                fcntl.flock(join(dr, f), fcntl.LOCK_UN)
                shutil.copy(join(dr, smp), join(newdr, f[7:-1]))

                
def clean_raw(raw_dr, new_raw):
    for f in listdir(raw_dr):
        df = pd.read_csv(join(raw_dr, f), sep=None, engine='python')
        
        f, pos, ck = check_pos(df, f, new_raw)
        if not ck:
            print(f, '- too many tables')
            continue
        
        for i, row in enumerate(df.iterrows()):
            if 'Formal non-polite(해라체)' in row[1].unique():
                if pos == 'ADJ':
                    st = i
                    en = i + 34
                if pos == 'V':
                    st = i
                    en = i + 40
                break
                
        df = pd.DataFrame([i[1].values[0:6] for i in list(df.iterrows())[st:en]])
        
        for i in df.columns:
            df[i] = df[i].str.replace(u'(([\u3131-\u3163\uac00-\ud7a3]+, )?[\u3131-\u3163\uac00-\ud7a3]+)(-?[a-z, ]+)', u'\g<1>', regex=True)
        
        df.to_csv(join(new_raw, f), header=False, index=False)
        

def check_pos(df, fn, new_raw):
    print(fn)
    if df[df.columns[0]].value_counts()['Indicative'] > 4:
        return fn, None, False
    if 'Imperative' in df[df.columns[0]].values or 'Hortative' in df[df.columns[0]].values or 'Motive' in df[df.columns[0]].values:
        pos = 'V'
    else:
        pos = 'ADJ'
    nfn = fn.split('_')
    nfn[0] = pos
    nfn = '_'.join(nfn)
    if nfn in listdir(new_raw):
        nfn = nfn[:-4] + '_' + str(len(listdir(new_raw))) + '.csv'
    return nfn, pos, True


def correct_anno(newdr, new_raw, cor_ann):
    
    for f in listdir(newdr):
        if 'tsv' not in f and 'csv' not in f or f  in ['ADJ_000001_(21,8)_example.csv', 'ADJ_000001_(30,8)_example.csv']:
            continue
        df1 = pd.read_csv(join(newdr, f), sep=None, engine='python')
        df2 = pd.read_csv(join(new_raw, f), sep=None, engine='python')
            
        for i in df2.columns:
            nv = []
            if len(df2[i]) > len(df1[i]):
                df2 = df2.drop(range(len(df1[i]), len(df2[i])), axis=0)
            for j, rw in enumerate(df2[i]):
                anno = df1[i][j]
                if str(anno) == 'nan':
                    nv.append(anno)
                    continue
                an_0 = str(anno).split(', ')[0]
                nv.append(', '.join([an_0] * len(str(rw).split(', '))))
            if len(df1[i]) > len(df2[i]):
                df1 = df1.drop(range(len(df2[i]), len(df1[i])), axis=0)
            df1[i] = nv
        df1.to_csv(join(cor_ann, f), header=True, index=False)

In [109]:
clean_raw(raw_dr, new_raw)

V_000001_(137,8)_example.csv
V_000001_(137,8)_example.csv - too many tables
V_000002_(31,8)_example.csv
ADJ_000005_(49,8)_example.csv
V_000001_(76,14)_example.csv
V_000001_(63,8)_example.csv
ADJ_000006_(43,6)_example.csv
ADJ_000001_(137,8)_example.csv
ADJ_000001_(137,8)_example.csv - too many tables
V_000003_(47,10)_example.csv
V_000001_(101,8)_example.csv
V_000001_(101,8)_example.csv - too many tables
V_000001_(49,8)_example.csv
V_000001_(62,8)_example.csv
ADJ_000002_(73,8)_example.csv
V_000014_(47,8)_example.csv
V_000002_(80,8)_example.csv
V_000002_(80,8)_example.csv - too many tables
V_000001_(71,8)_example.csv
V_000002_(99,8)_example.csv
V_000002_(99,8)_example.csv - too many tables
ADJ_000001_(55,16)_example.csv
V_000001_(56,10)_example.csv
V_000001_(38,8)_example.csv
ADJ_000099_(38,8)_example.csv
V_000233_(45,8)_example.csv
V_000037_(56,8)_example.csv
ADJ_000008_(94,8)_example.csv
ADJ_000008_(94,8)_example.csv - too many tables
V_000002_(53,8)_example.csv
V_000001_(75,14)_example

In [146]:
smp = 'ADJ_000001_(40,8)_example.csv'
pos = 'ADJ'
make_copies(pos, dr, new_raw, smp, newdr)

In [147]:
smp = 'verb_paradigm.csv'
pos = 'V'
make_copies(pos, dr, new_raw, smp, newdr)

In [148]:
correct_anno(newdr, new_raw, cor_ann)

In [149]:
for f in ['ADJ_000001_(21,8)_example.csv', 'ADJ_000001_(30,8)_example.csv']:
    df1 = pd.read_csv(join(dr, f), sep=None, engine='python')
    df1.to_csv(join(cor_ann, f), header=False, index=False)

In [None]:
new_new_raw = './uni_raw'

def make_unimorph(new_raw, cor_ann, raw_dr):
    for f in listdir(raw_dr):
        df = pd.read_csv(join(raw_dr, f), sep=None, engine='python')
        f, ck = check_pos(df, f, new_new_raw)

    return fn, True

In [172]:
import codecs
import re
import pandas as pd
import argparse
from collections import defaultdict
import sys
import os

# gather arguments
'''parser = argparse.ArgumentParser(
    description="Extract tabular paradigms from annotated templates."
)
parser.add_argument(
    "-candidates_dir",
    action="store",
    dest="candidates_dir",
    help="Location of candidate html pages.",
)
parser.add_argument(
    "-annotation_dir",
    action="store",
    dest="annotation_dir",
    help="Location of raw/annotated table templates.",
)
parser.add_argument(
    "-language", action="store", dest="language", help="Language to grab."
)
args = parser.parse_args()'''

# regular expressions
lempat = r"<h1.*?>(.*?)</h1>"
locpat1 = r"</h2>.*?</h2>"
locpat2 = r"</h2>.*?</body>"
pospat = r">(.*?)</h3>"


# input tables
orig_dir = "./nnew_auto_raw_tables/"
  # original example tables for comparison #CHANGE
done_dir = "./nnew_auto_annotated_tables/"
 # annotated example tables #CHANGE

# output directory
out_dir = "./tabular_results/"  # output data goes here
if not os.path.exists(out_dir):
    os.makedirs(out_dir)
    
candidates_dir = "./candidate_pages/"

# language
language = "Korean"

# output file
fout_name = out_dir + language + "_tabular_paradigms.txt"
fout = codecs.open(fout_name, "wb", "utf-8")

# get the table patterns
tables = {}


def clean_table(df):
    
    if "Indicative" not in df[df.columns[0]].values:
        return df, ""
    if df[df.columns[0]].value_counts()['Indicative'] > 4:
        return df, ""
    if "Imperative" in df[df.columns[0]].values or "Hortative" in df[df.columns[0]].values or "Motive" in df[df.columns[0]].values:
        pos = "V"
    else:
        pos = "ADJ"
            
    for i, row in enumerate(df.iterrows()):
        if "Formal non-polite(해라체)" in list(row[1].unique()) or "Formal non-polite|(해라체)" in list(row[1].unique()):
            if pos == "ADJ":
                st = i
                en = i + 34
            if pos == "V":
                st = i
                en = i + 40
            break
                
    df = pd.DataFrame([i[1].values[0:6] for i in list(df.iterrows())[st:en]])
        
    for i in df.columns:
        df[i] = df[i].str.replace(u"(([\u3131-\u3163\uac00-\ud7a3]+, )?[\u3131-\u3163\uac00-\ud7a3]+)(-?[a-z, ]+)", u"\g<1>", regex=True)
        
    #df.to_csv(join(new_raw, f[:-3] + 'tsv'), sep='\t', header=False, index=False)
    new_header = df.iloc[0] #grab the first row for the header
    df = df[1:] #take the data less the header row
    df.columns = new_header
    return df, pos


# loop through annotated directory
for n in os.listdir(done_dir):
    if n.endswith("csv"):
        mod_set = {}
        try:
            odata = pd.read_csv(orig_dir + n, sep=None, engine='python', header=0).fillna(
                ""
            )
            ddata = pd.read_csv(done_dir + n, sep=None, engine='python', header=0).fillna(
                ""
            )
        except:
            print(n)
            #raise
            continue
        try:
            assert odata.shape == ddata.shape  # make sure we really got the same table
        except:
            print(n)
            #print(odata)
            #print(ddata)
            print(odata.shape)
            print(ddata.shape)
            #raise
            continue
        for i in range(odata.shape[0]):
            for j in range(odata.shape[1]):
                if odata.iloc[i, j] != ddata.iloc[i, j]:
                    mod_set[(i, j)] = ddata.iloc[i, j]

        # if there were some actual annotations store them
        if len(mod_set) > 0:
            if n.startswith("N_"):
                tables["N_" + str(odata.shape)] = mod_set
            if n.startswith("ADJ_"):
                tables["ADJ_" + str(odata.shape)] = mod_set
            if n.startswith("V_"):
                tables["V_" + str(odata.shape)] = mod_set

k = 0
# #loop through languages
lnames = os.listdir(candidates_dir)  # CHANGE
for ln in lnames:
    if ln == language:
        names = os.listdir(os.path.join(candidates_dir, ln))  # CHANGE
        # loop through language pages
        # count = 0
        for n in names:
            # if n.startswith('candidate_33623.html'):
            if n.startswith("candidate"):
                fin = codecs.open(
                    os.path.join(candidates_dir, ln, n), "rb", "utf-8"
                )  # CHANGE
                page = fin.read()#.replace("<br>", "|")
                fin.close()

                # get the lemma from the page
                match = re.search(lempat, page, flags=re.U | re.DOTALL)
                if match:
                    lemma = match.group(1)
                    # print lemma
                    match = re.search(ln + locpat1, page, flags=re.U | re.DOTALL)
                    if not match:
                        match = re.search(ln + locpat2, page, flags=re.U | re.DOTALL)
                    if match:
                        text = match.group()
                        text = re.sub('colspan="100%"', 'colspan="0"', text, flags=re.U | re.DOTALL)
                        try:
                            data = pd.read_html(text)
                            if len(data) >= 1:
                                data = pd.concat(data)
                                data, pos = clean_table(data)
                                shape = data.shape
                                if pos + '_' + str(shape) in tables:
                                    for mod, feats in tables[pos + '_' + str(shape)].items():
                                        words = data.iloc[mod[0], mod[1]]
                                        if not pd.isnull(words):
                                            for word in words.split(', '): 
                                                if re.match(u'[a-zA-Z]+', word) is None and word != '':
                                                    fout.write(
                                                        re.search('>([^<]+)<', lemma).group(1)
                                                        + "\t"
                                                        + word
                                                        + "\t"
                                                        + feats.split(', ')[0]
                                                        + "\n"
                                                )
                            else:
                                k+=1
                        except:
                                # if data.shape == (6,8):
                                # 	print data
                            #raise
                            k+=1
                            #fout.write("----\t----\t----\n")
                            #fout.write("\n")
                            pass


# clean up
fout.close()

ADJ_000001_(55,16)_example.csv
(33, 6)
(32, 6)
ADJ_000003_(69,14)_example.csv
(33, 6)
(32, 6)
ADJ_000001_(21,8)_example.csv
ADJ_000001_(46,16)_example.csv
(33, 6)
(32, 6)
ADJ_000001_(30,8)_example.csv
(15, 6)
(29, 8)
ADJ_000001_(68,14)_example.csv
(33, 6)
(32, 6)


In [169]:
k

36

In [171]:
nl_raw = './non_lemmas_raw_tables/'
nl_anno = './non_lemmas_annotated_tables/'
fout_name = out_dir + language + "_non_lemmas_tabular_paradigms.txt"
fout = codecs.open(fout_name, "wb", "utf-8")
for n in os.listdir(nl_raw):
    with open (join(nl_raw, n), encoding='utf-8') as f:
        word = f.read().split('\n')[0]
    with open (join(nl_anno, n), encoding='utf-8') as f:
        feats = f.read().split('\n')[0]
    fout.write('lemma' + "\t" + word + "\t" + feats + "\n")
fout.close()

In [173]:
240139 + 12

240151