In [1]:
import os
import argparse
import pandas as pd
import csv
from collections import Counter
import numpy as np
from pathlib import Path

In [38]:
def agg_annotations(l, min_annots=3, min_agreed=3):
    '''
    Leaves only uses annotated at least by min_annots annotators, among which 
    at least min_agreed returned the same sense.
    '''
    if len(l) < min_annots:
        return None
    # take the most frequent answer, or None if less than min_agreed annotators agreed
    # in case of tie, among the most frequent answers most_common() returns the first appeared one 
    label, cnt = Counter(l).most_common(1)[0]
    return None if cnt < min_agreed else label

def load_sense_labels(judgments_senses_path):
    '''
    Loads sense annotations, aggregates annotations by multiple annotators for a singe use.
    '''
    df = pd.read_csv(judgments_senses_path, delimiter="\t")
    senses = pd.read_csv(judgments_senses_path.parent / 'senses.csv', delimiter="\t")
    df = pd.merge(df, senses, on='identifier_sense')
    df = df[~(df['description_sense'] == 'andere')] # 'andere' stands for 'all other senses'
    clusters = df.groupby('identifier')['identifier_sense'].apply(agg_annotations).dropna().reset_index()
    clusters = clusters.rename(columns={'identifier_sense': 'cluster'})
    return clusters

In [39]:
LANG = 'de'
DIR_NAME = f'dwug_{LANG}'
CLUSTERS_NAME = 'sense'
p = Path(DIR_NAME)

if CLUSTERS_NAME == 'sense':
    paths = p.glob(f'data/*/judgments_senses.csv')
    clusters = pd.concat([load_sense_labels(path) for path in paths], ignore_index=True)
else:
    paths = p.glob(f'clusters/{CLUSTERS_NAME}/*.csv')
    clusters = pd.concat([pd.read_csv(path,delimiter="\t", quoting=csv.QUOTE_NONE) for path in paths], ignore_index=True)


In [44]:
paths = p.glob('data/*/uses.csv')
uses = pd.concat([pd.read_csv(path,delimiter="\t", quoting=csv.QUOTE_NONE) for path in paths], ignore_index=True)
uses['grouping'] = uses.grouping.replace(1,'old').replace(2,'new')
rdf = clusters.merge(uses, on='identifier', how='inner', validate='1:1')
assert len(clusters)==len(rdf)
print(f'{len(uses)} uses loaded, {len(rdf)} have gold labels')
print('Uses loaded:',uses.grouping.value_counts().to_dict())
print('Uses with gold labels:',rdf.grouping.value_counts().to_dict())
rdf.head(5)

9125 uses loaded, 826 have gold labels
Uses loaded: {'new': 5000, 'old': 4125}
Uses with gold labels: {'new': 437, 'old': 389}


Unnamed: 0,identifier,cluster,lemma,pos,date,grouping,description,context,indexes_target_token,indexes_target_sentence,context_tokenized,indexes_target_token_tokenized,indexes_target_sentence_tokenized,context_lemmatized,context_pos
0,2532889X_1961-04-10_01_051.tcf.xml-4-2,sense3,überspannen,VVFIN,1961,new,,"Der Handlungsbogen überspannt vier Jahrzehnte,...",19:29,0:304,Der Handlungsbogen überspannt vier Jahrzehnte ...,2,0:53,"d Handlungsbogen überspannen vier Jahrzehnt , ...","ART NN VVFIN CARD NN $, VVFIN APPRART NN ART A..."
1,2532889X_1964-11-23_01_007.tcf.xml-3-10,sense3,überspannen,VVFIN,1964,new,,"Die Brücke, die Insgesamt 4,8 Kilometer lang i...",50:60,0:243,"Die Brücke , die Insgesamt 4,8 Kilometer lang ...",10,0:41,"d Brücke , die insgesamt 4,8 Kilometer lang se...","ART NN $, PRELS ADV CARD NN ADJD VAFIN $, VVFI..."
2,2532889X_1975-10-22_01_155.tcf.xml-2-23,sense3,überspannen,VVFIN,1975,new,,"• Mit der Fertigstellung der ersten 143 Meter,...",140:150,0:248,• Mit der Fertigstellung der ersten 143 Meter ...,23,0:38,"• mit d Fertigstellung d erst 143 Meter , lang...","$( APPR ART NN ART ADJA CARD NN $, ADJA NN $, ..."
3,2532889X_1975-11-24_01_038.tcf.xml-2-22,sense3,überspannen,VVFIN,1975,new,,"Eine 460 Meter lange Brücke, die das künftige ...",117:127,0:209,"Eine 460 Meter lange Brücke , die das künftige...",22,0:36,"eine 460 Meter lang Brücke , die d künftig neu...","ART CARD NN ADJA NN $, PRELS ART ADJA ADJA NN ..."
4,2532889X_1980-05-10_01_109.tcf.xml-5-7,sense3,überspannen,VVFIN,1980,new,,"Die Kadin-Brücke wurde 1470 fertiggestellt, si...",48:58,0:149,"Die Kadin-Brücke wurde 1470 fertiggestellt , s...",7,0:26,"d Kadin-Brücke werden 1470 fertigstellen , sie...","ART NN VAFIN CARD VVPP $, PPER VVFIN APPR ART ..."


In [54]:
def convert_save(df, fpath):
    res = pd.DataFrame({
        'context_id': df.identifier,
        'word': df.lemma,
        'gold_sense_id': df.cluster,
        'positions': df.indexes_target_token.str.replace(":","-"),
        'context': df.context
    })
    print(len(res), fpath)
    fpath.parent.mkdir(parents=True, exist_ok=True)
    res.to_csv(fpath, sep='\t', index=False, quoting=csv.QUOTE_MINIMAL, quotechar='"', doublequote=True)

In [60]:
uses['cluster']=None

for df, path in [(rdf, p.parent), (uses, p.parent / '../datasets_unlabeled/se20lscd')]:
    for pdf, part in [(df.query('grouping=="old"'), 'old'), (df.query('grouping=="new"'), 'new'), (df, 'old+new')]:
        mask = pdf["indexes_target_token"].str.len() > 2  # old code, not sure if we need it
        assert mask.all(), pdf[~mask]
        convert_save(pdf, path / LANG / f'{CLUSTERS_NAME}-{part}.tsv')    

389 de/sense-old.tsv
437 de/sense-new.tsv
826 de/sense-old+new.tsv
4125 ../datasets_unlabeled/se20lscd/de/sense-old.tsv
5000 ../datasets_unlabeled/se20lscd/de/sense-new.tsv
9125 ../datasets_unlabeled/se20lscd/de/sense-old+new.tsv


In [None]:
parser = argparse.ArgumentParser()
parser.add_argument(dest="dir_name")
parser.add_argument(dest="clusters")
parser.add_argument(dest="output_path")
parser.add_argument('--grouping', action='store_true')
parser.add_argument('--split_groupings', action='store_true')
args = parser.parse_args()
DIR_NAME = args.dir_name
CLUSTERS_NAME = args.clusters
OUTPUT_PATH = args.output_path
GROUPING = args.grouping
SPLIT = args.split_groupings

if SPLIT and not GROUPING:
    print('Could not split into groupings without --grouping flags set!')
    exit()


data = pd.DataFrame()
if CLUSTERS_NAME != "sense":
    words = [filename[:-4] for filename in os.listdir("{}/clusters/{}".format(DIR_NAME, CLUSTERS_NAME))]
else:
    words = [filename for filename in os.listdir("{}/data".format(DIR_NAME)) if os.path.exists("{}/data/{}/judgments_senses.csv".format(DIR_NAME, filename))]
for word in sorted(words):
    try:
        sentenses = pd.read_csv("{}/data/{}/uses.csv".format(DIR_NAME, word), delimiter="\t", quoting=csv.QUOTE_NONE)
    except Exception as ex:
        print("can't download data from {}      {}".format("{}/data/{}/uses.csv".format(DIR_NAME, word), ex))
        continue
    if CLUSTERS_NAME != "sense":
        try:
            clusters = pd.read_csv("{}/clusters/{}/{}.csv".format(DIR_NAME, CLUSTERS_NAME, word).format(word), delimiter="\t")
        except Exception as ex:
            print("can't download data from {}      {}".format("{}/clusters/{}/{}.csv".format(DIR_NAME, CLUSTERS_NAME, word), ex))
            continue
        chunk_of_data = pd.merge(sentenses, clusters, left_on='identifier', right_on='identifier')
    else:
        try:
            threshold = 3 # threshold for majority labels
            clusters = pd.read_csv("{}/data/{}/judgments_senses.csv".format(DIR_NAME, word), delimiter="\t")
            senses = pd.read_csv("{}/data/{}/senses.csv".format(DIR_NAME, word), delimiter="\t")
            clusters = pd.merge(clusters, senses, left_on='identifier_sense', right_on="identifier_sense")
            clusters = clusters[~(clusters['description_sense'] == 'andere')] # remove andereinstances
            lemmas = clusters.groupby('identifier').agg({'lemma':lambda x: list(x)[0]})
            judgments = clusters.groupby('identifier')['identifier_sense'].apply(list).reset_index(name='judgments')
            clusters = pd.merge(lemmas, judgments, left_on='identifier', right_on="identifier")
            
            # Extract majority labels
            def extract_majority_label(judgments, threshold):
                judgments = list(judgments)
                label2count = Counter(judgments)
                majority_labels = [l for l, c in label2count.items() if c >= threshold]
                if len(majority_labels) > 0:
                    label = np.random.choice(majority_labels)
                else:
                    label = np.NaN  
                return label
            
            #clusters = clusters[clusters['judgments'].apply(lambda x: len(list(x))>threshold)] # remove instances with less than threshold remaining judgments, not needed for now
            #clusters = clusters[~clusters['judgments'].apply(lambda x: extract_majority_label(list(x), threshold)).isnull()] # remove instances which do not reach threshold for majority labeling, not needed for now
            clusters['identifier_sense'] = clusters['judgments'].apply(lambda x: extract_majority_label(list(x), threshold)) # add majority label column
            clusters = clusters[~clusters['identifier_sense'].isnull()] # remove instances which do not reach threshold for majority labeling
            #print(clusters)
        except Exception as ex:
            print("can't download data from {}      {}".format("{}/data/{}/judgments_senses.csv".format(DIR_NAME, word), ex))
            continue
        chunk_of_data = pd.merge(sentenses, clusters[["identifier", "identifier_sense"]], left_on='identifier', right_on='identifier')
        identifier_sense_to_id_mapping = {ident: idx for idx, ident in enumerate(pd.unique(chunk_of_data['identifier_sense']))}
        chunk_of_data["cluster"] = chunk_of_data['identifier_sense'].apply(lambda x: identifier_sense_to_id_mapping[x])
    data = pd.concat([data, chunk_of_data], ignore_index=True)

data["indexes_target_token"] = data["indexes_target_token"].str.replace(":", "-")
data = data[data["indexes_target_token"].str.len() > 2]
if GROUPING:
    bts_rnc_like_data = pd.DataFrame(
        dict(
            context_id=range(1, len(data['lemma']) + 1),
            word=data['lemma'],
            gold_sense_id=data['cluster'],
            positions=data["indexes_target_token"],
            context=data['context'],
            grouping=data['grouping']))
    if SPLIT:
        grpgs = bts_rnc_like_data['grouping'].unique()
        base_path = OUTPUT_PATH[:-4]
        extension = OUTPUT_PATH[-4:]
        if len(grpgs) > 1:
            for grp in grpgs:
                to_save = bts_rnc_like_data[bts_rnc_like_data['grouping']==grp]
                to_save.to_csv(base_path+'_'+str(grp)+extension, sep='\t', index=False, quoting=csv.QUOTE_MINIMAL, quotechar='"', doublequote=True)
else:
    bts_rnc_like_data = pd.DataFrame(
        dict(
            context_id=range(1, len(data['lemma']) + 1),
            word=data['lemma'],
            gold_sense_id=data['cluster'],
            positions=data["indexes_target_token"],
            context=data['context']))
bts_rnc_like_data.to_csv(OUTPUT_PATH, sep='\t', index=False, quoting=csv.QUOTE_MINIMAL, quotechar='"', doublequote=True)