In [None]:
import numpy as np
import os
import pandas as pd
import sh

from collections import Counter
from lxml import etree
from tqdm import tqdm_notebook

In [11]:
base_directory = './word_sense_disambigation_corpora/'
sentences = 1  # 1-based id assigned to the next verb instance written to the output


def _write_verb_instances(fout, sentence, verb_senses, corpus, docname, domain, next_id):
    """Write one output block per sense-annotated verb of a finished sentence.

    Each block is a tab-separated ``META:`` header line, followed by every
    token line of the sentence and a terminating blank line. A sentence with
    several annotated verbs is therefore written several times, and a sentence
    without any annotated verb is not written at all.

    Args:
        fout: open text file to write to.
        sentence: token lines of the sentence, already formatted.
        verb_senses: (index, token, lemma, sense) tuples, one per annotated verb.
        corpus, docname, domain: document-level values copied into the header.
        next_id: instance id to use for the first block written.

    Returns:
        The instance id to use for the next block.
    """
    token_lines = '\n'.join(sentence)
    for vidx, token, lemma, sense in verb_senses:
        meta_string = '\t'.join([
            'META:%s' % corpus,
            'sentence:%05d' % next_id,
            'doc:%s' % docname,
            'domain:%s' % domain,
            'main_lemma:%s' % lemma,
            'main_lemma_index:%d' % vidx,
            'main_token:%s' % token,
            'sense:%s' % sense,
        ])
        print(meta_string, file=fout)
        print(token_lines, file=fout, end='\n\n')
        next_id += 1
    return next_id


with open('google_wsd.col', 'w') as fout:
    for file in tqdm_notebook(sh.find('%smasc' % base_directory, '%ssemcor' % base_directory,
                                      '-type', 'f', '-name', '*.xml')):
        path = file.strip()  # each line yielded by `find` ends with a newline
        root = etree.parse(path).getroot()

        # Document-level metadata is the same for every word, so compute it once per file.
        docname = path.replace(base_directory, '')
        corpus = 'masc' if docname.startswith('masc') else 'semcor'
        # For masc documents the domain is the directory part after the leading 'masc/'.
        domain = 'semcor' if corpus == 'semcor' else os.path.dirname(docname)[5:]

        sentence = []
        verb_senses = []
        for word in root.findall('word'):
            text = word.attrib['text']
            # NOTE(review): whitespace-only tokens are skipped before their break_level
            # is inspected, so a break carried by such a token is ignored - confirm
            # this is intended.
            if text.strip() == '':
                continue

            # A break on this word means the previous sentence is complete: emit it
            # and start collecting a new one with the current word.
            if word.attrib['break_level'] in ('PARAGRAPH_BREAK', 'SENTENCE_BREAK'):
                sentences = _write_verb_instances(fout, sentence, verb_senses,
                                                  corpus, docname, domain, sentences)
                sentence = []
                verb_senses = []

            sidx = len(sentence) + 1  # 1-based token position within the sentence
            if word.attrib.get('pos') == 'VERB':
                lemma = word.attrib['lemma']
                verb_senses.append((sidx, text, lemma, word.attrib['sense']))
                sentence.append('%d\t%s\t%s\t-' % (sidx, text, lemma))
            else:
                sentence.append('%d\t%s\t-\t-' % (sidx, text))

        # No break follows the last sentence of a document, so it has to be flushed
        # explicitly; otherwise the final sentence of every file would be lost.
        sentences = _write_verb_instances(fout, sentence, verb_senses,
                                          corpus, docname, domain, sentences)




In [None]:
# Build one metadata record per instance block of the CoNLL-style file: the fields
# of the block's META header plus a flag telling whether the lemma column of the
# target token line matches the header's main_lemma.
# NOTE(review): the export cell of this notebook writes 'google_wsd.col' while this
# cell reads './google_wsd.conll' - presumably the output of a separate processing
# step; confirm where that file comes from.
sentences = []
last_meta = {}
with open('./google_wsd.conll', 'r') as fin:
    for line in fin:
        columns = line.strip().split()

        if line.startswith('META'):
            # Header fields look like "key:value"; split on the first colon only.
            last_meta = dict(field.split(':', 1) for field in columns)
            # Keep only the part of the sense after the last '/'.
            last_meta['sense'] = last_meta['sense'].split('/')[-1]

        if not columns:
            continue  # blank separator line
        if columns[0] != last_meta['main_lemma_index']:
            continue  # not the line of the target token
        if len(columns) < 3:
            continue  # target line has no lemma column to compare against

        last_meta['correctly_lemmatized'] = last_meta['main_lemma'] == columns[2]
        sentences.append(last_meta)

sentences = pd.DataFrame(
    sentences,
    columns=[
        'META', 'sentence', 'doc', 'domain', 'main_lemma',
        'main_lemma_index', 'main_token', 'sense', 'correctly_lemmatized',
    ],
)

In [None]:
# Number of rows sharing the same grouping keys, broadcast back onto every row.
for _count_column, _group_keys in [
    ('domain_sentence_count', ['main_lemma', 'sense', 'domain']),
    ('sense_sentence_count', ['main_lemma', 'sense']),
    ('lemma_sentence_count', ['main_lemma']),
]:
    sentences[_count_column] = sentences.groupby(_group_keys)['sentence'].transform('count')

# Number of distinct senses observed for each lemma.
sentences['sense_count'] = sentences.groupby(['main_lemma'])['sense'].transform('nunique')

# Number of distinct senses per lemma that occur in at least 3 rows.
sentences['senses_over_threshold'] = sentences['main_lemma'].map(
    sentences.groupby('main_lemma').apply(
        lambda lemma_rows: lemma_rows.loc[lemma_rows.sense_sentence_count >= 3, 'sense'].nunique()
    )
)

# A row is valid when its own sense has at least 3 rows and its lemma has more
# than one sense reaching that threshold.
sentences['is_valid'] = (
    sentences['senses_over_threshold'].gt(1)
    & sentences['sense_sentence_count'].ge(3)
)

In [None]:
# Summary statistics of the per-sense row counts, one value per valid (lemma, sense) pair.
(
    sentences.loc[sentences['is_valid']]
    .groupby(['main_lemma', 'sense'])['sense_sentence_count']
    .first()
    .describe()
)