In [30]:
import json
from nltk import wordpunct_tokenize
from bs4 import BeautifulSoup as Soup
import networkx as nx
from collections import defaultdict
import copy

In [31]:
def parse_synset(file, all_synsets=None):
    """Collect the name and definition of every ``synset`` element in an XML file.

    Args:
        file: path of the XML file to read.
        all_synsets: optional dict to extend in place; a new dict is created
            when omitted. Entries with an already-present id are overwritten.

    Returns:
        The dict mapping each synset ``id`` attribute to
        ``{'name': <ruthes_name attribute>, 'definition': <definition attribute>}``.

    Raises:
        KeyError: if a ``synset`` element lacks one of the three attributes.
    """
    if all_synsets is None:
        all_synsets = {}
    # Context manager so the file handle is released even if reading fails.
    # NOTE(review): the file is decoded with the platform default encoding, as
    # before - confirm the XML encoding and pass it explicitly if it differs.
    with open(file) as xml_file:
        soup = Soup(xml_file.read())
    for element in soup.find_all('synset'):
        attrs = element.attrs
        all_synsets[attrs['id']] = {'name': attrs['ruthes_name'],
                                    'definition': attrs['definition']}
    return all_synsets

In [32]:
def parse_senses(file):
    """Group the names of all ``sense`` elements in an XML file by synset id.

    Args:
        file: path of the XML file to read.

    Returns:
        A ``defaultdict(list)`` mapping each ``synset_id`` attribute to the list
        of ``name`` attributes of its senses, in document order. Because it is a
        defaultdict, looking up an unknown id yields (and stores) an empty list.

    Raises:
        KeyError: if a ``sense`` element lacks ``synset_id`` or ``name``.
    """
    # Context manager so the file handle is released even if reading fails.
    # NOTE(review): the file is decoded with the platform default encoding, as
    # before - confirm the XML encoding and pass it explicitly if it differs.
    with open(file) as xml_file:
        soup = Soup(xml_file.read())
    all_senses = defaultdict(list)
    for element in soup.find_all('sense'):
        attrs = element.attrs
        all_senses[attrs['synset_id']].append(attrs['name'])
    return all_senses

In [33]:
def parse_wordnet(file, synsets, senses=None, G=None, directed=False):
    """Build (or extend) a hyponymy graph from a synset-relations XML file.

    Every ``relation`` element whose ``name`` is ``'hyponym'`` or
    ``'instance hyponym'`` adds an edge ``parent_id -> child_id``. Each node
    carries four attributes: ``in_edges`` (ids of its parents), ``out_edges``
    (ids of its children), ``text`` and ``definition``. Synsets that appear in
    no such relation are added afterwards as isolated nodes. Three progress
    lines with node/edge counts are printed along the way.

    Args:
        file: path of the relations XML file.
        synsets: mapping synset id -> ``{'name': ..., 'definition': ...}``.
        senses: optional mapping synset id -> list of sense names. When given
            it supplies ``text`` (as a copy); otherwise ``text`` is a
            one-element list holding the synset name.
        G: optional existing graph to extend; a new one is created when omitted.
        directed: whether the graph is (or, for a supplied ``G``, must be)
            directed.

    Returns:
        The populated graph (the same object as ``G`` when one was supplied).

    Raises:
        TypeError: if a supplied ``G`` does not match ``directed``.
        KeyError: if a relation references an id missing from ``synsets``.
    """
    if G is None:
        G = nx.DiGraph() if directed else nx.Graph()
    elif directed and not G.is_directed():
        raise TypeError('Graph is not directed')
    elif not directed and G.is_directed():
        raise TypeError('Graph should not be directed')

    def _text_of(synset_id):
        # Always hand out a fresh list so node attributes never alias `senses`;
        # .get() avoids inserting empty entries into a defaultdict.
        if senses is not None:
            return list(senses.get(synset_id, []))
        return [synsets[synset_id]['name']]

    print('Input graph: {} nodes, {} edges'.format(len(G.nodes), len(G.edges)))
    # Context manager so the file handle is released even if reading fails.
    # NOTE(review): decoded with the platform default encoding, as before.
    with open(file) as xml_file:
        soup = Soup(xml_file.read())

    for element in soup.find_all('relation'):
        relation = element.attrs
        parent_id = relation['parent_id']
        child_id = relation['child_id']
        if relation['name'] not in ('hyponym', 'instance hyponym'):
            continue
        for node_id in (parent_id, child_id):
            if node_id not in G:
                G.add_node(node_id)
            # Nodes of a caller-supplied graph may lack the bookkeeping lists.
            G.nodes[node_id].setdefault('in_edges', [])
            G.nodes[node_id].setdefault('out_edges', [])
        G.add_edge(parent_id, child_id)
        parent = G.nodes[parent_id]
        child = G.nodes[child_id]
        # Build new lists instead of appending in place, so lists shared with
        # another graph (e.g. after a shallow copy) are left untouched.
        parent.update(out_edges=parent['out_edges'] + [child_id],
                      text=_text_of(parent_id),
                      definition=synsets[parent_id]['definition'])
        child.update(in_edges=child['in_edges'] + [parent_id],
                     text=_text_of(child_id),
                     definition=synsets[child_id]['definition'])
    print('Updated graph: {} nodes, {} edges'.format(len(G.nodes), len(G.edges)))

    # Synsets that took part in no hyponym relation become isolated nodes.
    for syn in synsets:
        if syn not in G:
            G.add_node(syn, out_edges=[], in_edges=[], text=_text_of(syn),
                       definition=synsets[syn]['definition'])
    print('Graph with orphan nodes: {} nodes, {} edges'.format(len(G.nodes), len(G.edges)))
    return G

In [34]:
# Per-synset sense names and synset name/definition records, nouns first, then verbs.
all_senses_noun, all_senses_verb = map(
    parse_senses, ('ruwordnet/senses.N.xml', 'ruwordnet/senses.V.xml'))
all_synsets_noun, all_synsets_verb = map(
    parse_synset, ('ruwordnet/synsets.N.xml', 'ruwordnet/synsets.V.xml'))

In [35]:
# Undirected hyponymy graphs (parse_wordnet's default mode): nouns, then verbs.
G_full_noun, G_full_verb = (
    parse_wordnet(relations_xml, synset_table, sense_table)
    for relations_xml, synset_table, sense_table in (
        ('ruwordnet/synset_relations.N.xml', all_synsets_noun, all_senses_noun),
        ('ruwordnet/synset_relations.V.xml', all_synsets_verb, all_senses_verb),
    )
)

Input graph: 0 nodes, 0 edges
Updated graph: 29295 nodes, 39110 edges
Graph with orphan nodes: 29296 nodes, 39110 edges
Input graph: 0 nodes, 0 edges
Updated graph: 7408 nodes, 10317 edges
Graph with orphan nodes: 7521 nodes, 10317 edges


In [36]:
# Directed hyponymy graphs (edges run parent -> child): nouns, then verbs.
G_full_dir_noun, G_full_dir_verb = (
    parse_wordnet(relations_xml, synset_table, sense_table, directed=True)
    for relations_xml, synset_table, sense_table in (
        ('ruwordnet/synset_relations.N.xml', all_synsets_noun, all_senses_noun),
        ('ruwordnet/synset_relations.V.xml', all_synsets_verb, all_senses_verb),
    )
)

Input graph: 0 nodes, 0 edges
Updated graph: 29295 nodes, 39110 edges
Graph with orphan nodes: 29296 nodes, 39110 edges
Input graph: 0 nodes, 0 edges
Updated graph: 7408 nodes, 10317 edges
Graph with orphan nodes: 7521 nodes, 10317 edges


In [37]:
# A node with an empty 'in_edges' list is a root; one with an empty 'out_edges' list is a leaf.
roots_noun = [v for v, attrs in G_full_noun.nodes(data=True) if not attrs['in_edges']]
leaves_noun = [v for v, attrs in G_full_noun.nodes(data=True) if not attrs['out_edges']]
roots_verb = [v for v, attrs in G_full_verb.nodes(data=True) if not attrs['in_edges']]
leaves_verb = [v for v, attrs in G_full_verb.nodes(data=True) if not attrs['out_edges']]
print('Root nodes: {} nouns, {} verbs'.format(*map(len, (roots_noun, roots_verb))))
print('Leaf nodes: {} nouns, {} verbs'.format(*map(len, (leaves_noun, leaves_verb))))

Root nodes: 9 nouns, 172 verbs
Leaf nodes: 19083 nouns, 4631 verbs


In [39]:
def get_depth(G, roots, node):
    """Return the largest shortest-path length from any of ``roots`` to ``node``.

    For each root that can reach ``node`` in ``G`` the shortest-path length is
    computed; the maximum of those lengths is returned. Roots that cannot reach
    ``node`` (or that are not in ``G``) are skipped.

    Args:
        G: networkx graph to search.
        roots: iterable of candidate root nodes.
        node: target node.

    Returns:
        The maximum shortest-path length (number of edges) over reachable roots.

    Raises:
        ValueError: if no root reaches ``node`` (including an empty ``roots``).
    """
    depths = []
    for root in roots:
        try:
            depths.append(nx.shortest_path_length(G, root, node))
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # Only "unreachable" is expected here; anything else (including
            # KeyboardInterrupt) must propagate instead of being swallowed.
            continue
    if not depths:
        raise ValueError('node {!r} is not reachable from any root'.format(node))
    return max(depths)

In [41]:
# Keep the leaves whose get_depth value, measured on the directed graphs, is at least 5.
leaf_d5_noun = list(filter(
    lambda leaf: get_depth(G_full_dir_noun, roots_noun, leaf) >= 5, leaves_noun))
leaf_d5_verb = list(filter(
    lambda leaf: get_depth(G_full_dir_verb, roots_verb, leaf) >= 5, leaves_verb))
print('Leaf nodes of depth 5+: {} nouns, {} verbs'.format(*map(len, (leaf_d5_noun, leaf_d5_verb))))

Leaf nodes of depth 5+: 14649 nouns, 2357 verbs


In [42]:
def to_text_component(G, out_file, nodes=None, single_word=False):
    """Write a TSV with one row per connected component of each node's ancestors.

    For every requested node, its parents (``in_edges``) and the parents of
    those parents are collected. The subgraph of ``G`` induced by that set is
    split into connected components and each component becomes one row:
    node id, node text, JSON list of the component's ids, JSON list of the
    component's texts. Nodes without parents produce no rows.

    ``G`` has to be undirected, because ``nx.connected_components`` is not
    implemented for directed graphs.

    Args:
        G: undirected graph whose nodes carry ``text`` and ``in_edges``.
        out_file: path of the TSV file to (over)write, encoded as UTF-8.
        nodes: iterable of node ids to export; all nodes of ``G`` when omitted.
        single_word: when true, only texts without a space are kept (joined
            with ``','``) and nodes with no such text are skipped; otherwise
            all texts are joined with ``'; '``.
    """
    if nodes is None:
        nodes = G.nodes
    # Explicit UTF-8: the rows contain non-ASCII text (ensure_ascii=False), so
    # the locale default must not decide the encoding. The context manager
    # closes the file even if a row fails.
    with open(out_file, 'w', encoding='utf-8') as out:
        out.write('SYNSET_ID\tTEXT\tPARENTS\tPARENT_TEXTS\n')
        for n in nodes:
            node_attrs = G.nodes[n]
            if single_word:
                single_words = [txt for txt in node_attrs['text'] if ' ' not in txt]
                if not single_words:
                    continue
                text = ','.join(single_words)
            else:
                text = '; '.join(node_attrs['text'])

            # Parents plus grandparents, de-duplicated.
            ancestors = set()
            for p in node_attrs['in_edges']:
                ancestors.add(p)
                ancestors.update(G.nodes[p]['in_edges'])
            if not ancestors:
                # Nothing to group, so no row is written for this node.
                continue

            subG = nx.subgraph(G, ancestors)
            # Sort ids within a component, and the components themselves, so
            # the file is identical across runs (set order is hash-dependent).
            components = [sorted(c, key=str) for c in nx.connected_components(subG)]
            components.sort(key=lambda comp: [str(member) for member in comp])
            for parent_idx in components:
                parent_txt = ['; '.join(G.nodes[member]['text']) for member in parent_idx]
                out.write('%s\t%s\t%s\t%s\n' % (n, text, json.dumps(parent_idx),
                                                 json.dumps(parent_txt, ensure_ascii=False)))

In [43]:
# Export the depth-5+ leaves of the undirected noun and verb graphs to TSV.
to_text_component(
    G=G_full_noun,
    out_file='tt_ruthes_leaf_depth5_nouns_components_semicolon2.tsv',
    nodes=leaf_d5_noun,
)
to_text_component(
    G=G_full_verb,
    out_file='tt_ruthes_leaf_depth5_verbs_components_semicolon2.tsv',
    nodes=leaf_d5_verb,
)