In [1]:
import snap
from collections import Counter
import pandas

In [2]:
import wn_utils
import snap_utils
import utils

# Discussion points
* Representation of WordNet (undirected vs. directed)
* Semantic relations to include
* Implementation:
    * introduce a fake root for verbs
    * I assume for now that the relation between two synsets is one-to-one

#### Representation of WordNet
For now, I follow [UKB](http://aclweb.org/anthology/J14-1003) and represent WordNet as an undirected graph, based on this passage from the paper (Section 3.1):
```
We chose to use undirected relations between concepts, because most of the relations are symmetric
and have their inverse counterpart (cf. Section 3), and in preliminary work we failed to
see any effect using directed relations.
```

In [3]:
# Load the two WordNet-derived inputs that are passed to
# snap_utils.distr_mfs_pr_rank in the experiment cell below.
# NOTE(review): judging by its name, `lemmapos2mfs_offset` presumably maps each
# lemma/pos pair to the synset offset of its most frequent sense -- confirm in wn_utils.
lemmapos2mfs_offset = wn_utils.get_lemmapos2mfs_offset()
# The candidates are derived from that mapping; their exact structure is defined in wn_utils.
the_candidates = wn_utils.get_candidates(lemmapos2mfs_offset)

In [4]:
# Relation categories used to build each graph; every entry produces one table row.
combinations = [
    {'hyponym_hypernym'},
    {'meronym_holonym'},
    {'others'},
    {'hyponym_hypernym', 'meronym_holonym', 'others'}
]
perc = True  # report percentages instead of raw counts
keys = [1, 2, 3, '4>']  # rank buckets; '4>' collects every rank above 3

lists_of_lists = []
# The rank columns are derived from `keys` so the header and the row values
# cannot drift apart when the buckets are changed.
headers = ['categories', '#nodes', '#edges', '#lemma_pos'] + keys

for categories in combinations:
    edges = wn_utils.load_wn_edges(categories=categories)
    num_edges = len(edges)

    mfs_pr_values, num_nodes = snap_utils.distr_mfs_pr_rank(edges,
                                                            lemmapos2mfs_offset,
                                                            the_candidates)
    num_lemma_pos = len(mfs_pr_values)

    # Collapse every rank above 3 into the single '4>' bucket.
    table_mfs_pr_values = [number if number <= 3 else '4>'
                           for number in mfs_pr_values]
    distribution = Counter(table_mfs_pr_values)
    values = [distribution[key] for key in keys]

    if perc:
        total = sum(distribution.values())
        if total:
            values = [round((100 * (float(value) / float(total))), 2)
                      for value in values]
        else:
            # No ranks were returned for this graph: report 0.0 for every bucket
            # instead of failing with a ZeroDivisionError.
            values = [0.0] * len(keys)

    one_row = [categories, num_nodes, num_edges, num_lemma_pos] + values
    lists_of_lists.append(one_row)

In [5]:
# Assemble the collected rows into one table; each row holds one value per entry in `headers`.
df = pandas.DataFrame(lists_of_lists, columns=headers)

## Relation between PageRank and the most frequent sense
For each lemma, pos combination, e.g. the **noun horse**, WordNet defines a dominant sense (the most frequent sense, MFS), 
which is determined partly by corpus frequency and partly manually.

The table below shows how the sense ranking in WordNet relates to the ranking according to PageRank.
The meaning of each column is as follows:
* **categories**: the relations used to build the undirected graph.
* **#nodes**: number of nodes in the graph (nodes are only in the graph if they are linked by a relation from categories)
* **#edges**: number of edges in the graph (nodes are only in the graph if they are linked by a relation from categories)
* **#lemma_pos**: number of lemma, pos combinations for which the PageRank rank of the MFS was computed
* **1**: for all non-monosemous lemma, pos combinations, the % for which the MFS is ranked **1st according to PageRank**
* **2**: for all non-monosemous lemma, pos combinations, the % for which the MFS is ranked **2nd according to PageRank**
* **3**: for all non-monosemous lemma, pos combinations, the % for which the MFS is ranked **3rd according to PageRank**
* **4>**: for all non-monosemous lemma, pos combinations, the % for which the MFS is ranked **4th or lower (i.e. rank 4 or above) according to PageRank**


In [6]:
# Display the summary table (one row per relation-category combination).
df

Unnamed: 0,categories,#nodes,#edges,#lemma_pos,1,2,3,4>
0,{hyponym_hypernym},95585,97666,17730,47.81,36.23,9.46,6.5
1,{meronym_holonym},23153,22187,17730,61.18,24.46,7.2,7.16
2,{others},15976,14247,17730,65.35,20.76,7.23,6.66
3,"{meronym_holonym, hyponym_hypernym, others}",108908,134076,17730,45.66,39.1,9.35,5.89
