## Lemma to frame relationship

In [18]:
import stats_utils
from collections import defaultdict, Counter
import pandas
import operator

In [19]:
# Load FrameNet 1.7 through the project helper. The cells below only use
# `fn.frames()`, `frame.name` and `frame.lexUnit` on the returned object.
# NOTE(review): presumably an NLTK-style FrameNet reader -- confirm in stats_utils.
fn = stats_utils.load_framenet(version='1.7')

In [20]:
# When True, entries are keyed by (lemma, pos) tuples; when False, by the bare
# lemma string. NOTE: the part-of-speech cell further down unpacks
# `lemma, pos` from the keys of lemma2frames, so it requires with_pos = True.
with_pos = True

# lemma (or (lemma, pos)) -> set of frame names that list it as a lexical unit
lemma2frames = defaultdict(set)
# frame name -> set of lemmas (or (lemma, pos) pairs) among its lexical units
frame2lemmas = defaultdict(set)

for frame in fn.frames():
    frame_label = frame.name
    for lu in frame.lexUnit.keys():
        # Lexical-unit keys have the form 'lemma.pos'. Split on the LAST period
        # only: a plain split('.') yields more than two parts (and the unpacking
        # raises ValueError) as soon as the lemma itself contains a period.
        lemma, pos = lu.rsplit('.', 1)

        # One key for both mappings keeps the two indexes consistent.
        key = (lemma, pos) if with_pos else lemma
        lemma2frames[key].add(frame_label)
        frame2lemmas[frame_label].add(key)

In [4]:
# Polysemy of each entry: the number of frames attached to every key of lemma2frames.
fn_polysemy = list(map(len, lemma2frames.values()))

In [5]:
# Print the most polysemous entries, i.e. those attached to the largest number
# of frames. The threshold is derived from the data rather than hard-coded to
# 11, so the cell still shows the top group if `with_pos` or the FrameNet
# version changes. `default=0` keeps the cell silent on an empty mapping.
max_frames = max(map(len, lemma2frames.values()), default=0)

for lemma, frames in lemma2frames.items():
    if len(frames) == max_frames:
        print(lemma, frames)

('hit', 'v') {'Cause_motion', 'Experience_bodily_harm', 'Eventive_affecting', 'Hit_or_miss', 'Impact', 'Cognitive_impact', 'Attack', 'Cause_harm', 'Arriving', 'Cause_impact', 'Hit_target'}
('strike', 'v') {'Coming_to_believe', 'Erasing', 'Eventive_affecting', 'Impact', 'Cognitive_impact', 'Attack', 'Cause_harm', 'Cause_impact', 'Light_movement', 'Political_actions', 'Be_in_agreement_on_action'}


In [6]:
# Largest number of frames attached to a single entry of lemma2frames.
max(fn_polysemy)

11

In [7]:
# Mean number of frames per entry, printed next to the number of entries.
total_frame_links = sum(fn_polysemy)
average_polysemy = total_frame_links / len(fn_polysemy)
print(len(lemma2frames), average_polysemy)

10462 1.297266297075129


In [8]:
# Histogram: number of frames per entry -> how many entries have that many frames.
Counter(fn_polysemy)

Counter({3: 372,
         7: 15,
         1: 8475,
         4: 125,
         2: 1371,
         6: 24,
         5: 57,
         9: 7,
         8: 10,
         10: 4,
         11: 2})

In [9]:
# Collect the part-of-speech tag of every key and tally the tags.
# The unpacking relies on the keys being (lemma, pos) pairs, i.e. with_pos = True.
distribution = [pos_tag for _lemma, pos_tag in lemma2frames.keys()]
counts = Counter(distribution)

In [10]:
# Build and print a LaTeX table with the share of entries per part of speech,
# most frequent first. The PropBank column holds hard-coded figures.
lists_of_lists = []
headers = ['Part of speech', 'Framenet 1.7', 'PropBank 3.1']

total = sum(counts.values())

for pos, freq in sorted(counts.items(),
                        key=operator.itemgetter(1),
                        reverse=True):

    perc = 100 * (freq / total)
    # Always show two decimals: round() inside an f-string drops trailing
    # zeros and produced '2.1%' next to '44.87%'. The thousands separator
    # matches the style of the hard-coded PropBank figure below.
    value = f'{perc:.2f}% ({freq:,})'

    # Only the verb row carries a PropBank figure; every other row shows '-'.
    if pos == 'v':
        one_row = [pos, value, '100% (7,311)']
    else:
        one_row = [pos, value, '-']

    lists_of_lists.append(one_row)

df = pandas.DataFrame(lists_of_lists, columns=headers)
print(df.to_latex(index=False))


\begin{tabular}{lll}
\toprule
Part of speech &   Framenet 1.7 &  PropBank 3.1 \\
\midrule
             n &  44.87\% (4694) &             - \\
             v &  31.71\% (3318) &  100\% (7,311) \\
             a &  19.52\% (2042) &             - \\
           adv &     2.1\% (220) &             - \\
          prep &     0.95\% (99) &             - \\
           num &      0.3\% (31) &             - \\
          idio &     0.28\% (29) &             - \\
          scon &     0.11\% (12) &             - \\
           art &      0.06\% (6) &             - \\
          intj &      0.05\% (5) &             - \\
             c &      0.05\% (5) &             - \\
          pron &      0.01\% (1) &             - \\
\bottomrule
\end{tabular}



In [11]:
# Size of each frame's lemma set, then how many frames share each size.
variance = list(map(len, frame2lemmas.values()))
counts = Counter(variance)

In [12]:
# Build and print a LaTeX table with the share of frames per variance class
# (number of lemmas in the frame), most frequent class first. The PropBank
# column holds a hard-coded figure.
lists_of_lists = []
headers = ['Variance class', 'Framenet 1.7', 'PropBank 3.1']

total = sum(counts.values())

for freq_class, freq in sorted(counts.items(),
                        key=operator.itemgetter(1),
                        reverse=True):

    perc = 100 * (freq / total)
    # Always show two decimals: round() inside an f-string drops trailing
    # zeros and produced '11.0%' next to '8.01%'. The thousands separator
    # matches the style of the hard-coded PropBank figure below.
    value = f'{perc:.2f}% ({freq:,})'

    # Only class 1 carries a PropBank figure; every other row shows '-'.
    if freq_class == 1:
        one_row = [freq_class, value, '100% (10,672)']
    else:
        one_row = [freq_class, value, '-']

    lists_of_lists.append(one_row)

df = pandas.DataFrame(lists_of_lists, columns=headers)
print(df.to_latex(index=False))

\begin{tabular}{rll}
\toprule
 Variance class & Framenet 1.7 &   PropBank 3.1 \\
\midrule
              3 &  11.0\% (118) &              - \\
              2 &  10.9\% (117) &              - \\
              4 &   8.01\% (86) &              - \\
              5 &   7.36\% (79) &              - \\
              6 &   6.71\% (72) &              - \\
              1 &   5.59\% (60) &  100\% (10,672) \\
              7 &   4.75\% (51) &              - \\
              8 &   4.29\% (46) &              - \\
              9 &   3.73\% (40) &              - \\
             10 &   3.63\% (39) &              - \\
             12 &   3.26\% (35) &              - \\
             11 &    2.8\% (30) &              - \\
             13 &    2.7\% (29) &              - \\
             14 &   2.52\% (27) &              - \\
             16 &   2.14\% (23) &              - \\
             15 &   1.49\% (16) &              - \\
             18 &   1.21\% (13) &              - \\
             26 &   1.21\