In [18]:
import pandas as pd
import numpy as np

In [19]:
%matplotlib inline
import matplotlib.pyplot as plt

In [20]:
from stats import children_per_branch
from conll import parse_tree_conll

In [21]:
# Load the per-relation statistics table. Judging from the preview, each row
# holds a governor POS (GOV), a relation (REL), a dependent POS (DEP) and a
# free-text STATS field.
rel_stats_csv = 'data/FTB_rel_stats.csv'
data = pd.read_csv(rel_stats_csv)
data.head()

Unnamed: 0,#,GOV,REL,DEP,STATS
0,1,NOUN,acl,VERB,5819; 80% instances
1,2,PRON,acl,VERB,317; 4% instances
2,3,NOUN,acl,ADJ,275; 4% instances
3,4,PROPN,acl,VERB,239; 3% instances
4,5,ADJ,acl,VERB,183; 3% instances


In [22]:
# Split and clean the STATS column
# A STATS value looks like "5819; 80% instances": splitting on spaces yields
# the count (with a trailing ';'), the percentage (with a trailing '%') and
# the literal word "instances" in columns 0, 1 and 2.
stats = data['STATS'].str.split(' ', expand=True)
# Use the vectorised .str accessor instead of a Python-level lambda: it is the
# idiomatic pandas form and passes missing values through instead of raising.
stats[0] = stats[0].str.rstrip(';')
stats[1] = stats[1].str.rstrip('%')

In [23]:
# Replace STATS with the split data
data['INSTANCES_N'] = stats[0].astype(int)
# Share of each (GOV, DEP) pair within its relation, in percent.
# `transform('sum')` broadcasts the per-relation total back onto the original
# row index, so the division stays aligned with `data`. A groupby `apply`
# returning a like-indexed Series gets the group keys prepended to its index
# in pandas >= 2.0 and can then no longer be assigned as a column.
data['INSTANCES_%'] = data['INSTANCES_N'] / data.groupby('REL')['INSTANCES_N'].transform('sum') * 100
# Rebind instead of dropping in place, and tolerate an already-removed column,
# so that re-running this cell does not raise a KeyError.
data = data.drop(columns='STATS', errors='ignore')
data.head()

Unnamed: 0,#,GOV,REL,DEP,INSTANCES_N,INSTANCES_%
0,1,NOUN,acl,VERB,5819,80.228871
1,2,PRON,acl,VERB,317,4.370605
2,3,NOUN,acl,ADJ,275,3.791535
3,4,PROPN,acl,VERB,239,3.295188
4,5,ADJ,acl,VERB,183,2.523094


In [176]:
# Some statistics per relation
# NOTE: only the last expression of a cell is rendered; the preceding lines are
# a catalogue of useful queries whose results are computed and then discarded.
by_rel = data.groupby('REL')  # build the grouping once and reuse it below
instances_by_rel = by_rel['INSTANCES_N']
by_rel.groups.keys() # Return the groups' names
by_rel.first() # Get the first entry for each group (the most common POS pair if rows are sorted by frequency within each relation)
by_rel.nth(list(range(3))) # Get the first n entries for each group
instances_by_rel.sum() # Get the sum of the instances per relation
instances_by_rel.count().sort_values(ascending=False) # Get the number of POS pairs (head:dep) per relation
by_rel[['INSTANCES_N']].count().sort_values('INSTANCES_N', ascending=False) # Same, but returns a DataFrame instead of a Series
# String aggregation names replace np.sum/np.mean/np.std: passing NumPy
# callables to `agg` is deprecated, and the resulting column labels are the same.
instances_by_rel.agg(['sum', 'mean', 'std']).head()

Unnamed: 0_level_0,INSTANCES_N
REL,Unnamed: 1_level_1
fixed,132
conj,105
dep,94
nmod,85
advmod,78
nsubj,71
obl,69
orphan,60
cc,54
mark,53


### Children per branch

In [28]:
# Parse the CoNLL-U file, then hand the parsed result to children_per_branch.
# NOTE(review): both helpers are project-local; their return types are not
# visible from this notebook.
path = "data/fr_ftb-ud-dev.conllu"
trees = parse_tree_conll(path)
children = children_per_branch(trees)

In [66]:
# Tabulate the counts and coerce the row labels to integers
ch = (
    pd.DataFrame(children)
    .reset_index()
    .assign(index=lambda frame: frame['index'].astype(str).astype(int))
    .set_index('index')
)
# Pad with empty rows so that every value from 0 up to the largest observed
# label is present, including those never seen in the corpus
new_index = pd.Index(range(ch.index.max() + 1))
# Transpose so that relations become the rows, and treat missing cells as 0
ch = ch.reindex(new_index).T.fillna(0)
ch.index.name = 'relations'
ch.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
relations,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
acl,52.0,213.0,130.0,60.0,25.0,17.0,5.0,4.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
acl:relcl,0.0,1.0,69.0,114.0,71.0,33.0,23.0,14.0,8.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
advcl,0.0,10.0,93.0,101.0,62.0,32.0,23.0,17.0,9.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
advmod,1150.0,297.0,98.0,36.0,2.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
amod,1438.0,167.0,38.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
# Show the column labelled 0 for every relation
ch.loc[:, 0]

In [155]:
# Number of POS pairs (head:dep) for each relation, i.e. how many rows carry
# each REL label. Set normalize to True to get relative frequencies instead.
rel_labels = data['REL']
rel_labels.value_counts(normalize=False)

fixed         132
conj          105
dep            94
nmod           85
advmod         78
nsubj          71
obl            69
orphan         60
cc             54
mark           53
acl:relcl      47
acl            47
case           40
advcl          40
obj            37
parataxis      37
xcomp          34
det            29
amod           27
ccomp          23
punct          22
iobj           17
nummod         15
root           15
flat:name      15
expl           12
cop            12
csubj           9
aux             9
appos           6
aux:pass        4
nsubj:caus      4
dislocated      1
flat            1
aux:caus        1
vocative        1
Name: REL, dtype: int64