In [1]:
import pandas as pd

## Helper functions

The following function does a pre-processing of the file in `.cupt` format and convert the data into a `pandas DataFrame`. Each line of the resulting dataframe is a token from a sentence in the original dataset. Each line also contains the `id` of the original sentence so we can keep track of which line belons to which sentence.

In [2]:
def process_cup(text):

    id_sent = None
    features = []

    for line in text:

        if line is '\n':
            id_sent = None
        elif line.startswith('# source_sent_id'):
            tokens = line.split()
            id_sent = tokens[-1]
        elif not line.startswith('#'):
            feats = line.split()
            feats_dict = {
                'id_sent': id_sent,
                'id': feats[0],
                'form': feats[1],
                'lemma': feats[2],
                'upos': feats[3],
                'xpos': feats[4],
                'feats': feats[5],
                'head': feats[6],
                'deprel': feats[7],
                'deps': feats[8],
                'misc': feats[9],
                'mwe': feats[10]
            }
            features.append(feats_dict)

    return pd.DataFrame(features)

The following function extracts the individual labels from the sentences from files in in the `c.upt` format into a `python list`. 

For example:

|- `'1:IRV'` yield `['IRV']`

|- `'1:IRV;2:VPC.full'` yield `['IRV', 'VPC.full']`

|- `'1;VPC.full'` yield `['1', 'VPC.full']`

|- `'1;2'` yield `['1', '2']`

|- `'*'` yield `['*']`

In [3]:
def proc_label(string):
    if string is '*':
        return [string]
    
    labels = []
    if ';' in string and ':' in string:
        label = string.split(';')
        for l in label:            
            labels.append(l.split(':')[-1])
    
    elif ';' in string and ':' not in string:
        l = string.split(';')
        if l[-1] in ['1', '2', '3', '4', '5']:
            labels += [i for i in l]
    
    elif ';' not in string and ':' in string:
        labels.append(string.split(':')[-1])
    
    elif string in ['1', '2', '3', '4', '5']:
        return [string]
    
    return labels

This function will build the dataset with the respective labels.

In [4]:
def build_dataset(text):
    examples = []
    flag = False
    example = ''
    for line in text:
        if line is '\n':     # if it is an empty line, we reset everything
            label = 1 if flag else 0
            examples.append((example.strip(), label))
            example = ''
            flag = False

        elif not line.startswith('#'):     # if it is not a line of metadata
            feats = line.split()
            example += ' ' + feats[3]
            if feats[10] is not '*' and flag == False:
                flag = True

    return examples

In [5]:
def build_per_word_dataset(text):
    examples = []
    example = ''
    labels = ''
    for line in text:
        if line is '\n':     # if it is an empty line, we reset everything            
            examples.append((example.strip(), labels.strip()))
            example = ''
            labels = ''

        elif not line.startswith('#'):     # if it is not a line of metadata
            feats = line.split()
            example += ' ' + feats[3]
            label = ' 0' if feats[10] is '*' else ' 1'
            labels +=  label

    return examples

## German

In [6]:
german = process_cup(open('data/DE/train.cupt'))
german.head(20)

Unnamed: 0,id_sent,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,mwe
0,newscrawl-2153,1,Peter,Peter,PROPN,NE,Case=Nom|Gender=Masc|Number=Sing,23,nsubj,_,_,*
1,newscrawl-2153,2,Hermann,Hermann,PROPN,NE,Case=Nom|Gender=Masc|Number=Sing,1,flat,_,SpaceAfter=No,*
2,newscrawl-2153,3,",",",",PUNCT,"$,",_,4,punct,_,_,*
3,newscrawl-2153,4,Heynckes,Heynckes,PROPN,NE,Case=Nom|Gender=Masc|Number=Sing,6,nmod,_,_,*
4,newscrawl-2153,5,',',PUNCT,$(,_,4,punct,_,_,*
5,newscrawl-2153,6,Co-Trainer,Co-Trainer,NOUN,NE,Case=Nom|Gender=Masc|Number=Sing,1,flat,_,_,*
6,newscrawl-2153,7,beim,beim,PROPN,NE,Case=Nom|Gender=Masc|Number=Sing,1,flat,_,_,*
7,newscrawl-2153,8,FC,FC,PROPN,NN,Case=Nom|Gender=Masc|Number=Sing,1,flat,_,_,*
8,newscrawl-2153,9,Bayern,Bayern,PROPN,NE,Case=Nom|Gender=Neut|Number=Sing,1,flat,_,SpaceAfter=No,*
9,newscrawl-2153,10,",",",",PUNCT,"$,",_,1,punct,_,_,*


In [6]:
german.groupby('mwe').size()

mwe
*                        121299
1                          2235
1:IRV                       198
1:IRV;2:LVC.full              1
1:IRV;2:VPC.full             17
1:LVC.cause                  20
1:LVC.full                  180
1:LVC.full;2:VPC.full         3
1:VID                       873
1:VID;2:VID                   2
1:VID;2:VPC.full             18
1:VPC.full                 1034
1:VPC.semi                  100
1;2                          47
1;2:IRV                       1
1;2:VID                       4
1;2:VPC.full                 27
1;2:VPC.semi                  8
1;3                           1
1;3:IRV                       1
2                           300
2:IRV                        25
2:IRV;3:VPC.full              1
2:LVC.cause                   4
2:LVC.full                   26
2:VID                       122
2:VID;3:VPC.full              1
2:VPC.full                  155
2:VPC.semi                   26
2;3                           4
2;3:VID                       3
2;3:

## Irish

In [7]:
irish = process_cup(open('data/GA/train.cupt'))
irish.head(20)

(' VERB ADP NOUN PART VERB NOUN SCONJ ADP PRON ADJ CCONJ DET NOUN DET NOUN ADP NOUN NOUN PART NOUN PUNCT',
 1)

(' VERB ADP NOUN PART VERB NOUN SCONJ ADP PRON ADJ CCONJ DET NOUN DET NOUN ADP NOUN NOUN PART NOUN PUNCT',
 1)

In [8]:
irish.groupby('mwe').size()

mwe
*                        6007
1                         105
1:IAV                      15
1:LVC.cause                16
1:LVC.full                 26
1:LVC.full;2:LVC.full       1
1:VID                      13
1:VPC.full                  1
1:VPC.semi                  1
2                          25
2:IAV                       8
2:LVC.cause                 5
2:LVC.full                  6
2:VID                       1
2:VPC.full                  1
2:VPC.semi                  1
3                           6
3:IAV                       1
3:LVC.cause                 2
3:LVC.full                  1
dtype: int64

## Hindi

In [9]:
hindi = process_cup(open('data/HI/train.cupt'))
hindi.head(20)

Unnamed: 0,id_sent,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,mwe
0,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,1,दूसरी,दूसरा,ADJ,QO,Case=Nom|Gender=Fem|Number=Sing|NumType=Ord,2,amod,_,_,*
1,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,2,तरफ,तरफ,ADV,NST,AdpType=Post|Case=Nom|Gender=Fem|Number=Sing|P...,9,obl,_,_,*
2,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,3,कश्मीर,कश्मीर,PROPN,NNP,Case=Acc|Gender=Masc|Number=Sing|Person=3,9,obl,_,_,*
3,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,4,में,में,ADP,PSP,AdpType=Post,3,case,_,_,*
4,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,5,मुस्लिमों,मुस्लिम,NOUN,NN,Case=Acc|Gender=Masc|Number=Plur|Person=3,7,nmod,_,_,*
5,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,6,की,का,ADP,PSP,AdpType=Post|Case=Nom|Gender=Fem|Number=Sing,5,case,_,_,*
6,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,7,संख्या,संख्या,NOUN,NN,Case=Nom|Gender=Fem|Number=Sing|Person=3,9,nsubj,_,_,*
7,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,8,ज्यादा,ज्यादा,DET,QF,PronType=Ind,9,compound,_,_,*
8,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,9,होने,हो,VERB,VM,Case=Acc|VerbForm=Inf,13,advcl,_,_,*
9,autogen--hi-ud-test-withLVCs-1-500_ab.parsemet...,10,के,के,ADP,PSP,AdpType=Post,9,mark,_,_,*


In [10]:
hindi.groupby('mwe').size()

mwe
*               5407
1                145
1:LVC.cause        2
1:LVC.full        84
1:MVC             40
1:VID             11
1;2:LVC.full       1
1;2:MVC            6
2                 31
2:LVC.cause        1
2:LVC.full        22
2:MVC              2
2;3:MVC            3
3                  5
3:LVC.full         1
3:MVC              1
4                  1
4:LVC.full         1
dtype: int64

## Brazilian-Portuguese

In [11]:
portuguese = process_cup(open('data/PT/train.cupt'))
portuguese.head(20)

Unnamed: 0,id_sent,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,mwe
0,diario_gaucho_9315,1,Iarley,Iarley,PROPN,_,Gender=Masc|Number=Sing,12,nsubj,_,_,*
1,diario_gaucho_9315,2,Goleador,Goleador,PROPN,_,Number=Sing,1,flat:name,_,_,*
2,diario_gaucho_9315,3-4,do,_,_,_,_,_,_,_,_,*
3,diario_gaucho_9315,3,de,de,ADP,_,_,5,case,_,_,*
4,diario_gaucho_9315,4,o,o,DET,_,Definite=Def|Gender=Masc|Number=Sing|PronType=Art,5,det,_,_,*
5,diario_gaucho_9315,5,Internacional,Internacional,PROPN,_,Gender=Masc|Number=Sing,1,nmod,_,_,*
6,diario_gaucho_9315,6,e,e,CCONJ,_,_,9,cc,_,_,*
7,diario_gaucho_9315,7-8,do,_,_,_,_,_,_,_,_,*
8,diario_gaucho_9315,7,de,de,ADP,_,_,9,case,_,_,*
9,diario_gaucho_9315,8,o,o,DET,_,Definite=Def|Gender=Masc|Number=Sing|PronType=Art,9,det,_,_,*


In [12]:
portuguese.groupby('mwe').size()

mwe
*                                              532044
1                                                5104
1:IRV                                             690
1:IRV;2:VID                                         4
1:LVC.cause                                        83
1:LVC.cause;2:LVC.cause;3:LVC.cause                 1
1:LVC.full                                       2582
1:LVC.full;2:LVC.full                              47
1:LVC.full;2:LVC.full;3:LVC.full                    2
1:LVC.full;2:LVC.full;3:LVC.full;4:LVC.full         1
1:LVC.full;2:VID                                    1
1:MVC                                              11
1:VID                                             827
1:VID;2:LVC.full                                    1
1:VID;2:VID                                         3
1;2                                                31
1;2:IRV                                             2
1;2:LVC.full                                       10
1;2:VID                 

## Chinese

In [13]:
chinese = process_cup(open('data/ZH/train.cupt'))
chinese.head(20)

Unnamed: 0,id_sent,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,mwe
0,conll2017-crawl-000-28464,1,其病,其病,NOUN,NN,_,3,nsubj,_,_,*
1,conll2017-crawl-000-28464,2,搖動,搖動,ADV,RB,_,3,advmod,_,_,*
2,conll2017-crawl-000-28464,3,注恐,注恐,VERB,VV,_,0,root,_,_,*
3,conll2017-crawl-000-28464,4,",",",",PUNCT,",",_,9,punct,_,_,*
4,conll2017-crawl-000-28464,5,從,從,ADP,IN,_,6,case,_,_,*
5,conll2017-crawl-000-28464,6,金化,金化,NOUN,NN,_,9,nmod,_,_,*
6,conll2017-crawl-000-28464,7,也,也,ADV,RB,_,9,mark,_,_,*
7,conll2017-crawl-000-28464,8,",",",",PUNCT,",",_,9,punct,_,_,*
8,conll2017-crawl-000-28464,9,少角,少角,NOUN,NN,_,3,obj,_,_,*
9,conll2017-crawl-000-28464,10,與,與,CCONJ,CC,_,11,cc,_,_,*


In [14]:
chinese.groupby('mwe').size()

mwe
*                    563229
1                      3490
1:LVC.cause              90
1:LVC.cause;2:MVC         1
1:LVC.full              689
                      ...  
5:VPC.semi                5
6                         1
6:VPC.semi                3
7                         1
7:VPC.semi                1
Length: 61, dtype: int64

## The resulting dataset

In [15]:
df = pd.concat([german, irish, hindi, portuguese, chinese])
df.head(50)

Unnamed: 0,id_sent,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,mwe
0,newscrawl-2153,1,Peter,Peter,PROPN,NE,Case=Nom|Gender=Masc|Number=Sing,23,nsubj,_,_,*
1,newscrawl-2153,2,Hermann,Hermann,PROPN,NE,Case=Nom|Gender=Masc|Number=Sing,1,flat,_,SpaceAfter=No,*
2,newscrawl-2153,3,",",",",PUNCT,"$,",_,4,punct,_,_,*
3,newscrawl-2153,4,Heynckes,Heynckes,PROPN,NE,Case=Nom|Gender=Masc|Number=Sing,6,nmod,_,_,*
4,newscrawl-2153,5,',',PUNCT,$(,_,4,punct,_,_,*
5,newscrawl-2153,6,Co-Trainer,Co-Trainer,NOUN,NE,Case=Nom|Gender=Masc|Number=Sing,1,flat,_,_,*
6,newscrawl-2153,7,beim,beim,PROPN,NE,Case=Nom|Gender=Masc|Number=Sing,1,flat,_,_,*
7,newscrawl-2153,8,FC,FC,PROPN,NN,Case=Nom|Gender=Masc|Number=Sing,1,flat,_,_,*
8,newscrawl-2153,9,Bayern,Bayern,PROPN,NE,Case=Nom|Gender=Neut|Number=Sing,1,flat,_,SpaceAfter=No,*
9,newscrawl-2153,10,",",",",PUNCT,"$,",_,1,punct,_,_,*


In [16]:
df.tail(50)

Unnamed: 0,id_sent,id,form,lemma,upos,xpos,feats,head,deprel,deps,misc,mwe
575540,conll2017-crawl-000-10080,16,什麼,什麼,PRON,WP,_,0,root,_,_,*
575541,conll2017-crawl-000-10080,17,?,?,PUNCT,",",_,16,punct,_,_,*
575542,conll2017-crawl-000-10719,1,Hot,Hot,X,FW,_,10,nsubj,_,_,*
575543,conll2017-crawl-000-10719,2,Hill,Hill,X,FW,_,1,flat:foreign,_,_,*
575544,conll2017-crawl-000-10719,3,(,(,PUNCT,",",_,10,punct,_,_,*
575545,conll2017-crawl-000-10719,4,提納卡納,提納卡納,VERB,VV,_,10,advcl,_,_,*
575546,conll2017-crawl-000-10719,5,),),PUNCT,",",_,7,punct,_,_,*
575547,conll2017-crawl-000-10719,6,-,-,PUNCT,HYPH,_,7,punct,_,_,*
575548,conll2017-crawl-000-10719,7,餐廳,餐廳,NOUN,NN,_,10,conj,_,_,*
575549,conll2017-crawl-000-10719,8,/,/,SYM,/,_,10,punct,_,_,*


### Statistics

The number of unique sentences in the resulting dataset:

In [17]:
len(df['id_sent'].unique())

65555

In [18]:
(df.groupby('id_sent').size() == 1).sum()

224

Number of unique (universal) parts-of-speech in the dataset:

In [19]:
print(len(df['upos'].unique()))
df['upos'].unique()

18


array(['PROPN', 'PUNCT', 'NOUN', 'AUX', 'ADP', 'DET', 'ADJ', 'VERB',
       'PRON', 'NUM', 'PART', 'ADV', '_', 'SCONJ', 'CCONJ', 'SYM', 'X',
       'INTJ'], dtype=object)

Frequency of each (universal) part-of-speech in the dataset.

In [20]:
df.groupby('upos').size()

upos
ADJ       43527
ADP      109526
ADV       49074
AUX       29791
CCONJ     23586
DET       96013
INTJ        121
NOUN     275883
NUM       50371
PART      57361
PRON      29616
PROPN    104982
PUNCT    173710
SCONJ      7531
SYM        5866
VERB     135292
X         28789
_         35884
dtype: int64

This pre-processing step takes a while. I'm not aware of a better/more elegant way of doing it.

In [21]:
files = [
    'DE/train.cupt',
    'GA/train.cupt',
    'HI/train.cupt',
    'PT/train.cupt',
    'ZH/train.cupt'
]

data = []

for file in files:
    data += build_dataset(open('data/' + file))


In [22]:
len(data)

66338

In [23]:
print(german.groupby('id_sent').size())
print(len(german['id_sent'].unique()))


gdata = build_dataset(open('data/' + files[0]))
print(len(gdata))


id_sent
newscrawl-1       22
newscrawl-10      11
newscrawl-1000    10
newscrawl-1001    22
newscrawl-1002    22
                  ..
train-s2995       21
train-s2996       25
train-s2997       14
train-s2998       24
train-s3000       12
Length: 6568, dtype: int64
6568
6568


In [24]:
print(irish.groupby('id_sent').size())
print(len(irish['id_sent'].unique()))


idata = build_dataset(open('data/' + files[1]))
print(len(idata))


id_sent
1006    16
1009    41
1022    10
1041    16
1042    10
        ..
985     19
989     16
99      20
992     37
995     17
Length: 257, dtype: int64
257
257


In [25]:
print(hindi.groupby('id_sent').size())
print(len(hindi['id_sent'].unique()))


hdata = build_dataset(open('data/' + files[2]))
print(len(hdata))

id_sent
autogen--hi-ud-test-withLVCs-1-500_ab.parsemetsv-pos.folia.xml--104    12
autogen--hi-ud-test-withLVCs-1-500_ab.parsemetsv-pos.folia.xml--106    11
autogen--hi-ud-test-withLVCs-1-500_ab.parsemetsv-pos.folia.xml--114    15
autogen--hi-ud-test-withLVCs-1-500_ab.parsemetsv-pos.folia.xml--12     13
autogen--hi-ud-test-withLVCs-1-500_ab.parsemetsv-pos.folia.xml--124    29
                                                                       ..
autogen--hi-ud-test-withLVCs501-1000_ab.folia.xml--86                  19
autogen--hi-ud-test-withLVCs501-1000_ab.folia.xml--87                  18
autogen--hi-ud-test-withLVCs501-1000_ab.folia.xml--88                   9
autogen--hi-ud-test-withLVCs501-1000_ab.folia.xml--97                  17
autogen--hi-ud-test-withLVCs501-1000_ab.folia.xml--99                  21
Length: 282, dtype: int64
282
282


In [26]:
print(portuguese.groupby('id_sent').size())
print(len(portuguese['id_sent'].unique()))


pdata = build_dataset(open('data/' + files[3]))
print(len(pdata))

id_sent
CF1-1          5
CF1-4          8
CF1-6         13
CF1-7         14
CF1-8         19
              ..
train-s995    11
train-s996    48
train-s997    38
train-s998    25
train-s999    12
Length: 23905, dtype: int64
23905
23905


In [27]:
print(chinese.groupby('id_sent').size())
print(len(chinese['id_sent'].unique()))


cdata = build_dataset(open('data/' + files[4]))
print(len(cdata))

id_sent
conll2017-crawl-000-0       16
conll2017-crawl-000-1        8
conll2017-crawl-000-10       7
conll2017-crawl-000-100      6
conll2017-crawl-000-1000     5
                            ..
ud-pud-test-994             47
ud-pud-test-995             41
ud-pud-test-996             19
ud-pud-test-997             23
ud-pud-test-999             35
Length: 35326, dtype: int64
35326
35326


In [28]:
cdata[0]

('NOUN ADV VERB PUNCT ADP NOUN ADV PUNCT NOUN CCONJ NOUN ADP PUNCT NOUN CCONJ PART NOUN ADP PUNCT NOUN CCONJ PART NOUN ADP PUNCT',
 0)

In [38]:
idata = build_per_word_dataset(open('data/' + files[1]))
print(len(idata))

257


In [39]:
idata[0]

('VERB ADP NOUN PART VERB NOUN SCONJ ADP PRON ADJ CCONJ DET NOUN DET NOUN ADP NOUN NOUN PART NOUN PUNCT',
 '1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0')