# 1. IMPORTS

In [1]:
#data_propbankbr lives within datasets
import sys
sys.path.append('../datasets/')
sys.path.append('../datasets/scripts')

import re
import numpy as np
import pandas as pd 
from scripts.propbankbr import propbankbr_parser

# Columns that the checks below expect to be free of missing values
REQUIRED_COLUMNS = [
    'ID', 'S', 'P', 'P_S',
    'FORM', 'LEMMA', 'GPOS', 'MORF', 'CTREE',
    'PRED', 'ARG0',
]

In [2]:
# Load the data through the project parser ('1.0' is presumably the corpus
# version -- confirm in scripts/propbankbr). As the displayed header shows, the
# returned frame is wide: one column per argument slot, ARG0 to ARG6.
df = propbankbr_parser('1.0')
print(df.shape)
df.head()


(69122, 17)


Unnamed: 0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG0,ARG1,ARG2,ARG3,ARG4,ARG5,ARG6
0,1,1,0,0,Brasília,Brasília,PROP,F|S,(FCL(NP*),-,*,*,*,*,,,
1,2,1,0,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,(NP*,-,(A0*,*,*,*,,,
2,3,1,0,0,publicada,publicar,V-PCP,F|S,(ICL(VP*),-,*,*,*,*,,,
3,4,1,0,0,hoje,hoje,ADV,-,(ADVP*))),-,*),*,*,*,,,
4,5,1,1,1,revela,revelar,V-FIN,PR|3S|IND,(VP*),revelar,(V*),*,*,*,,,


In [3]:
# Last rows of the parsed frame, to eyeball the end of the data
df.tail()

Unnamed: 0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG0,ARG1,ARG2,ARG3,ARG4,ARG5,ARG6
69117,31,3308,5776,2,de,de,PRP,-,(PP*,-,*,*,,,,,
69118,32,3308,5776,2,J.R.Duran,J.R.Duran,PROP,M|S,(NP*,-,*,*,,,,,
69119,33,3308,5776,2,",",-,PU,-,*,-,*,*,,,,,
69120,34,3308,5776,2,Alexandra_Brochen,Alexandra_Brochen,PROP,F|S,(NP*)))),-,*,*,,,,,
69121,35,3308,5776,2,.,-,PU,-,*),-,*,*,,,,,


In [4]:
# Print one warning for every required column that contains a null
for label in REQUIRED_COLUMNS:
    if df[label].isnull().any():
        print('there are missing values on required fields')


In [6]:
# Spot check: rows of sentence 3165 (the one containing the token 'grevismo')
dftest = df.loc[df['S'] == 3165]
dftest.head(30)

Unnamed: 0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG0,ARG1,ARG2,ARG3,ARG4,ARG5,ARG6
66393,1,3165,5537,0,«,-,PU,-,(FCL(FCL*,-,*,,,,,,
66394,2,3165,5537,0,Se,se,CONJ-S,-,*,-,*,,,,,,
66395,3,3165,5537,0,for,ser,V-FIN,FUT|3S|SUBJ,(VP*),-,*,,,,,,
66396,4,3165,5537,0,uma,um,ART,F|S,(NP*,-,(A0*,,,,,,
66397,5,3165,5537,0,greve,greve,N,F|S,*,-,*,,,,,,
66398,6,3165,5537,0,só,só,ADV,-,(PP*,-,*,,,,,,
66399,7,3165,5537,0,por,por,PRP,-,*,-,*,,,,,,
66400,8,3165,5537,0,grevismo,grevismo,N,M|S,(NP*)))),-,*),,,,,,
66401,9,3165,5537,0,",",-,PU,-,*,-,*,,,,,,
66402,10,3165,5537,0,de,de,PRP,-,(ADVP*,-,*,,,,,,


 ### 1.1 Normalization
Convert each record into a machine learning example, i.e. one argument per row:
* Filter        .: take each ARG column on its own, keeping only the rows where it is set
* Create        .: a TMP column holding the number of the argument (for later ordering)
* Concatenate   .: stack the per-argument frames by rows
* Sort          .: reorder by S and TMP


In [7]:
def unstack_df(df):
    '''
        Unstack the per-argument output columns into a single ARG column.

        Every ARG<n> column found from ARG0 onwards contributes one copy of
        the feature columns, restricted to the rows where that column is not
        null, so the result carries one argument label per row. The argument
        columns are discovered from the frame instead of assuming exactly
        seven of them. Prints one progress line per argument column.

        args:
            df :DataFrame ConLL flat tree format; the columns to the left of
                ARG0 are treated as features and must include S

        returns:
            df :DataFrame with the feature columns plus ARG (the label) and
                TMP (1 + the number in the source column name), ordered by
                S then TMP and re-indexed from 0 under the index name INDEX

        raises:
            ValueError: if df has no ARG0 column
    '''
    columns = list(df.columns)
    column_arg0 = columns.index('ARG0')
    columns_features = columns[:column_arg0]
    df_feature = df.loc[:, columns_features].copy()

    # (argument number, column label) pairs, processed in numeric order
    arg_columns = sorted(
        (int(label[3:]), label)
        for label in columns[column_arg0:]
        if isinstance(label, str)
        and label.startswith('ARG')
        and label[3:].isdecimal()
    )

    dataframes = []
    num_records = 0
    for i, col in arg_columns:
        # Only the rows where this argument column is filled become examples
        df_target = df[col].dropna().to_frame()
        df_arg = df_feature.join(df_target, how='right')
        df_arg = df_arg.rename(columns={col: 'ARG'})
        df_arg['TMP'] = i + 1

        num_records += df_arg.shape[0]
        print('{:}-\tnew records: {:}\ttotal records: {:}'.format(i, df_arg.shape[0], num_records))
        dataframes.append(df_arg)

    # Concatenate, sort by sentence & argument, reindex.
    # The stable sort keeps the original row order inside each (S, TMP) group.
    df = pd.concat(dataframes, axis=0)
    df = df.sort_values(by=['S', 'TMP'], axis=0, kind='mergesort')
    df = df.reset_index(drop=True)
    df.index.names = ['INDEX']
    return df

In [8]:
# Replace the wide frame with its one-argument-per-row form; the printed lines
# count the rows contributed by each ARG column. This rebinds df, so the cell
# cannot be re-run on its own result (ARG0 no longer exists afterwards).
df = unstack_df(df)


0-	new records: 69122	total records: 69122
1-	new records: 40975	total records: 110097
2-	new records: 18598	total records: 128695
3-	new records: 6679	total records: 135374
4-	new records: 2254	total records: 137628
5-	new records: 572	total records: 138200
6-	new records: 178	total records: 138378


In [9]:
# First rows after unstacking: a single ARG column plus the TMP helper column
df.head()

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG,TMP
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,1,0,0,Brasília,Brasília,PROP,F|S,(FCL(NP*),-,*,1
1,2,1,0,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,(NP*,-,(A0*,1
2,3,1,0,0,publicada,publicar,V-PCP,F|S,(ICL(VP*),-,*,1
3,4,1,0,0,hoje,hoje,ADV,-,(ADVP*))),-,*),1
4,5,1,1,1,revela,revelar,V-FIN,PR|3S|IND,(VP*),revelar,(V*),1


In [10]:
# Same null check on the unstacked frame: the trailing ARG0 entry is swapped
# for the single ARG column, and the TMP helper column is checked as well
test_columns = REQUIRED_COLUMNS[:-1] + ['ARG', 'TMP']
for label in test_columns:
    if df[label].isnull().any():
        print('there are missing values on required fields')

In [11]:
# Spot check: rows of sentence 472 (the one containing the token 'no_entanto')
dftest = df.loc[df['S'] == 472]
dftest.head(30)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG,TMP
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
21491,1,472,870,1,Há,haver,V-FIN,PR|3S|IND,(FCL(VP*),haver,(V*),1
21492,2,472,870,1,uma,um,ART,F|S,(NP-*,-,(A1*,1
21493,3,472,870,1,pequena,pequeno,ADJ,F|S,*,-,*,1
21494,4,472,870,1,parcela,parcela,N,F|S,*),-,*),1
21495,5,472,870,1,",",-,PU,-,*,-,*,1
21496,6,472,870,1,no_entanto,no_entanto,ADV,-,(ADVP*),-,(AM-DIS*),1
21497,7,472,870,1,",",-,PU,-,*,-,*,1
21498,8,472,870,1,que,que,PRON-INDP,F|S,(-NP(FCL(NP*),-,(C-A1*,1
21499,9,472,870,1,é,ser,V-FIN,PR|3S|IND,(VP*),-,*,1
21500,10,472,870,1,catastrofista,catastrofista,ADJ,F|S,(ADJP*),-,*,1


 ### 1.2 FIXING EXISTING FEATURES 
 #### 1.2.1 FIXING PREDICATE
  * __PRED__ must be non-empty only when ARG is (V*)
  * Drop column __TMP__

In [12]:
def fix_pred(df):
    '''
        Keep PRED only on the rows where TMP equals P_S, then drop TMP.

        On every other row PRED is replaced by '-'. The input frame is left
        untouched; previously its PRED column was overwritten in place while
        the returned frame was a different object.

        args:
            df :DataFrame with columns TMP, P_S and PRED

        returns:
            df :DataFrame new frame with PRED fixed and without column TMP
    '''
    keep_pred = df['TMP'] == df['P_S']
    # where() keeps the value where the mask is True and writes '-' elsewhere
    fixed = df.assign(PRED=df['PRED'].where(keep_pred, '-'))
    return fixed.drop(labels='TMP', axis=1)

In [13]:
# Fix predicate: PRED is set to '-' on rows where TMP differs from P_S, and
# the TMP helper column is dropped from the returned frame
df = fix_pred(df)
df.head(30)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,1,0,0,Brasília,Brasília,PROP,F|S,(FCL(NP*),-,*
1,2,1,0,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,(NP*,-,(A0*
2,3,1,0,0,publicada,publicar,V-PCP,F|S,(ICL(VP*),-,*
3,4,1,0,0,hoje,hoje,ADV,-,(ADVP*))),-,*)
4,5,1,1,1,revela,revelar,V-FIN,PR|3S|IND,(VP*),revelar,(V*)
5,6,1,1,1,um,um,ART,M|S,(NP*,-,(A1*
6,7,1,1,1,dado,dado,N,M|S,*,-,*
7,8,1,1,1,supreendente,surpreendente,ADJ,M|S,(ADJP*),-,*
8,9,1,1,1,:,-,PU,-,*,-,*
9,10,1,2,2,recusando,recusar,V-GER,-,(FCL(ICL(VP*),-,*


In [14]:
# Spot check after fixing PRED: rows of sentence 472 (contains 'no_entanto')
dftest = df.loc[df['S'] == 472]
dftest.head(30)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
21491,1,472,870,1,Há,haver,V-FIN,PR|3S|IND,(FCL(VP*),haver,(V*)
21492,2,472,870,1,uma,um,ART,F|S,(NP-*,-,(A1*
21493,3,472,870,1,pequena,pequeno,ADJ,F|S,*,-,*
21494,4,472,870,1,parcela,parcela,N,F|S,*),-,*)
21495,5,472,870,1,",",-,PU,-,*,-,*
21496,6,472,870,1,no_entanto,no_entanto,ADV,-,(ADVP*),-,(AM-DIS*)
21497,7,472,870,1,",",-,PU,-,*,-,*
21498,8,472,870,1,que,que,PRON-INDP,F|S,(-NP(FCL(NP*),-,(C-A1*
21499,9,472,870,1,é,ser,V-FIN,PR|3S|IND,(VP*),-,*
21500,10,472,870,1,catastrofista,catastrofista,ADJ,F|S,(ADJP*),-,*


 #### 1.2.2 FIXING P
  __P__ becomes a sequential, range-like id over the predicates.

In [15]:
def fix_p(df):
    '''
        Renumber P and P_S from the row layout of each sentence.

        The rows of a sentence S, taken in frame order, are split into
        consecutive chunks whose length is the ID found on the last row of
        that sentence. P_S is the 0-based chunk number inside the sentence
        and P is a 1-based running number over all chunks, following the
        order in which sentences first appear in df.

        Values are computed per row, so they stay aligned with the frame
        whatever the order of the sentences. The previous version iterated
        over set(df['S']) and assigned the results positionally, which
        misplaced them whenever the set order differed from the row order.

        args:
            df :DataFrame with columns ID and S; columns P and P_S are
                written in place

        returns:
            df :DataFrame the same object that was passed in
    '''
    by_sentence = df.groupby('S', sort=False)

    # Position of each row inside its sentence, and the sentence chunk length
    position = by_sentence.cumcount()
    chunk_size = by_sentence['ID'].transform('last').astype(int)
    p_s = position // chunk_size

    # Chunks per sentence, in first-appearance order, give each sentence the
    # first P value it may use
    chunks = p_s.groupby(df['S'], sort=False).max() + 1
    first_p = chunks.cumsum() - chunks + 1

    # .values assigns positionally so that index labels play no part
    df['P'] = (df['S'].map(first_p) + p_s).values
    df['P_S'] = p_s.values
    return df

            
    

In [16]:
# Overwrite P and P_S with the values computed by fix_p, then inspect the result
df = fix_p(df)
df.head(150)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,1,1,0,Brasília,Brasília,PROP,F|S,(FCL(NP*),-,*
1,2,1,1,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,(NP*,-,(A0*
2,3,1,1,0,publicada,publicar,V-PCP,F|S,(ICL(VP*),-,*
3,4,1,1,0,hoje,hoje,ADV,-,(ADVP*))),-,*)
4,5,1,1,0,revela,revelar,V-FIN,PR|3S|IND,(VP*),revelar,(V*)
5,6,1,1,0,um,um,ART,M|S,(NP*,-,(A1*
6,7,1,1,0,dado,dado,N,M|S,*,-,*
7,8,1,1,0,supreendente,surpreendente,ADJ,M|S,(ADJP*),-,*
8,9,1,1,0,:,-,PU,-,*,-,*
9,10,1,1,0,recusando,recusar,V-GER,-,(FCL(ICL(VP*),-,*


In [17]:
# Spot check after renumbering P: rows of sentence 472 (contains 'no_entanto')
dftest = df.loc[df['S'] == 472]
dftest.head(90)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
21491,1,472,870,0,Há,haver,V-FIN,PR|3S|IND,(FCL(VP*),haver,(V*)
21492,2,472,870,0,uma,um,ART,F|S,(NP-*,-,(A1*
21493,3,472,870,0,pequena,pequeno,ADJ,F|S,*,-,*
21494,4,472,870,0,parcela,parcela,N,F|S,*),-,*)
21495,5,472,870,0,",",-,PU,-,*,-,*
21496,6,472,870,0,no_entanto,no_entanto,ADV,-,(ADVP*),-,(AM-DIS*)
21497,7,472,870,0,",",-,PU,-,*,-,*
21498,8,472,870,0,que,que,PRON-INDP,F|S,(-NP(FCL(NP*),-,(C-A1*
21499,9,472,870,0,é,ser,V-FIN,PR|3S|IND,(VP*),-,*
21500,10,472,870,0,catastrofista,catastrofista,ADJ,F|S,(ADJP*),-,*


In [18]:
# Show the index of the processed frame (named INDEX by unstack_df)
print(df.index)


RangeIndex(start=0, stop=138378, step=1, name='INDEX')


In [19]:
# Distinct labels present in the ARG column
print(set(df['ARG'].values))

{'(AM-MNR*', '(AM-CAU*', '(C-A0*', '(AM-LOC*', '(C-AM-LOC*', '(AM-LOC*)', '(AM-PNC*)', '(AM-DIR*', '(C-A1*', '(C-A1*)', '(AM-REC*)', '(A2*', '(C-AM-TMP*)', '(C-A2*', '(C-AM-EXT*)', '(A1*)', '(A0*)', '(C-AM-PRD*', '(A2*)', '(AM-EXT*)', '(AM-NEG*', '(A3*', '(AM-DIS*)', '(AM-TMP*', '(C-A3*', '(C-AM-CAU*', '(C-AM-DIS*)', '(AM-CAU*)', '(C-A2*)', '(C-AM-NEG*)', '(C-AM-MNR*', '*)', '(AM-ADV*', '(AM-PNC*', '(AM-PRD*', '(A3*)', '(A1*', '(C-AM-ADV*', '(V*)', '(AM-NEG*)', '(C-V*)', '(A4*', '(AM-MNR*)', '(AM-EXT*', '(A0*', '(A4*)', '(C-AM-TMP*', '(AM-DIS*', '(C-V*', '(C-A0*)', '(AM-ADV*)', '(AM-TMP*)', '(AM-PRD*)', '(AM-REC*', '*'}


In [20]:
# Final null check. TMP was dropped from df by fix_pred, so it is removed from
# the list of columns to test. Filtering by name (instead of deleting the last
# item in place) keeps the cell idempotent: running it again does not strip a
# further column from the check.
test_columns = [label for label in test_columns if label != 'TMP']

for label in test_columns:
    if df[label].isnull().any():
        print('there are missing values on required fields')

In [21]:
# Persist the processed frame as UTF-8 CSV. The index (named INDEX) is written
# as the first column, and the target directory must already exist.
df.to_csv('../datasets/csvs/1.0/gs.csv', encoding='utf-8')

In [22]:
# Spot check by label range on INDEX (both ends are included by .loc);
# in the displayed output these rows belong to sentence 3165
testdf = df.loc[133236:133253]
testdf.head(100)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
133236,2,3165,5538,0,Se,se,CONJ-S,-,*,-,*
133237,3,3165,5538,0,for,ser,V-FIN,FUT|3S|SUBJ,(VP*),-,*
133238,4,3165,5538,0,uma,um,ART,F|S,(NP*,-,(A0*
133239,5,3165,5538,0,greve,greve,N,F|S,*,-,*
133240,6,3165,5538,0,só,só,ADV,-,(PP*,-,*
133241,7,3165,5538,0,por,por,PRP,-,*,-,*
133242,8,3165,5538,0,grevismo,grevismo,N,M|S,(NP*)))),-,*)
133243,9,3165,5538,0,",",-,PU,-,*,-,*
133244,10,3165,5538,0,de,de,PRP,-,(ADVP*,-,*
133245,11,3165,5538,0,aí,aí,ADV,-,*),-,*
