# 1. IMPORTS

In [1]:
#data_propbankbr lives within datasets
import sys
sys.path.append('../datasets/')
sys.path.append('../datasets/scripts')

import re
import numpy as np
import pandas as pd 
from scripts.propbankbr import propbankbr_parser

In [2]:
# This dataframe has all multiple arguments on columns ARG0 ~ARG6
df = propbankbr_parser('1.0')
print(df.shape)
df.head()


(69122, 17)


Unnamed: 0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG0,ARG1,ARG2,ARG3,ARG4,ARG5,ARG6
0,1,1,0,0,Brasília,Brasília,PROP,F|S,(FCL(NP*),-,*,*,*,*,,,
1,2,1,0,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,(NP*,-,(A0*,*,*,*,,,
2,3,1,0,0,publicada,publicar,V-PCP,F|S,(ICL(VP*),-,*,*,*,*,,,
3,4,1,0,0,hoje,hoje,ADV,-,(ADVP*))),-,*),*,*,*,,,
4,5,1,1,1,revela,revelar,V-FIN,PR|3S|IND,(VP*),revelar,(V*),*,*,*,,,


In [3]:
df.tail()

Unnamed: 0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG0,ARG1,ARG2,ARG3,ARG4,ARG5,ARG6
69117,31,3308,5776,2,de,de,PRP,-,(PP*,-,*,*,,,,,
69118,32,3308,5776,2,J.R.Duran,J.R.Duran,PROP,M|S,(NP*,-,*,*,,,,,
69119,33,3308,5776,2,",",-,PU,-,*,-,*,*,,,,,
69120,34,3308,5776,2,Alexandra_Brochen,Alexandra_Brochen,PROP,F|S,(NP*)))),-,*,*,,,,,
69121,35,3308,5776,2,.,-,PU,-,*),-,*,*,,,,,


 ### 1. 1 Normalization
Convert each record into a machine learning example i.e one argument per row
* Filter        .: each ARG column
* Create        .: tmp row holding the number of the argument ( for further ordering )
* Concatenate   .: everybody afterwards by rows
* Sort          .: reorder by S and TMP


In [4]:
def unstack_df(df):
    '''
        Unstack outputs
        args:
            df :DataFrame ConLL flat tree format
        returns:
            df :DataFrame
    '''
    column_arg0 = list(df.columns).index('ARG0')
    columns_features= df.columns[:column_arg0]
    df_feature= df.loc[:,columns_features].copy()

    dataframes=[]
    num_records=0
    for i in range(7):
        col= 'ARG{:}'.format(i)
        df_target= df[col].dropna().to_frame()
        df_arg = df_feature.join(df_target, how='right')
        df_arg = df_arg.rename(index=str, columns={col: 'ARG'})
        df_arg['TMP']=i+1
    
        num_records+=df_arg.shape[0]
        print('{:}-\tnew records: {:}\ttotal records: {:}'.format(i, df_arg.shape[0],num_records))
        dataframes.append(df_arg)

    #Concatente, sort by sentence & argument, reindex
    df = pd.concat(dataframes,axis=0)
    df.sort_values(by=['S','TMP'],axis=0 ,kind='mergesort', inplace=True)    
    df = df.reset_index(drop=True)
    df.index.names=['INDEX']
    return df

In [5]:
df = unstack_df(df)


0-	new records: 69122	total records: 69122
1-	new records: 40974	total records: 110096
2-	new records: 18593	total records: 128689
3-	new records: 6679	total records: 135368
4-	new records: 2254	total records: 137622
5-	new records: 572	total records: 138194
6-	new records: 178	total records: 138372


In [6]:
df.head()

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG,TMP
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,1,0,0,Brasília,Brasília,PROP,F|S,(FCL(NP*),-,*,1
1,2,1,0,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,(NP*,-,(A0*,1
2,3,1,0,0,publicada,publicar,V-PCP,F|S,(ICL(VP*),-,*,1
3,4,1,0,0,hoje,hoje,ADV,-,(ADVP*))),-,*),1
4,5,1,1,1,revela,revelar,V-FIN,PR|3S|IND,(VP*),revelar,(V*),1


 ### 1.2 FIXING EXISTING FEATURES 
 #### 1.2.1 FIXING PREDICATE
  * __PRED__ must be non empty only when ARG is (V*) 
  * Drop column __TMP__

In [7]:
def fix_pred(df):
    index= (df['TMP'] == df['P_S'])
    df.loc[~index,'PRED']= '-'
    df = df.drop(labels='TMP', axis=1)
    return df

In [8]:
#Fix predicate 
df = fix_pred(df)
df.head(30)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,1,0,0,Brasília,Brasília,PROP,F|S,(FCL(NP*),-,*
1,2,1,0,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,(NP*,-,(A0*
2,3,1,0,0,publicada,publicar,V-PCP,F|S,(ICL(VP*),-,*
3,4,1,0,0,hoje,hoje,ADV,-,(ADVP*))),-,*)
4,5,1,1,1,revela,revelar,V-FIN,PR|3S|IND,(VP*),revelar,(V*)
5,6,1,1,1,um,um,ART,M|S,(NP*,-,(A1*
6,7,1,1,1,dado,dado,N,M|S,*,-,*
7,8,1,1,1,supreendente,surpreendente,ADJ,M|S,(ADJP*),-,*
8,9,1,1,1,:,-,PU,-,*,-,*
9,10,1,2,2,recusando,recusar,V-GER,-,(FCL(ICL(VP*),-,*


In [9]:
df.tail(35)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
138337,1,3308,5774,0,Além_de,além_de,PRP,-,(FCL(PP*,-,*
138338,2,3308,5774,0,Mauro_Salles,Mauro_Salles,PROP,M|S,(NP*,-,*
138339,3,3308,5774,0,que,que,PRON-INDP,M|S,(FCL(NP*),-,*
138340,4,3308,5775,1,surpreendeu,surpreender,V-FIN,PS|3S|IND,(VP*),-,*
138341,5,3308,5775,1,a,o,ART,F|S,(NP*,-,*
138342,6,3308,5775,1,galera,galera,N,F|S,*),-,*
138343,7,3308,5775,1,a,a,PRP,-,(PP*,-,*
138344,8,3308,5775,1,o,o,ART,M|S,(ICL*,-,*
138345,9,3308,5776,2,revelar,revelar,V-INF,-,(VP*),revelar,(V*)
138346,10,3308,5776,2,seu,seu,PRON-DET,M|S,(NP*,-,(A1*


 #### 1.2.2 FIXING P
  __P__ will be a range like id over the predicates.

In [10]:
def fix_p(df):
    P=[]
    P_S=[]
    S= list(set(df['S'].values))
    p=1

    for s in S:
        sindex = df['S'] == s
        idx = [int(x) for x in 
            df.loc[sindex,'ID'].values]
        
        P += [p + int(i / idx[-1])
                for i, _ in enumerate(idx)]
        
        P_S += [int(i / idx[-1])
                for i, _ in enumerate(idx)]

        p = P[-1]

    df['P']=P        
    df['P_S']=P_S        
    return df

            
    

In [11]:
df = fix_p(df)
df.head(35)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,1,1,0,Brasília,Brasília,PROP,F|S,(FCL(NP*),-,*
1,2,1,1,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,(NP*,-,(A0*
2,3,1,1,0,publicada,publicar,V-PCP,F|S,(ICL(VP*),-,*
3,4,1,1,0,hoje,hoje,ADV,-,(ADVP*))),-,*)
4,5,1,1,0,revela,revelar,V-FIN,PR|3S|IND,(VP*),revelar,(V*)
5,6,1,1,0,um,um,ART,M|S,(NP*,-,(A1*
6,7,1,1,0,dado,dado,N,M|S,*,-,*
7,8,1,1,0,supreendente,surpreendente,ADJ,M|S,(ADJP*),-,*
8,9,1,1,0,:,-,PU,-,*,-,*
9,10,1,1,0,recusando,recusar,V-GER,-,(FCL(ICL(VP*),-,*


In [17]:
df1.tail(35)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
138337,1,3308,2469,1,Além_de,além_de,PRP,-,(FCL(PP*,-,*
138338,2,3308,2469,1,Mauro_Salles,Mauro_Salles,PROP,M|S,(NP*,-,*
138339,3,3308,2469,1,que,que,PRON-INDP,M|S,(FCL(NP*),-,*
138340,4,3308,2469,1,surpreendeu,surpreender,V-FIN,PS|3S|IND,(VP*),-,*
138341,5,3308,2469,1,a,o,ART,F|S,(NP*,-,*
138342,6,3308,2469,1,galera,galera,N,F|S,*),-,*
138343,7,3308,2469,1,a,a,PRP,-,(PP*,-,*
138344,8,3308,2469,1,o,o,ART,M|S,(ICL*,-,*
138345,9,3308,2469,1,revelar,revelar,V-INF,-,(VP*),revelar,(V*)
138346,10,3308,2469,1,seu,seu,PRON-DET,M|S,(NP*,-,(A1*


In [15]:
print(df.index)


RangeIndex(start=0, stop=138372, step=1, name='INDEX')


In [16]:
df.to_csv('../datasets/csvs/1.0/gs.csv', encoding='utf-8')