# 1. IMPORTS

In [1]:
#data_propbankbr lives within datasets
import sys
sys.path.append('../datasets/')

import numpy as np
import pandas as pd 
from data_propbankbr import propbankbr_parser, propbankbr_argument_stats, propbankbr_split

In [2]:
# Load the corpus through the project parser. In the resulting dataframe the
# argument labels of each row are spread over the columns ARG0 ~ ARG6
# (one column per argument slot); slots that are not used hold missing values.
df = propbankbr_parser()
df.head()

Unnamed: 0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,DTREE,FUNC,CTREE,PRED,ARG0,ARG1,ARG2,ARG3,ARG4,ARG5,ARG6
0,1,1,0,0,Brasília,Brasília,PROP,F|S,5,ADVL,(FCL(NP*),-,*,*,*,*,,,
1,2,1,0,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,5,SUBJ,(NP*,-,(A0*,*,*,*,,,
2,3,1,0,0,publicada,publicar,V-PCP,F|S,2,N<,(ICL(VP*),-,*,*,*,*,,,
3,4,1,0,0,hoje,hoje,ADV,-,3,ADVL,(ADVP*))),-,*),*,*,*,,,
4,5,1,1,1,revela,revelar,V-FIN,PR|3S|IND,0,STA,(VP*),revelar,(V*),*,*,*,,,


 ### 1.1 Normalization
Convert each record into a machine learning example, i.e. one argument per row:
* Filter        .: each ARG column, dropping its empty entries
* Create        .: TMP column holding the number of the argument (for later ordering)
* Concatenate   .: all per-argument frames by rows
* Sort          .: reorder by S and TMP


In [3]:
# The first 12 columns are the token-level features; the rest are ARG slots.
columns_features = df.columns[:12]
df_feature = df[columns_features].copy()

dataframes = []
num_records = 0
# One pass per argument slot: keep only the rows where that slot is filled,
# attach the features, and tag the rows with the 1-based slot number (TMP).
for slot_number, col in enumerate(('ARG{:}'.format(k) for k in range(7)), start=1):
    df_arg = (
        df_feature
        .join(df[col].dropna().to_frame(), how='right')
        .rename(index=str, columns={col: 'ARG'})
        .assign(TMP=slot_number)
    )

    num_records += df_arg.shape[0]
    print('{:}-\tnew records: {:}\ttotal records: {:}'.format(
        slot_number - 1, df_arg.shape[0], num_records))
    dataframes.append(df_arg)

# Stack every slot, order by sentence then slot (stable sort keeps the token
# order inside each group), and replace the index with a fresh one named INDEX.
df_norm = (
    pd.concat(dataframes, axis=0)
    .sort_values(by=['S', 'TMP'], axis=0, kind='mergesort')
    .reset_index(drop=True)
    .rename_axis('INDEX')
)

0-	new records: 69760	total records: 69760
1-	new records: 42196	total records: 111956
2-	new records: 19346	total records: 131302
3-	new records: 7114	total records: 138416
4-	new records: 2357	total records: 140773
5-	new records: 717	total records: 141490
6-	new records: 240	total records: 141730


 ### 1.2 FIXING EXISTING FEATURES 
 #### 1.2.1 FIXING PREDICATE
  * __PRED__ must be non-empty only when ARG is (V*)
  * Drop column __TMP__

In [4]:
# Fix predicate: PRED is kept only on rows whose argument slot number (TMP)
# equals P_S; every other row gets the empty marker '-'.
slot_mismatch = df_norm['TMP'].ne(df_norm['P_S'])
df_norm.loc[slot_mismatch, 'PRED'] = '-'

# TMP was a helper for ordering and for the comparison above; remove it.
# NOTE: this cell is not re-runnable on the same df_norm, since TMP is gone afterwards.
df_norm.drop(columns='TMP', inplace=True)
df_norm.head()

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,DTREE,FUNC,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1,1,0,0,Brasília,Brasília,PROP,F|S,5,ADVL,(FCL(NP*),-,*
1,2,1,0,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,5,SUBJ,(NP*,-,(A0*
2,3,1,0,0,publicada,publicar,V-PCP,F|S,2,N<,(ICL(VP*),-,*
3,4,1,0,0,hoje,hoje,ADV,-,3,ADVL,(ADVP*))),-,*)
4,5,1,1,1,revela,revelar,V-FIN,PR|3S|IND,0,STA,(VP*),revelar,(V*)


 #### 1.2.2 FIXING P
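  __P__ becomes a running, range-like id over all predicates in the corpus, and __P_S__ is renumbered as the position of the predicate within its sentence.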

In [5]:
# Rebuild P as a running 1-based predicate id over the whole frame and P_S as
# the 0-based predicate number inside each sentence.
#
# Assumes the layout produced by the normalization step: the rows of a sentence
# are contiguous, and each predicate of a sentence owns one block of max(ID)
# rows -- TODO confirm against the parser output.
# NOTE: not re-runnable on the same df_norm: P_S is overwritten with 0-based
# values, so the "P_S > 0" count below would change on a second run.
P=[]
P_S=[]
# Sentence ids in order of first appearance. A plain set() gives no ordering
# guarantee, which could misalign the lists below with the dataframe rows.
S= list(pd.unique(df_norm['S']))
p=1

# groupby(sort=False) visits the sentences in row order in a single pass,
# instead of building one full-length boolean mask per sentence.
for s, df_sentence in df_norm.groupby('S', sort=False):
    # Number of tokens in the sentence (largest token ID).
    sentence_len= max(int(x) for x in df_sentence['ID'].values)
    # Number of predicates in the sentence (distinct positive P_S values).
    n_p= len({int(x) for x in df_sentence['P_S'].values if int(x) > 0})
    for p_s in range(n_p):
        P+= sentence_len*[p]
        P_S+= sentence_len*[p_s]
        p+=1

# Fail with an explicit message if the layout assumption does not hold.
if len(P) != len(df_norm):
    raise ValueError(
        'rebuilt P has {:} entries but df_norm has {:} rows'.format(len(P), len(df_norm)))

df_norm['P']=P
df_norm['P_S']=P_S
df_norm.head()

Unnamed: 0_level_0,ID,S,P,P_S,FORM,LEMMA,GPOS,MORF,DTREE,FUNC,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1,1,1,0,Brasília,Brasília,PROP,F|S,5,ADVL,(FCL(NP*),-,*
1,2,1,1,0,Pesquisa_Datafolha,Pesquisa_Datafolha,N,F|S,5,SUBJ,(NP*,-,(A0*
2,3,1,1,0,publicada,publicar,V-PCP,F|S,2,N<,(ICL(VP*),-,*
3,4,1,1,0,hoje,hoje,ADV,-,3,ADVL,(ADVP*))),-,*)
4,5,1,1,0,revela,revelar,V-FIN,PR|3S|IND,0,STA,(VP*),revelar,(V*)


# 2. NEW FEATURES

These features were originally proposed by Zhou and Xu (2015):
    
[End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P15-1109)


### 2.1 M_R
$m_r = 1$ denotes that the argument position is located in the predicate context region, and $m_r = 0$ that it is not.

In [6]:
# M_R: for every row, the running count of predicate rows (PRED != '-') seen so
# far inside its proposition P -- 0 before the predicate row, >= 1 from the
# predicate row onwards.
#
# The count is computed once per proposition with a grouped cumulative sum.
# Looping over the per-row P values instead would append each proposition's
# counts once for every one of its rows, giving a list much longer than df_norm.
is_predicate= df_norm['PRED'].ne('-').astype(int)
M_R= is_predicate.groupby(df_norm['P'], sort=False).cumsum().tolist()

# One value per row, so M_R can be attached to df_norm as a column.
assert len(M_R) == len(df_norm)

M_R[:10]

[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

 ### 2.2 CTX_P

In [10]:
# CTX_P[lag]: for every row, the LEMMA found `lag` positions away from the
# predicate of the row's proposition ('-' when that position falls outside the
# proposition). The value is constant over all rows of a proposition.
CTX_P={}
LAGS=[lag for lag in range(-3,4) if lag != 0]
# Distinct proposition ids in row order (a bare set() has no guaranteed order).
P= list(pd.unique(df_norm['P']))
for lag in LAGS:
    CTX_P[lag]=[]

# groupby(sort=False) yields the propositions in order of first appearance, so
# the lists stay aligned with the dataframe rows as long as each proposition's
# rows are contiguous; it also avoids one full-length mask per proposition.
for p, df_p in df_norm.groupby('P', sort=False):
    n_p= len(df_p)

    # Position (within the proposition) of the first predicate row.
    # Should be only one.
    predicate_positions= np.flatnonzero((df_p['PRED'] != '-').values)
    if predicate_positions.size == 0:
        raise ValueError('proposition {:} has no predicate row'.format(p))
    idx= int(predicate_positions[0])

    LEMMA_P= list(df_p['LEMMA'].values)
    for lag in LAGS:
        if 0 <= idx + lag < n_p:
            CTX_P[lag]+= [LEMMA_P[idx + lag]]*n_p
        else:
            CTX_P[lag]+= ['-']*n_p

print(CTX_P[-3][:25])

['Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha', 'Pesquisa_Datafolha']


 ### 2.3 PRED
'Since a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the expanded context, the ambiguity can be largely eliminated.' (Zhou and Xu, 2015)

In [31]:
# deep_columns= ['M_R'] + [ 'CTX_P{:+}'.format(key) for key in CTX_P]

# PRED: for every row, the predicate lemma of the row's proposition P.
#
# The predicate is looked up per proposition (first PRED value that is not
# '-'), not by position in a global list of predicates. A positional lookup
# would silently shift every later proposition if P were not exactly 1..N or
# if a proposition had zero or several predicate rows.
P= list(pd.unique(df_norm['P']))
pred_or_missing= df_norm['PRED'].where(df_norm['PRED'] != '-')
PRED= (
    pred_or_missing
    .groupby(df_norm['P'], sort=False)
    .transform('first')   # first non-missing PRED of the proposition, broadcast to its rows
    .fillna('-')          # propositions without any predicate row keep the empty marker
    .tolist()
)

print(PRED[:35])

['revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'revelar', 'recusar', 'recusar']
