# 1. IMPORTS

In [51]:
#data_propbankbr lives within datasets
import sys
sys.path.append('..')
sys.path.append('../datasets/')
sys.path.append('../datasets/scripts')

import re
import numpy as np
import pandas as pd 
from scripts.propbankbr import propbank_parser

# Columns checked for missing values by the null-check loops below
# (after unstacking, the last entry 'ARG0' is swapped for 'ARG').
REQUIRED_COLUMNS = ['ID', 'S', 'P', 'P_S', 'FORM', 'GPOS', 'CTREE', 'PRED', 'ARG0']
# Columns, in output order, that are kept when the frame is written to gs.csv.
GS_COLUMNS = ['ID', 'S', 'P', 'P_S', 'FORM', 'GPOS', 'CTREE', 'PRED', 'ARG']

In [32]:
# Load the corpus into a DataFrame through the project's propbank_parser helper.
df = propbank_parser()

# Preview the first rows of the parsed frame.
df.head()

Unnamed: 0,S,P,P_S,ID,FORM,GPOS,CTREE,NER,SENSES,PRED,...,ARG7,ARG8,ARG9,ARG10,ARG11,ARG12,ARG13,ARG14,ARG15,ARG16
0,1,0,0,1,In,IN,(S1(S(PP*,*,-,-,...,,,,,,,,,,
1,1,0,0,2,an,DT,(NP(NP*,*,-,-,...,,,,,,,,,,
2,1,0,0,3,Oct.,NNP,*,*,-,-,...,,,,,,,,,,
3,1,0,0,4,19,CD,*,*,-,-,...,,,,,,,,,,
4,1,0,0,5,review,NN,*),*,-,-,...,,,,,,,,,,


In [33]:
# Preview the last rows of the parsed frame.
df.tail()

Unnamed: 0,S,P,P_S,ID,FORM,GPOS,CTREE,NER,SENSES,PRED,...,ARG7,ARG8,ARG9,ARG10,ARG11,ARG12,ARG13,ARG14,ARG15,ARG16
930274,41178,93998,4,26,them,PRP,(NP*),*,-,-,...,,,,,,,,,,
930275,41178,93998,4,27,here,RB,(ADVP*),*,-,-,...,,,,,,,,,,
930276,41178,93998,4,28,with,IN,(PP*,*,-,-,...,,,,,,,,,,
930277,41178,93998,4,29,us,PRP,(NP*))))),*,-,-,...,,,,,,,,,,
930278,41178,93998,4,30,.,.,*)),*,-,-,...,,,,,,,,,,


In [34]:
# For every required column, report the row positions holding missing values.
for label in REQUIRED_COLUMNS:
    is_missing = df[label].isnull().values
    if is_missing.any():
        err_list = [pos for pos, flag in enumerate(is_missing) if flag]
        print('''missing values on {} fields:
                 positions:{}'''.format(label, err_list))


In [35]:
# Inspect two consecutive sentences (S 39832 and 39833); per the original
# note, the token 'vantage' occurs in sentence 39833.
dfvalid = df[df['S'].isin([39832, 39833])]
dfvalid.head(30)

Unnamed: 0,S,P,P_S,ID,FORM,GPOS,CTREE,NER,SENSES,PRED,...,ARG7,ARG8,ARG9,ARG10,ARG11,ARG12,ARG13,ARG14,ARG15,ARG16
898639,39832,90746,0,1,That,DT,(S1(S(NP*),*,-,-,...,,,,,,,,,,
898640,39832,90746,0,2,could,MD,(VP*,*,-,-,...,,,,,,,,,,
898641,39832,90747,1,3,cost,VB,(VP*,*,01,cost,...,,,,,,,,,,
898642,39832,90747,1,4,him,PRP,(NP*),*,-,-,...,,,,,,,,,,
898643,39832,90747,1,5,the,DT,(NP*,*,-,-,...,,,,,,,,,,
898644,39832,90747,1,6,chance,NN,*,*,-,-,...,,,,,,,,,,
898645,39832,90747,1,7,to,TO,(S(VP*,*,-,-,...,,,,,,,,,,
898646,39832,90748,2,8,influence,VB,(VP(VP*,*,01,influence,...,,,,,,,,,,
898647,39832,90748,2,9,the,DT,(NP*,*,-,-,...,,,,,,,,,,
898648,39832,90748,2,10,outcome,NN,*)),*,-,-,...,,,,,,,,,,


 ### 1.1 Normalization
Convert each record into a machine learning example, i.e. one argument per row
* Filter        .: each ARG column
* Create        .: TMP column holding the number of the argument (for later ordering)
* Concatenate   .: all per-argument frames afterwards, by rows
* Sort          .: reorder by S and TMP


In [36]:
def unstack_df(df):
    '''
        Unstack the per-predicate argument columns into a single ARG column.

        Every `ARG<n>` column found in `df` is filtered for its non-null
        entries, joined with the feature columns (all columns to the left of
        `ARG0`), renamed to `ARG` and tagged with `TMP = n + 1`. The pieces
        are concatenated by rows and stably sorted by `S` and `TMP`, so the
        original row order is kept inside each (S, TMP) group.

        A progress line is printed for every argument column.

        args:
            df :DataFrame ConLL flat tree format; must contain the column `S`
                and the columns `ARG0`, `ARG1`, ... (any number of them)
        returns:
            df :DataFrame feature columns plus `ARG` and `TMP`, with a fresh
                range index named `INDEX`
        raises:
            ValueError when `df` has no `ARG0` column
    '''
    column_arg0 = list(df.columns).index('ARG0')
    columns_features = df.columns[:column_arg0]
    df_feature = df.loc[:, columns_features].copy()

    # Discover the argument columns instead of assuming exactly ARG0..ARG16,
    # and process them in numeric order.
    arg_columns = sorted(
        (int(col[3:]), col)
        for col in df.columns
        if isinstance(col, str) and col.startswith('ARG') and col[3:].isdecimal()
    )

    dataframes = []
    num_records = 0
    for number, col in arg_columns:
        df_target = df[col].dropna().to_frame()
        # Right join keeps only the rows that carry a value for this argument
        df_arg = df_feature.join(df_target, how='right')
        df_arg = df_arg.rename(columns={col: 'ARG'})
        df_arg['TMP'] = number + 1

        num_records += df_arg.shape[0]
        print('{:}-\tnew records: {:}\ttotal records: {:}'.format(number, df_arg.shape[0], num_records))
        dataframes.append(df_arg)

    # Concatenate, sort by sentence & argument (stable sort), reindex
    df = pd.concat(dataframes, axis=0)
    df = df.sort_values(by=['S', 'TMP'], axis=0, kind='mergesort')
    df = df.reset_index(drop=True)
    df.index.names = ['INDEX']
    return df

In [37]:
# Replace the wide frame by its one-argument-per-row version.
df = unstack_df(df)


0-	new records: 930279	total records: 930279
1-	new records: 748217	total records: 1678496
2-	new records: 501430	total records: 2179926
3-	new records: 277463	total records: 2457389
4-	new records: 131851	total records: 2589240
5-	new records: 55207	total records: 2644447
6-	new records: 22129	total records: 2666576
7-	new records: 8368	total records: 2674944
8-	new records: 3737	total records: 2678681
9-	new records: 1419	total records: 2680100
10-	new records: 689	total records: 2680789
11-	new records: 372	total records: 2681161
12-	new records: 141	total records: 2681302
13-	new records: 141	total records: 2681443
14-	new records: 141	total records: 2681584
15-	new records: 141	total records: 2681725
16-	new records: 141	total records: 2681866


In [38]:
# Preview the unstacked frame (feature columns plus ARG and TMP).
df.head()

Unnamed: 0_level_0,S,P,P_S,ID,FORM,GPOS,CTREE,NER,SENSES,PRED,ARG,TMP
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,0,0,1,In,IN,(S1(S(PP*,*,-,-,*,1
1,1,0,0,2,an,DT,(NP(NP*,*,-,-,*,1
2,1,0,0,3,Oct.,NNP,*,*,-,-,*,1
3,1,0,0,4,19,CD,*,*,-,-,*,1
4,1,0,0,5,review,NN,*),*,-,-,*,1


In [39]:
# Null check on the unstacked frame: 'ARG0' gives way to 'ARG', and 'TMP' is added.
test_columns = REQUIRED_COLUMNS[:-1] + ['ARG', 'TMP']
for label in test_columns:
    if df[label].isnull().any():
        print('there are missing values on required fields')

In [40]:
# Spot-check a single sentence (S == 400) of the unstacked frame.
dftest = df.loc[df['S'] == 400]
dftest.head(30)

Unnamed: 0_level_0,S,P,P_S,ID,FORM,GPOS,CTREE,NER,SENSES,PRED,ARG,TMP
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
25986,400,886,0,1,For,IN,(S1(S(PP*,*,-,-,*,1
25987,400,886,0,2,the,DT,(NP(NP*,*,-,-,(A1*,1
25988,400,886,0,3,nine,CD,*,*,-,-,*,1
25989,400,886,0,4,months,NNS,*),*,-,-,*),1
25990,400,887,1,5,ended,VBN,(VP*,*,01,end,(V*),1
25991,400,887,1,6,July,NNP,(NP*,*,-,-,(AM-TMP*,1
25992,400,887,1,7,29,CD,*)))),*,-,-,*),1
25993,400,887,1,8,",",",",*,*,-,-,*,1
25994,400,887,1,9,SFE,NNP,(NP*,(ORG*,-,-,*,1
25995,400,887,1,10,Technologies,NNPS,*),*),-,-,*,1


 ### 1.2 FIXING EXISTING FEATURES 
 #### 1.2.1 FIXING PREDICATE
  * __PRED__ must be non-empty only on the row where ARG is (V*)
  * Drop column __TMP__

In [41]:
def fix_pred(df):
    '''
        Keep PRED only on rows whose TMP matches P_S, then drop TMP.

        PRED is replaced by '-' on every row where TMP differs from P_S.
        The frame passed in is left untouched; the result is a new frame.

        args:
            df :DataFrame with the columns TMP, P_S and PRED
        returns:
            df :DataFrame same rows, PRED fixed, without the TMP column
    '''
    mismatch = ~(df['TMP'] == df['P_S'])
    # drop() returns a new frame, so writing to it does not modify the caller's data
    fixed = df.drop(labels='TMP', axis=1)
    fixed.loc[mismatch, 'PRED'] = '-'
    return fixed

In [42]:
# Fix the predicate column (PRED kept only where TMP equals P_S; TMP dropped)
# and preview the result.
df = fix_pred(df)
df.head(30)

Unnamed: 0_level_0,S,P,P_S,ID,FORM,GPOS,CTREE,NER,SENSES,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,0,0,1,In,IN,(S1(S(PP*,*,-,-,*
1,1,0,0,2,an,DT,(NP(NP*,*,-,-,*
2,1,0,0,3,Oct.,NNP,*,*,-,-,*
3,1,0,0,4,19,CD,*,*,-,-,*
4,1,0,0,5,review,NN,*),*,-,-,*
5,1,0,0,6,of,IN,(PP*,*,-,-,*
6,1,0,0,7,``,'',*,*,-,-,*
7,1,0,0,8,The,DT,(NP(NP*,*,-,-,*
8,1,0,0,9,Misanthrope,NN,*),*,-,-,*
9,1,0,0,10,'','',*,*,-,-,*


In [56]:
# Look up every row whose token (FORM) is 'vantage'.
dftest = df.loc[df['FORM'] == 'vantage']
dftest.head(30)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,GPOS,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2512758,4,38694,88095,0,vantage,NN,*,-,*
2587112,10,39833,90751,0,vantage,NN,*,-,*


 #### 1.2.2 FIXING P
  __P__ will be a sequential, range-like id over the predicates.

In [44]:
def fix_p(df):
    '''
        Renumber the propositions of the unstacked frame.

        For every row, the position inside its sentence (`S`) is divided by
        the last `ID` of that sentence; the computation therefore relies on
        that last `ID` being the sentence length and on the propositions of
        a sentence being stored one after another.

        `P_S` becomes the zero-based proposition number inside the sentence
        and `P` a running, one-based proposition id over the whole frame.
        Sentences are numbered in the order in which they first appear in
        `df`, and values are computed per row, so they always land on the
        rows they belong to.

        args:
            df :DataFrame with the columns S and ID (ID convertible to int)
        returns:
            df :DataFrame the same object, with P and P_S overwritten in place
    '''
    if len(df) == 0:
        df['P'] = []
        df['P_S'] = []
        return df

    sentence = df['S']
    # Position of each row inside its sentence, following the row order
    position = df.groupby('S', sort=False).cumcount()
    # Last ID of the sentence, broadcast to every row of that sentence
    last_id = df['ID'].astype(int).groupby(sentence, sort=False).transform('last')
    p_s = position // last_id

    # Number of propositions per sentence, in order of first appearance,
    # turned into the first P value of each sentence (P starts at 1)
    n_props = p_s.groupby(sentence, sort=False).max() + 1
    first_p = n_props.cumsum() - n_props + 1

    df['P'] = (sentence.map(first_p) + p_s).values
    df['P_S'] = p_s.values
    return df

            
    

In [45]:
# Renumber P and P_S, then preview the result.
df = fix_p(df)
df.head(150)

Unnamed: 0_level_0,S,P,P_S,ID,FORM,GPOS,CTREE,NER,SENSES,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,1,0,1,In,IN,(S1(S(PP*,*,-,-,*
1,1,1,0,2,an,DT,(NP(NP*,*,-,-,*
2,1,1,0,3,Oct.,NNP,*,*,-,-,*
3,1,1,0,4,19,CD,*,*,-,-,*
4,1,1,0,5,review,NN,*),*,-,-,*
5,1,1,0,6,of,IN,(PP*,*,-,-,*
6,1,1,0,7,``,'',*,*,-,-,*
7,1,1,0,8,The,DT,(NP(NP*,*,-,-,*
8,1,1,0,9,Misanthrope,NN,*),*,-,-,*
9,1,1,0,10,'','',*,*,-,-,*


In [58]:
# Inspect propositions 90750 and 90751 (marked "train-set dev-set" in the
# original note); 'vantage' was previously found at P 90751.
dftest = df[df['P'].isin([90750, 90751])]
dftest.head(90)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,GPOS,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2587086,1,39832,90750,3,That,DT,(S1(S(NP*),-,*
2587087,2,39832,90750,3,could,MD,(VP*,-,*
2587088,3,39832,90750,3,cost,VB,(VP*,-,*
2587089,4,39832,90750,3,him,PRP,(NP*),-,*
2587090,5,39832,90750,3,the,DT,(NP*,-,*
2587091,6,39832,90750,3,chance,NN,*,-,*
2587092,7,39832,90750,3,to,TO,(S(VP*,-,*
2587093,8,39832,90750,3,influence,VB,(VP(VP*,-,*
2587094,9,39832,90750,3,the,DT,(NP*,-,*
2587095,10,39832,90750,3,outcome,NN,*)),-,*


In [59]:
# Show the index of the processed frame.
print(df.index)


RangeIndex(start=0, stop=2681866, step=1, name='INDEX')


In [49]:
# List the distinct values found in the ARG column.
print(set(df['ARG'].values))

{'(C-A2*)', '(R-AM-EXT*)', '(AM-PRD*', '(AM-MNR*)', '(C-AM-NEG*)', '(R-AM-PNC*', '*', '(C-AM-DIR*', '(C-V*)', '(AM-LOC*)', '(AM-DIS*)', '(R-A1*)', '(C-AM-ADV*', '(R-AM-PNC*)', '(AM-MOD*', '(R-AM-EXT*', '(A0*', '(R-A0*)', '(AM-PRD*)', '(C-AM-LOC*', '(AM-CAU*', '(C-A0*)', '(C-AM-PNC*', '(C-A1*', '(R-AM-LOC*', '(V*)', '(V*', '(AM-LOC*', '(AM-TM*', '(A4*)', '(C-V*', '(R-AM-ADV*', '(AM-ADV*)', '(A0*)', '(AM-ADV*', '(A3*', '(R-A0*', '(AM-DIS*', '(A2*)', '(AM*', '(C-AM-CAU*', '(C-AM-EXT*)', '(C-AM-MNR*', '(AM-DIR*', '(AM-REC*)', '(R-AM-TMP*', '(AM-DIR*)', '*)', '(R-AM-MNR*)', '(R-A3*', '(A1*)', '(A1*', '(AM-EXT*', '(C-AM-EXT*', '(AM-TMP*)', '(R-A3*)', '(A5*', '(R-A4*)', '(A2*', '(AM-NEG*', '(C-A5*', '(C-AM-TMP*', '(C-A3*', '(AM-REC*', '(C-A0*', '(AM-PNC*)', '(R-AM-CAU*)', '(C-A1*)', '(R-AM-TMP*)', '(C-AM-DIS*', '(A3*)', '(AA*)', '(C-AM-DIS*)', '(R-AA*)', '(R-A2*)', '(A5*)', '(AM-TMP*', '(R-AM-LOC*)', '(C-A4*', '(AM-EXT*)', '(AM-CAU*)', '(AM-MOD*)', '(AM-NEG*)', '(A4*', '(R-AM-DIR*', '(AM-PNC*

In [50]:
# Remove the last checklist entry, then repeat the null check on the remaining columns.
test_columns.pop()
for label in test_columns:
    if df[label].isnull().any():
        print('there are missing values on required fields')

In [54]:
# Keep only the GS_COLUMNS, in that order, and write them out as UTF-8 CSV.
df = df[GS_COLUMNS]
df.to_csv('../datasets/csvs/en/gs.csv', encoding='utf-8')

In [55]:
# Spot-check the rows labelled 133236 through 133253 (label slice, both ends included).
testdf = df.loc[133236:133253]
testdf.head(100)

Unnamed: 0_level_0,ID,S,P,P_S,FORM,GPOS,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
133236,5,1952,4507,1,have,AUX,(S(VP*,-,*
133237,6,1952,4507,1,conducted,VBN,(VP*,-,*
133238,7,1952,4507,1,hours,NNS,(NP(NP*),-,*
133239,8,1952,4507,1,of,IN,(PP*,-,*
133240,9,1952,4507,1,tests,NNS,(NP*))),-,*
133241,10,1952,4507,1,on,IN,(PP*,-,*
133242,11,1952,4507,1,themselves,PRP,(NP*))))))),-,*)
133243,12,1952,4507,1,report,VBP,(VP*,report,(V*)
133244,13,1952,4507,1,temporary,JJ,(NP*,-,(A1*
133245,14,1952,4507,1,headaches,NNS,*)),-,*)


In [60]:
# Preview the last rows of the exported frame.
df.tail()

Unnamed: 0_level_0,ID,S,P,P_S,FORM,GPOS,CTREE,PRED,ARG
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2681861,26,41178,93998,3,them,PRP,(NP*),-,(A1*)
2681862,27,41178,93998,3,here,RB,(ADVP*),-,(AM-LOC*
2681863,28,41178,93998,3,with,IN,(PP*,-,*
2681864,29,41178,93998,3,us,PRP,(NP*))))),-,*)
2681865,30,41178,93998,3,.,.,*)),-,*


In [61]:
# Gap between 93998 (the P value on the last rows shown by df.tail()) and
# 90750 (the P value inspected earlier); presumably the number of
# propositions after the train/dev boundary — TODO confirm.
print(93998-90750)

3248
