 # 1. DEFINE I/O DATA FRAMES

 ## The CoNLL SRL evaluation format specifications can be found at
 > http://www.lsi.upc.edu/~srlconll/soft.html#srlconll

In [1]:
import sys
sys.path.append('../models/')
sys.path.append('../')

import numpy as np 
import pandas as pd 
import re 

from propbank import Propbank



# --- Run identification: which model / hyper-parameters / version to convert ---
MODEL_NAME = 'dblstm_crf_2'
HPARAMS_STR = 'lr1.00e-04_hs8x4_ctx-p1_glove_s50'
MODEL_V = '00'

# --- Base directories (both end with a trailing slash) ---
INPUT_DIR = '../datasets/binaries/'
OUTPUT_DIR = '../outputs/'

# Pickled dataset that is read back in the next cell
INPUT_PATH = '{:}{:}'.format(INPUT_DIR, 'db_pt_LEMMA_glove_s50.pickle')

# All artifacts of a run share the same <OUTPUT_DIR><model>/<hparams>/<version>/
# prefix, so the three paths are built in one pass over the file names.
OUTPUT_PATH, EVAL_PATH, GOLD_PATH = (
    '{:}{:}/{:}/{:}/{:}'.format(OUTPUT_DIR, MODEL_NAME, HPARAMS_STR, MODEL_V, filename)
    for filename in ('Yhat_valid.csv', 'conll_evaluate_valid.txt', 'conll_golden_valid.txt')
)

print(INPUT_PATH)
print(OUTPUT_PATH)

../datasets/binaries/db_pt_LEMMA_glove_s50.pickle
../outputs/dblstm_crf_2/lr1.00e-04_hs8x4_ctx-p1_glove_s50/00/Yhat_valid.csv


In [2]:
# Recover the pickled Propbank dataset from disk
propbank = Propbank.recover(INPUT_PATH)

# Fetch the four validation-split features used by the CoNLL conversion.
# The third argument (True) is passed exactly as before -- presumably it asks
# for decoded / plain-text values; TODO confirm against Propbank.feature.
S_d, P_d, PRED_d, ARG_d = (
    propbank.feature('valid', column, True)
    for column in ('S', 'P', 'PRED', 'ARG')
)

# Read the model's output; the first CSV column becomes the frame index
df = pd.read_csv(OUTPUT_PATH, index_col=0)
Y_d = df['Y_ARG'].to_dict()

# All five mappings are expected to report the same length
print(*(len(mapping) for mapping in (S_d, P_d, PRED_d, ARG_d, Y_d)))
df.head()



12298 12298 12298 12298 12298


Unnamed: 0_level_0,Y_ARG,Y_T
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1
123846,(A0*,A0
123847,*,A0
123848,*),A0
123849,(V*),V
123850,(A1*,A1


 # 2. Outputs Conll

In [3]:
def outputs_conll_with_dicts(S, P, PRED, Y):
    '''
    Converts a token-per-row dataset to CoNLL column format.

    The four arguments are mappings that share the same keys and are walked in
    the iteration order of `S`:

    args:
      S    .: dict<key, sentence id>     -- a change of value starts a new sentence
      P    .: dict<key, proposition id>  -- a change of value inside a sentence
                                            starts a new proposition
      PRED .: dict<key, str>             -- predicate text, or '-' when the token
                                            is not the predicate
      Y    .: dict<key, str>             -- argument tag written for the token

    returns:
      d_conll .: dict<str, dict<int, str>> keyed by column name ('PRED', 'ARG0',
                 'ARG1', ...); each inner dict maps an output row number to a
                 value. The k-th proposition of a sentence is written to the
                 column 'ARG<k>', re-using the rows of the sentence's first
                 proposition. A row holding '' separates consecutive sentences
                 in every column that exists when the separator is written, so
                 columns created later may lack some row numbers.
                 An empty `S` yields an empty dict.
    '''
    d_conll = {}
    row = 0             # next output row to be written
    sentence_start = 0  # output row where the current sentence begins
    pps = 0             # proposition number within the current sentence
    prev_p = None
    prev_s = None
    first = True        # explicit flag: no id value can be mistaken for "nothing seen yet"

    for index, s in S.items():
        p = P[index]
        pred = PRED[index]
        y = Y[index]

        if first or s != prev_s:
            # New sentence. Only the sentence id is tested, so the boundary is
            # detected even when proposition ids restart for every sentence.
            pps = 0
            if not first:
                # conll format .: blank row between consecutive sentences
                for column in d_conll.values():
                    column[row] = ''
                row += 1
            sentence_start = row
        elif p != prev_p:
            # New proposition of the same sentence: next column, rewind the rows
            pps += 1
            row = sentence_start

        if first:
            # 'PRED' is created before any 'ARG<k>' so it is the first column
            d_conll['PRED'] = {}
            first = False

        pred_column = d_conll['PRED']
        arg_column = d_conll.setdefault('ARG{:}'.format(pps), {})

        # A row keeps the predicate text once any proposition supplies it;
        # '-' only fills rows that have no value yet.
        if row not in pred_column or pred != '-':
            pred_column[row] = pred

        arg_column[row] = y
        prev_p = p
        prev_s = s
        row += 1

    return d_conll
        


 # 3. Make Evaluation

In [7]:

# Build the CoNLL columns for the model's predictions (Y_d)
d_eval = outputs_conll_with_dicts(S_d, P_d, PRED_d, Y_d)

# Convert dictionary to DataFrame
df = pd.DataFrame.from_dict(d_eval, orient='columns')

# PRED first, then every ARG<k> column in numeric order. The columns are taken
# from the frame itself: a fixed ARG0..ARG5 list raises KeyError when the split
# has fewer than six propositions per sentence and drops columns when it has more.
arg_columns = sorted(
    (name for name in df.columns if name.startswith('ARG')),
    key=lambda name: int(name[3:]),
)
df = df[['PRED'] + arg_columns]

# Tab-separated, no header and no index column
df.to_csv(EVAL_PATH, sep='\t', index=False, header=False)
df.head(30)

Unnamed: 0,PRED,ARG0,ARG1,ARG2,ARG3,ARG4,ARG5
0,-,(A0*,,,,,
1,-,*,,,,,
2,-,*),,,,,
3,estar,(V*),,,,,
4,-,(A1*,,,,,
5,-,*,,,,,
6,-,*,,,,,
7,-,*,,,,,
8,-,*,,,,,
9,-,*,,,,,


 # 4. Make Gold

In [8]:
# Build the CoNLL columns for the gold-standard annotations (ARG_d)
d_gold = outputs_conll_with_dicts(S_d, P_d, PRED_d, ARG_d)

# Convert dictionary to DataFrame
df = pd.DataFrame.from_dict(d_gold, orient='columns')

# PRED first, then every ARG<k> column in numeric order. The columns are taken
# from the frame itself: a fixed ARG0..ARG5 list raises KeyError when the split
# has fewer than six propositions per sentence and drops columns when it has more.
arg_columns = sorted(
    (name for name in df.columns if name.startswith('ARG')),
    key=lambda name: int(name[3:]),
)
df = df[['PRED'] + arg_columns]

# Tab-separated, no header and no index column
df.to_csv(GOLD_PATH, sep='\t', index=False, header=False)
df.head(30)


Unnamed: 0,PRED,ARG0,ARG1,ARG2,ARG3,ARG4,ARG5
0,-,(A2*,,,,,
1,-,*,,,,,
2,-,*),,,,,
3,estar,(V*),,,,,
4,-,(A1*,,,,,
5,-,*,,,,,
6,-,*,,,,,
7,-,*,,,,,
8,-,*,,,,,
9,-,*,,,,,
