In [1]:
import pandas as pd
from pathlib import Path
from updates_join_annotations import open_df_from_tsv
import updates_get_average_scores_per_label as scores


In [10]:
for index, filename in enumerate(Path('./sample_data').glob('**/*.tsv')):
    """
    Creates pd.DataFrame by joining files from different annotators and different documents to one
    large df
    """
    
    print(filename)
    # Extract annotator name from doc
    annotator = filename.stem
    
    # Use the first file to create df
    if index == 0:
        df = open_df_from_tsv(filename)
        
    # Update df with new files
    else: 
        # Create temporary df
    
        df_temp = open_df_from_tsv(filename)
    
        # if file is already in rows, and annotator is already in colmumns, then update
        if df_temp['file_id'][1] in set(df['file_id']) and f'labels_{annotator}' in df.columns:
            df.update(df_temp)
        # Elif file is in rows (and annotator not yet in columns), then concat with axis=1
        elif df_temp['file_id'][1] in set(df['file_id']):
            df_temp.drop(['token_d_id', 'token', 'file_id', 'sent_id', 'token_s_id'], axis=1, inplace=True)
            df = pd.concat([df, df_temp], axis=1, sort=False)
        # Else
        else:
            df = pd.concat([df, df_temp], join='inner')

    
    

sample_data/Meskers+wk_project_2020-07-24_1249/annotation/notities_2017_deel1_cleaned.csv---2503.conll/meskers.tsv
sample_data/Meskers+wk_project_2020-07-24_1249/annotation/notities_2017_deel1_cleaned.csv---2546.conll/meskers.tsv
sample_data/Meskers+wk_project_2020-07-24_1249/annotation/notities_2017_deel1_cleaned.csv---2570.conll/meskers.tsv
sample_data/Bos+wk_project_2020-07-24_1202/annotation/notities_2017_deel1_cleaned.csv---2503.conll/bos.tsv
sample_data/Bos+wk_project_2020-07-24_1202/annotation/notities_2017_deel1_cleaned.csv---2546.conll/bos.tsv
sample_data/Bos+wk_project_2020-07-24_1202/annotation/notities_2017_deel1_cleaned.csv---2570.conll/bos.tsv
sample_data/Avelli+wk_project_2020-07-24_1202/annotation/notities_2017_deel1_cleaned.csv---2503.conll/avelli.tsv
sample_data/Avelli+wk_project_2020-07-24_1202/annotation/notities_2017_deel1_cleaned.csv---2546.conll/avelli.tsv
sample_data/Avelli+wk_project_2020-07-24_1202/annotation/notities_2017_deel1_cleaned.csv---2570.conll/avelli

In [11]:
print(df.shape)
df.head()

(127, 12)


Unnamed: 0,token_d_id,token,temp,file_id,sent_id,token_s_id,labels_bos,relation_bos,temp.1,labels_avelli,relation_avelli,temp.2
2546_0,0,Op,_,2546,1,1,_,_,_,type\_Background[1],_,_
2546_1,1,Dd-mm-jjjj,_,2546,1,2,_,_,_,type\_Background[1],_,_
2546_2,2,zag,_,2546,1,3,_,_,_,type\_Background[1],_,_
2546_3,3,ik,_,2546,1,4,_,_,_,type\_Background[1],_,_
2546_4,4,bovengenoemde,_,2546,1,5,_,_,_,type\_Background[1],_,_


In [12]:
# Write to pickle
df.to_pickle('./sample_data/token_level_df_all_annotators_all_docs.pkl')

In [13]:
# To read file
# df = pd.read_pickle('./sample_data/token_level_df_all_annotators_all_docs.pkl')

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 127 entries, 2546_0 to 2503_35
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   token_d_id       127 non-null    object
 1   token            127 non-null    object
 2   temp             127 non-null    object
 3   file_id          127 non-null    object
 4   sent_id          127 non-null    object
 5   token_s_id       127 non-null    object
 6   labels_bos       91 non-null     object
 7   relation_bos     91 non-null     object
 8   temp             91 non-null     object
 9   labels_avelli    127 non-null    object
 10  relation_avelli  127 non-null    object
 11  temp             127 non-null    object
dtypes: object(12)
memory usage: 12.9+ KB


In [4]:
## Testing on 3 annotators
df = pd.read_pickle('./sample_data/token_level_df_all_annotators_all_docs.pkl')
annotator_names = ['avelli', 'bos', 'meskers']
    
new_df = scores.get_dataframe(df, annotator_names)

score = 0
for annotator in annotator_names:
    try:
        dicta = dict()
        # get personal dicts per annotator
        dicta = scores.get_annotator_dict(df, 'labels_'+annotator)
        # score the annotations in the df
        new_df = scores.find_matches(new_df, annotator, dicta)
        score += new_df[annotator]
    except KeyError:
        print('KeyError: {} does not exist in the table.'.format(annotator))
        continue

# sum the scores and put in new column
new_df['score'] = score
new_df.to_excel('excel_with_scores.xlsx')

KeyError: meskers does not exist in the table.
<class 'pandas.core.frame.DataFrame'>
Index: 18 entries, type\_Background_2546_0_12 to _2503_0_35
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   label            18 non-null     object 
 1   file_id          18 non-null     object 
 2   begin_span       18 non-null     int64  
 3   end_span         18 non-null     int64  
 4   token            18 non-null     object 
 5   2nd_label        7 non-null      object 
 6   2nd_file_id      7 non-null      object 
 7   begin_2nd_label  7 non-null      float64
 8   end_2nd_label    7 non-null      float64
 9   unknown          7 non-null      object 
 10  avelli           18 non-null     int64  
 11  bos              18 non-null     int64  
 12  meskers          18 non-null     int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 2.0+ KB
KeyError: meskers does not exist in the table.


In [7]:
def print_scores(new_df):
# extract average scores per label
    print('average scores domains:')
    print('average score Stemming: '+ str(score.get_average_score_per_label(new_df, 'Stemming')))
    print('average score Lopen: '+ str(score.get_average_score_per_label(new_df, 'Lopen')))
    print('average score Beroep en Werk: '+str(score.get_average_score_per_label(new_df, 'Beroep')))
    print('average score Inspanningstolerantie: '+str(score.get_average_score_per_label(new_df, 'Inspanningstolerantie')))
    print()
    print('average scores levels:')
    print('average score FAC 0: '+ str(score.get_average_score_per_label(new_df, 'FAC 2')))
    print('average score FAC 1: '+ str(score.get_average_score_per_label(new_df, 'FAC 1')))
    print('average score FAC 2: '+ str(score.get_average_score_per_label(new_df, 'FAC 2')))
    print('average score FAC 3: '+ str(score.get_average_score_per_label(new_df, 'FAC 2')))
    print('average score FAC 4: '+ str(score.get_average_score_per_label(new_df, 'FAC 4')))
    print('average score FAC 5: '+ str(score.get_average_score_per_label(new_df, 'FAC 5')))
    print()
    print('average score STM 0: '+ str(score.get_average_score_per_label(new_df, 'STM 0')))
    print('average score STM 1: '+ str(score.get_average_score_per_label(new_df, 'STM 1')))
    print('average score STM 2: '+ str(score.get_average_score_per_label(new_df, 'STM 2')))
    print('average score STM 3: '+ str(score.get_average_score_per_label(new_df, 'STM 3')))
    print('average score STM 4: '+ str(score.get_average_score_per_label(new_df, 'STM 4')))
    print('average score STM 5: '+ str(score.get_average_score_per_label(new_df, 'STM 5')))
    print()
    print('average score INS 0: '+ str(score.get_average_score_per_label(new_df, 'INS 0')))
    print('average score INS 1: '+ str(score.get_average_score_per_label(new_df, 'INS 1')))
    print('average score INS 2: '+ str(score.get_average_score_per_label(new_df, 'INS 2')))
    print('average score INS 3: '+ str(score.get_average_score_per_label(new_df, 'INS 3')))
    print('average score INS 4: '+ str(score.get_average_score_per_label(new_df, 'INS 4')))
    print()
    print('average score BER 0: '+ str(score.get_average_score_per_label(new_df, 'BER 0')))
    print('average score BER 1: '+ str(score.get_average_score_per_label(new_df, 'BER 1')))
    print('average score BER 2: '+ str(score.get_average_score_per_label(new_df, 'BER 2')))
    print('average score BER 3: '+ str(score.get_average_score_per_label(new_df, 'BER 3')))
    print('average score BER 4: '+ str(score.get_average_score_per_label(new_df, 'BER 4')))
    print()
    print('average score remaining labels')
    print('average score stm\_reaction: '+ str(score.get_average_score_per_label(new_df, 'reaction')))
    print('average score type\_Background: '+ str(score.get_average_score_per_label(new_df, 'Background')))
    print('average score view\_Patient: '+ str(score.get_average_score_per_label(new_df, 'Patient')))
    print('average score view\_Thirdparty: '+ str(score.get_average_score_per_label(new_df, 'Third')))
    print('average score type\_Implicit: '+ str(score.get_average_score_per_label(new_df, 'Implicit')))

In [8]:
print_scores(new_df)

average scores domains:


AttributeError: 'Series' object has no attribute 'get_average_score_per_label'