In [1]:

import pandas as pd
import pickle
timestamp = "run_2021_01_11_17_00_30_ALL_8995431_r2n07_all"
epoch = "499"
filename = f"/Users/elliotschumacher/Dropbox/git/clel/results/{timestamp}/eval_{epoch}.xlsx"
data_file = "/Users/elliotschumacher/Dropbox/git/clel/datasets/LDC2019T02/ENG/tac_kbp_2015_tedl_training_gold_standard_entity_mentions.tab"

df = pd.read_excel(filename)
header_list = ["system_id", "query_id", "mention_string", "doc_id_offsets", "link_id", "entity_type", "mention_type",
               "confidence", "web_search", "wiki_text", "unknown"]
data = pd.read_csv(data_file, sep="\t", names=header_list, header=None,)

eval_mentions = set()
for i, row_a in df.iterrows():
    eval_mentions.add(row_a['~~mention_uuid'])

with open(f'/Users/elliotschumacher/Dropbox/git/clel/results/{timestamp}/training_ENG.pkl', 'rb') as pkl_f:
    training_queries = pickle.load(pkl_f)
# only en training
#data = data[data.query_id.isin(training_queries)]

# all training
data = data[~data.query_id.isin(eval_mentions)]
data = data[~data['link_id'].str.startswith('NIL')]


popularity = pd.pivot_table(data, index=["link_id"],
                            values="query_id",
                             aggfunc="count").reset_index().sort_values(by="query_id", ascending=False)
popularity

Unnamed: 0,link_id,query_id
1219,m.09c7w0,1913
1349,m.0d05w3,695
397,m.027x630,592
1351,m.0d06m5,585
628,m.03h64,552
...,...,...
1003,m.06g3tb,1
352,m.025svn,1
1005,m.06hrk,1
1007,m.06hx2,1


In [2]:
import re
regex = re.compile(r'\(([0-9.-]+)\)')

pop_correct = 0
orig_correct = 0
total = 0
changed = 0
from codebase import el_scorer
import os
fn_path = os.path.dirname(filename)
os.makedirs(os.path.join(fn_path,"orig"), exist_ok=True)
os.makedirs(os.path.join(fn_path,"pop"), exist_ok=True)

gold_file = open(os.path.join(fn_path,"gold.tac"), 'w')
orig_file = open(os.path.join(fn_path,"orig", "-1"), 'w')
pop_file = open(os.path.join(fn_path,"pop", "-1"), 'w')

from collections import defaultdict
orig_dict = {}
pop_dict = {}
gold_dict = {}
for lang in ["SPA", "ENG", "CMN"]:
    os.makedirs(os.path.join(fn_path,f"orig_{lang}"), exist_ok=True)
    os.makedirs(os.path.join(fn_path,f"pop_{lang}"), exist_ok=True)

    orig_dict[lang] = open(os.path.join(fn_path,f"orig_{lang}", "-1"), 'w')
    pop_dict[lang] = open(os.path.join(fn_path,f"pop_{lang}", "-1"), 'w') 
    gold_dict[lang] = open(os.path.join(fn_path,f"gold_{lang}.tac"), 'w') 

for i, row in df.iterrows():
    top10 = []
    score10 = []
    lang = os.path.basename(row['~~comm'])[:3]
    for j in range(10):
        cn = f"~pred_cuis_{j}"
        if type(row[cn]) is not float:
            cui, rest = row[cn].split("=", 1)
            score = float("-inf")
            if "(-inf)" not in rest:
                score = float(regex.findall(rest)[-1])
                score10.append(score)
                top10.append(cui)
    gold = row["_gold_kbid"]
    these_pop = popularity[popularity["link_id"].isin(top10)].sort_values(by="query_id", ascending=False)
    
    if len(top10) > 0:
        predicted = top10[0]
        original = top10[0]

    else:
        predicted = "NIL"
        original = "NIL"
    
    if len(these_pop) > 0:
        predicted = these_pop["link_id"].iloc[0]

    if predicted == gold:
        pop_correct += 1.
    
    if original == gold:
        orig_correct += 1.
    if predicted != original:
        changed += 1.
    total += 1.
    gold_file.write(f"{row['~~mention_uuid']} {gold}\n")
    orig_file.write(f"{row['~~mention_uuid']} {original}\n")
    pop_file.write(f"{row['~~mention_uuid']} {predicted}\n")
    
    gold_dict[lang].write(f"{row['~~mention_uuid']} {gold}\n")
    orig_dict[lang].write(f"{row['~~mention_uuid']} {original}\n")
    pop_dict[lang].write(f"{row['~~mention_uuid']} {predicted}\n")

print(pop_correct/total)
print(orig_correct/total)

print(changed / total)
gold_file.close()
orig_file.close()
pop_file.close()


orig_results = el_scorer.systemsRankingScript(goldStdFile= os.path.join(fn_path,"gold.tac"),
                               systemsDir= os.path.join(fn_path, 'orig'),
                               focusElFile= os.path.join(fn_path,"gold.tac"))
orig_df = pd.DataFrame.from_dict(orig_results)
orig_df['system_name'] = "original"
pop_results = el_scorer.systemsRankingScript(goldStdFile= os.path.join(fn_path,"gold.tac"),
                               systemsDir= os.path.join(fn_path, 'pop'),
                               focusElFile= os.path.join(fn_path,"gold.tac"))
pop_df = pd.DataFrame.from_dict(pop_results)
pop_df['system_name'] = "popular"


combined_df = pd.concat([orig_df, pop_df])
combined_df.sort_values('system_name')
combined_df.to_csv(os.path.join(fn_path, 'comparison.csv'), index=False,)



for lang in ["ENG", "SPA", "CMN"]:
    gold_dict[lang].close()
    orig_dict[lang].close()
    pop_dict[lang].close()
    
    orig_results = el_scorer.systemsRankingScript(goldStdFile= os.path.join(fn_path,f"gold_{lang}.tac"),
                               systemsDir= os.path.join(fn_path, f'orig_{lang}'),
                               focusElFile= os.path.join(fn_path,f"gold_{lang}.tac"))
    orig_df = pd.DataFrame.from_dict(orig_results)
    orig_df['system_name'] = "original"
    pop_results = el_scorer.systemsRankingScript(goldStdFile= os.path.join(fn_path,f"gold_{lang}.tac"),
                                   systemsDir= os.path.join(fn_path, f'pop_{lang}'),
                                   focusElFile= os.path.join(fn_path,f"gold_{lang}.tac"))
    pop_df = pd.DataFrame.from_dict(pop_results)
    pop_df['system_name'] = "popular"
    
    
    combined_df = pd.concat([orig_df, pop_df])
    combined_df.sort_values('system_name')
    combined_df.to_csv(os.path.join(fn_path, f'comparison_{lang}.csv'), index=False,)


0.6547788873038516
0.4818830242510699
0.22353780313837376


In [3]:
#timestamp = "run_2020_05_08_23_03_35_ALL_r2n03"
#epoch = "249"
filename_aux = f"/Users/elliotschumacher/Dropbox/git/clel/results/{timestamp}/eval_{epoch}.xlsx"

df_aux = pd.read_excel(filename_aux)
pop_correct = 0
orig_correct = 0
total = 0
changed = 0
aux_oracle_agreement = 0

changed_rows = []
score_regex = re.compile(r"(\s\(.*\)$)", re.IGNORECASE)
for i, row in df.iterrows():
    aux_row = df_aux[df_aux['~~mention_uuid'] == row['~~mention_uuid']] 
    aux_top10 = []
    for j in range(10):
        cn = f"~pred_cuis_{j}"
        if type(row[cn]) is not float:
    
            cui, rest = row[cn].split("=", 1)
            score = float("-inf")
            if "(-inf)" not in rest:
                score = float(regex.findall(rest)[-1])
                aux_top10.append({"cui": cui, "score" : score, "name" : rest})
    if len(aux_top10) > 0:
        aux_prediction = aux_top10[0]["cui"]
    else:
        aux_prediction = "NIL"
                
    
    top10 = []
    lang = os.path.basename(row['~~comm'])[:3]
    for j in range(10):
        cn = f"~pred_cuis_{j}"
        if type(row[cn]) is not float:
    
            cui, rest = row[cn].split("=", 1)
            score = float("-inf")
            if "(-inf)" not in rest:
                score = float(regex.findall(rest)[-1])
                top10.append({"cui": cui, "score" : score, "name" : rest})
    gold = row["_gold_kbid"]
    these_pop = popularity[popularity["link_id"].isin([x["cui"] for x in top10])].sort_values(by="query_id", ascending=False)
    
    if len(top10) > 0:
        predicted = top10[0]["cui"]
        original = top10[0]["cui"]
        predicted_name = top10[0]["name"]

    else:
        predicted = "NIL"
        original = "NIL"
        predicted_name = "NIL"
    
    if len(these_pop) > 0:
        predicted = these_pop["link_id"].iloc[0]
        

    if predicted == gold:
        pop_correct += 1.
    
    if original == gold:
        orig_correct += 1.
        
    if predicted != original:
        changed += 1.
        ch_row = row.to_dict()
        pop_name = [x["name"] for x in top10 if x["cui"] == predicted][0]
        pop_name = re.sub(score_regex, "", pop_name)
        orig_name = re.sub(score_regex, "", top10[0]["name"])
        ch_row.update({
            "original_name" : orig_name,
            "original_cui" : top10[0]["cui"],
            "popular_cui" : predicted,
            "popular_name" : pop_name
        })
        changed_rows.append(ch_row)
        if aux_prediction == predicted:
            aux_oracle_agreement += 1.
    total += 1.
print(changed/total)
print(aux_oracle_agreement / changed)

changed_df = pd.DataFrame.from_dict(changed_rows)




0.22353780313837376
0.0


In [4]:
changed_df.to_csv(os.path.join(fn_path, f'changed.csv'), index=False,)




In [5]:
changed_correct_df = changed_df[changed_df["popular_cui"] == changed_df["_gold_kbid"]]
changed_correct_df["pop_freq"] = changed_correct_df.groupby('popular_cui')['popular_cui'].transform('count')
changed_correct_df["lang"] = changed_correct_df.apply(lambda r: os.path.basename(r["~~comm"]).split('_')[0], axis = 1) 
changed_correct_df.to_csv(os.path.join(fn_path, f'changed_correct.csv'), index=False,)
changed_correct_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,_text,_sentence,_gold_kbid,_gold_cui_rank,_gold_cui_score,_num_kb_cands,~~mention_uuid,~~comm,~pred_cuis_0,~pred_cuis_1,...,no_cands,accuracy,max_accuracy,min_accuracy,original_name,original_cui,popular_cui,popular_name,pop_freq,lang
3,Pistorius,"<DOC id=""SPA_NW_001096_20140320_F0000003N"">\n<...",m.0f1lfn,6.0,0.088196,157,TEDL15_TRAINING_10476,/exp/eschumacher/clel_data/LDC2019T02/SPA/trai...,m.0khp_px=Carl Pistorius (0.88),m.0khp_pk=Aimée Pistorius (0.78),...,0.0,0.0,0.0,0.0,Carl Pistorius,m.0khp_px,m.0f1lfn,Oscar Leonard Carl Pistorius,26,SPA
4,Oscar Pistorius,"<DOC id=""SPA_NW_001096_20140320_F0000003N"">\n<...",m.0f1lfn,5.0,-0.049316,111,TEDL15_TRAINING_10477,/exp/eschumacher/clel_data/LDC2019T02/SPA/trai...,m.0khp_px=Carl Pistorius (0.99),m.0v468p5=Sheila Pistorius (0.72),...,0.0,0.0,0.0,0.0,Carl Pistorius,m.0khp_px,m.0f1lfn,Oscar Leonard Carl Pistorius,26,SPA
5,Pistorius,</P>\n<P>\n“Es necesario vender la casa de Pis...,m.0f1lfn,6.0,-0.650391,157,TEDL15_TRAINING_10478,/exp/eschumacher/clel_data/LDC2019T02/SPA/trai...,m.0khp_px=Carl Pistorius (0.85),m.0v468p5=Sheila Pistorius (0.83),...,0.0,0.0,0.0,0.0,Carl Pistorius,m.0khp_px,m.0f1lfn,Oscar Leonard Carl Pistorius,26,SPA
6,Pistorius,"</P>\n<P>\nCon sus prótesis de carbono, dos cu...",m.0f1lfn,7.0,-0.712891,157,TEDL15_TRAINING_10479,/exp/eschumacher/clel_data/LDC2019T02/SPA/trai...,m.0khp_px=Carl Pistorius (0.98),m.0cg1xfw=Boris Pistorius (0.04),...,0.0,0.0,0.0,0.0,Carl Pistorius,m.0khp_px,m.0f1lfn,Oscar Leonard Carl Pistorius,26,SPA
7,Pistorius,"</P>\n<P>\nDesde que ocurrió dicho tiroteo, Pi...",m.0f1lfn,8.0,-0.629883,157,TEDL15_TRAINING_10480,/exp/eschumacher/clel_data/LDC2019T02/SPA/trai...,m.0khp_px=Carl Pistorius (0.97),m.0v468p5=Sheila Pistorius (0.91),...,0.0,0.0,0.0,0.0,Carl Pistorius,m.0khp_px,m.0f1lfn,Oscar Leonard Carl Pistorius,26,SPA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1559,Hillary,Everything about Hillary I read or post will b...,m.0d06m5,3.0,-0.959961,198,TEDL15_TRAINING_26103,/exp/eschumacher/clel_data/LDC2019T02/ENG/trai...,m.0jrc0=edmund_hillary (0.20),m.0g99ln=USS Hillary Clinton (-0.86),...,0.0,0.0,0.0,0.0,edmund_hillary,m.0jrc0,m.0d06m5,First lady Hillary Rodham Clinton,71,ENG
1560,Hillary,"God speed Hillary, see you in the Oval Office ...",m.0d06m5,3.0,-0.729980,198,TEDL15_TRAINING_26104,/exp/eschumacher/clel_data/LDC2019T02/ENG/trai...,m.0jrc0=edmund_hillary (0.08),m.0qn_496=The_frozen_vagina (-0.44),...,0.0,0.0,0.0,0.0,edmund_hillary,m.0jrc0,m.0d06m5,First lady Hillary Rodham Clinton,71,ENG
1561,Clinton,Clinton/Castro would blow up Jeb's dream of al...,m.0d06m5,3.0,-0.945801,198,TEDL15_TRAINING_26106,/exp/eschumacher/clel_data/LDC2019T02/ENG/trai...,m.0t0yj2=Bill Clinton (-0.61),m.0d_m6=clinton_county (-0.94),...,0.0,0.0,0.0,0.0,Bill Clinton,m.0t0yj2,m.0d06m5,First lady Hillary Rodham Clinton,71,ENG
1562,Hillary,We are gonna be just fine in the Hillary Room ...,m.0d06m5,2.0,-0.950195,198,TEDL15_TRAINING_26109,/exp/eschumacher/clel_data/LDC2019T02/ENG/trai...,m.0jrc0=edmund_hillary (-0.58),m.0d06m5=First lady Hillary Rodham Clinton (-0...,...,0.0,0.0,0.0,0.0,edmund_hillary,m.0jrc0,m.0d06m5,First lady Hillary Rodham Clinton,71,ENG


In [6]:
pair_counts = changed_correct_df.groupby([ "original_cui", "popular_cui","original_name", "popular_name"]).size().reset_index().sort_values(by="popular_cui", ascending=False)
pair_counts.rename( columns={0:'pair_counts'}, inplace=True )
pair_counts.to_csv(os.path.join(fn_path, f'pair_counts.csv'), index=False,)
pair_counts

Unnamed: 0,original_cui,popular_cui,original_name,popular_name,pair_counts
313,m.0t4r17x,m.0t4r0zs,Tamerlan Tsarnaev,Dzhokhar A. Tsarnaev,27
58,m.036wjf,m.0t4r0zs,Dzhokhar Dudayev,Dzhokhar A. Tsarnaev,13
330,m.0zcpjsd,m.0t4r0zs,"Tsarnaev Trail Begins, More Bin Laden Secrets,...",Dzhokhar A. Tsarnaev,2
326,m.0y7wt38,m.0qtngg8,Ex Parte Boedel Steenkamp,Reeva Steenkamp,2
48,m.02w2j1q,m.0qtngg8,Orlando Riva Sound,Reeva Steenkamp,1
...,...,...,...,...,...
98,m.04lhllk,m.0157m,Bill,bill_clinton,1
90,m.047bhk1,m.0157m,Bill Clinton Boulevard,bill_clinton,1
0,m.012mjr,m.0157m,Bill & Melinda Gates Foundation,bill_clinton,1
144,m.063zs6l,m.011p3,Suu Kyi trespasser incidents,Aung_San_Suu_Kyi,1


In [7]:
lang_counts = changed_correct_df.groupby("lang").size().reset_index()
lang_counts



Unnamed: 0,lang,0
0,CMN,632
1,ENG,419
2,SPA,187


In [8]:
changed_correct_df= changed_correct_df.merge(pair_counts, on=["original_cui", "popular_cui"])
changed_correct_df.to_csv(os.path.join(fn_path, f'changed_correct.csv'), index=False,)




