In [1]:
from rdflib import Namespace, Graph, URIRef, Literal
# Import pandas package

import pandas as pd
from difflib import SequenceMatcher

In [68]:
# Set the character width pandas uses when rendering frames as text.
pd.options.display.width = 120

# Recreating the graphs from the xml files

In [70]:

# Namespace prefix of the RDF resources; it is stripped from the triple terms
# when the graphs are turned into dataframes further down.
namespace_string = "http://fapranlp.de/"

# Names of the evaluated texts (also the base names of the RDF/XML files).
# claustrophobia and stress were processed twice: once included in the
# diseases training vocabulary and once not.
diseases = (
    "depression",
    "anxietydisorders",
    "bulimia",
    "claustrophobia",
    "claustrophobia2",
    "dementia",
    "eatingdisorders",
    "panicdisorder",
    "psychosis",
    "socialanxiety",
    "stress",
    "stress2",
)

# Parallel lists: position i of each list holds the graph for diseases[i].
medextractor = []
manual = []

for disease in diseases:
    # Graph produced by MedExtractor for this text.
    extracted_graph = Graph()
    extracted_graph.parse(f"../MedExtractor/resources/evaluation_rdfs/{disease}.xml")

    # Manually created reference graph for the same text.
    reference_graph = Graph()
    reference_graph.parse(f"../MedExtractor/resources/manual_evaluation_rdfs/{disease}.xml")

    medextractor.append(extracted_graph)
    manual.append(reference_graph)


# Loading the symptoms into dataframes

In [71]:
def _graph_to_symptom_frame(graph):
    """Flatten one graph into a dataframe with one row per triple.

    Parameters
    ----------
    graph : iterable of (subject, predicate, object)
        Each term must support ``str.replace``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Disorder``, ``Relationship`` and ``Symptom``. The namespace
        prefix is removed from every term; for the object the additional
        ``symptom:`` prefix is removed and underscores become spaces.
        An empty graph yields an empty frame that still has all three columns.
    """
    symptom_prefix = namespace_string + "symptom:"
    # Collect plain records first and build the frame once; concatenating a
    # one-row frame per triple is quadratic in the number of triples.
    records = [
        {
            "Disorder": s.replace(namespace_string, ''),
            "Relationship": p.replace(namespace_string, ''),
            "Symptom": o.replace(symptom_prefix, '').replace('_', ' '),
        }
        for s, p, o in graph
    ]
    return pd.DataFrame(records, columns=["Disorder", "Relationship", "Symptom"])


# One dataframe per evaluated text, in the same order as the graph lists.
dfs_medextractor = [_graph_to_symptom_frame(graph) for graph in medextractor]
dfs_manual = [_graph_to_symptom_frame(graph) for graph in manual]



True positive (TP): a test result that correctly indicates the presence of a condition or characteristic.
- MedExtractor found a symptom that is among the manually identified symptoms (in this notebook: the extracted symptom occurs as a substring of at least one manual symptom).

True negative (TN): a test result that correctly indicates the absence of a condition or characteristic.
- Not relevant here: neither precision nor recall (defined below) uses true negatives.

False positive (FP): a test result that wrongly indicates that a particular condition or attribute is present.
- MedExtractor found a symptom that is not among the manually identified symptoms.

False negative (FN): a test result that wrongly indicates that a particular condition or attribute is absent.
- MedExtractor did not find a symptom that is among the manually identified symptoms (in this notebook: no extracted symptom occurs as a substring of the manual symptom).

Precision = TP / (TP + FP)


Recall = TP / (TP + FN)


# Finding which manually gathered symptoms the MedExtractor symptoms match

In [72]:
original_symptom = "Original_Symptom"
# Column layout of the per-text result frames (the keys are the column names).
data = {'Symptom_Medextractor': [],
        original_symptom: []}

# One result frame per evaluated text: every MedExtractor symptom together
# with the manual symptoms that contain it.
medextractor_in_manual = []

for df_medextractor, df_manual in zip(dfs_medextractor, dfs_manual):
    manual_symptoms = df_manual['Symptom'].tolist()
    # Build all rows first and create the frame once. This avoids the
    # quadratic row-by-row concat and no longer relies on the iterrows()
    # label being equal to the row position in the result frame.
    records = [
        {
            "Symptom_Medextractor": extracted,
            # Substring match; several hits are joined with ';',
            # no hit leaves the empty string.
            original_symptom: ";".join(
                candidate for candidate in manual_symptoms if extracted in candidate
            ),
        }
        for extracted in df_medextractor['Symptom']
    ]
    # Passing the columns explicitly keeps them present for an empty frame.
    medextractor_in_manual.append(pd.DataFrame(records, columns=list(data)))



# Finding which manually gathered symptoms were found by MedExtractor
and counting them

In [87]:
symptom_found = "Symptom_found"
# Column layouts of the frames built in this cell (keys are the column names).
data = {'Symptom_manual': [],
        symptom_found: []}
sizes_data = {'Text': [],
              'No. entries manual': [],
              'No. entries Medextractor': []}

# One result frame per evaluated text: every manual symptom together with the
# MedExtractor symptoms that occur inside it.
manual_symptoms_found_by_medextractor = []
size_records = []

for disease, df_medextractor, df_manual in zip(diseases, dfs_medextractor, dfs_manual):
    extracted_symptoms = df_medextractor['Symptom'].tolist()
    # Build all rows first and create the frame once. This avoids the
    # quadratic row-by-row concat and no longer relies on the iterrows()
    # label being equal to the row position in the result frame.
    records = [
        {
            "Symptom_manual": manual_symptom,
            # Substring match; several hits are joined with ';',
            # no hit leaves the empty string.
            symptom_found: ";".join(
                candidate for candidate in extracted_symptoms if candidate in manual_symptom
            ),
        }
        for manual_symptom in df_manual['Symptom']
    ]
    manual_symptoms_found_by_medextractor.append(pd.DataFrame(records, columns=list(data)))

    # Number of rows in the manual and in the MedExtractor frame for this text.
    size_records.append({'Text': disease,
                         'No. entries manual': df_manual.shape[0],
                         'No. entries Medextractor': df_medextractor.shape[0]})

df_sizes = pd.DataFrame(size_records, columns=list(sizes_data))

 

In [74]:
# Show, per evaluated text, the number of rows in the manual and in the MedExtractor dataframe.
print(df_sizes)

                Text No. entries manual No. entries Medextractor
0         depression                 27                       20
1   anxietydisorders                 42                       16
2            bulimia                 25                        5
3     claustrophobia                 37                        0
4    claustrophobia2                 37                       42
5           dementia                 21                       15
6    eatingdisorders                 41                        5
7      panicdisorder                 24                        5
8          psychosis                 22                        5
9      socialanxiety                 16                        3
10            stress                 18                        0
11           stress2                 18                        1


# Creating statistics over all files

In [88]:
# Column layout of the statistics frame (keys are the column names).
result_data = {'Text': [],
               'Precision': [],
               'Recall': []}

statistics_records = []

for disease, df_result_medextractor_in_manual, df_result_manual_symptoms_found_by_medextractor in zip(
        diseases, medextractor_in_manual, manual_symptoms_found_by_medextractor):

    # Precision is counted over the MedExtractor symptoms: a non-empty match
    # column means the symptom was matched to a manual one (TP), empty means FP.
    tp = (df_result_medextractor_in_manual[original_symptom] != '').sum()
    fp = (df_result_medextractor_in_manual[original_symptom] == '').sum()
    if (tp + fp) != 0:
        precision = tp / (tp + fp)
    else:
        precision = 0

    # Recall is counted over the manual symptoms: empty match column means the
    # symptom was missed (FN), non-empty means it was found (TP).
    fn = (df_result_manual_symptoms_found_by_medextractor[symptom_found] == '').sum()
    tp2 = (df_result_manual_symptoms_found_by_medextractor[symptom_found] != '').sum()
    # Guard on the actual denominator (tp2 + fn). The previous check used
    # (tp2 + fp), which let 0/0 through whenever there were no manual
    # symptoms but MedExtractor had false positives.
    if (tp2 + fn) != 0:
        recall = tp2 / (tp2 + fn)
    else:
        recall = 0

    print(f"{disease}: tp1: {tp}, fp: {fp} + fn: {fn}, tp2: {tp2}. precision: {precision}, recall: {recall}")
    statistics_records.append({'Text': disease,
                               'Precision': precision,
                               'Recall': recall})

# Build the frame once instead of concatenating one-row frames in the loop.
statistics_data = pd.DataFrame(statistics_records, columns=list(result_data))

depression: tp1: 18, fp: 2 + fn: 13, tp2: 14. precision: 0.9, recall: 0.5185185185185185
anxietydisorders: tp1: 8, fp: 8 + fn: 33, tp2: 9. precision: 0.5, recall: 0.21428571428571427
bulimia: tp1: 4, fp: 1 + fn: 21, tp2: 4. precision: 0.8, recall: 0.16
claustrophobia: tp1: 0, fp: 0 + fn: 37, tp2: 0. precision: 0, recall: 0
claustrophobia2: tp1: 38, fp: 4 + fn: 17, tp2: 20. precision: 0.9047619047619048, recall: 0.5405405405405406
dementia: tp1: 4, fp: 11 + fn: 18, tp2: 3. precision: 0.26666666666666666, recall: 0.14285714285714285
eatingdisorders: tp1: 3, fp: 2 + fn: 39, tp2: 2. precision: 0.6, recall: 0.04878048780487805
panicdisorder: tp1: 4, fp: 1 + fn: 20, tp2: 4. precision: 0.8, recall: 0.16666666666666666
psychosis: tp1: 5, fp: 0 + fn: 18, tp2: 4. precision: 1.0, recall: 0.18181818181818182
socialanxiety: tp1: 0, fp: 3 + fn: 16, tp2: 0. precision: 0.0, recall: 0.0
stress: tp1: 0, fp: 0 + fn: 18, tp2: 0. precision: 0, recall: 0
stress2: tp1: 0, fp: 1 + fn: 18, tp2: 0. precision: 0

In [89]:
# Put the size table and the precision/recall columns side by side.
# NOTE(review): the name `all` shadows the Python builtin; it is kept because
# the following cells refer to it.
all = pd.concat(
    objs=[df_sizes, statistics_data.loc[:, ['Precision', 'Recall']]],
    axis="columns",
)

In [83]:
# Show the combined table (sizes plus precision/recall); `all` here is the dataframe, not the builtin.
print(all)

                Text No. entries manual No. entries Medextractor Precision Recall
0         depression                 27                       20     0.900  0.519
1   anxietydisorders                 42                       16     0.500  0.214
2            bulimia                 25                        5     0.800  0.160
3     claustrophobia                 37                        0     0.000  0.000
4    claustrophobia2                 37                       42     0.905  0.541
5           dementia                 21                       15     0.267  0.143
6    eatingdisorders                 41                        5     0.600  0.049
7      panicdisorder                 24                        5     0.800  0.167
8          psychosis                 22                        5     1.000  0.182
9      socialanxiety                 16                        3     0.000  0.000
10            stress                 18                        0     0.000  0.000
11           str

In [84]:
# Render the combined table as LaTeX without the row index and print it.
latex_table = all.to_latex(index=False)
print(latex_table)

\begin{tabular}{lllll}
\toprule
            Text & No. entries manual & No. entries Medextractor & Precision & Recall \\
\midrule
      depression &                 27 &                       20 &     0.900 &  0.519 \\
anxietydisorders &                 42 &                       16 &     0.500 &  0.214 \\
         bulimia &                 25 &                        5 &     0.800 &  0.160 \\
  claustrophobia &                 37 &                        0 &     0.000 &  0.000 \\
 claustrophobia2 &                 37 &                       42 &     0.905 &  0.541 \\
        dementia &                 21 &                       15 &     0.267 &  0.143 \\
 eatingdisorders &                 41 &                        5 &     0.600 &  0.049 \\
   panicdisorder &                 24 &                        5 &     0.800 &  0.167 \\
       psychosis &                 22 &                        5 &     1.000 &  0.182 \\
   socialanxiety &                 16 &                        3 &   

In [91]:
# Column means over all texts. The string column 'Text' is dropped explicitly:
# older pandas only warned and silently skipped it, newer versions raise.
# astype(float) keeps the remaining columns averageable even if they were
# stored with object dtype.
print(all.drop(columns=["Text"]).astype(float).mean())

No. entries manual          27.333333
No. entries Medextractor     9.750000
Precision                    0.480952
Recall                       0.164456
dtype: float64


  """Entry point for launching an IPython kernel.


In [92]:
# Column means over all texts, rendered as LaTeX. The string column 'Text' is
# dropped explicitly: older pandas only warned and silently skipped it, newer
# versions raise. astype(float) keeps object-dtype number columns averageable.
# NOTE(review): index=False omits the row labels, so the table shows only the
# four values -- confirm that this is intended.
print(all.drop(columns=["Text"]).astype(float).mean().to_latex(index=False))

\begin{tabular}{r}
\toprule
        0 \\
\midrule
27.333333 \\
 9.750000 \\
 0.480952 \\
 0.164456 \\
\bottomrule
\end{tabular}



  """Entry point for launching an IPython kernel.


# Creating tables over all results

In [95]:

# For every evaluated text print its name, then the MedExtractor-to-manual
# table followed by the manual-to-MedExtractor table, both as LaTeX without
# the row index.
per_text_results = zip(diseases, medextractor_in_manual, manual_symptoms_found_by_medextractor)
for text_name, extracted_matches, manual_matches in per_text_results:
    print(text_name)
    for table in (extracted_matches, manual_matches):
        print(table.to_latex(index=False))

depression
\begin{tabular}{ll}
\toprule
  Symptom\_Medextractor &                                   Original\_Symptom \\
\midrule
             interests &              neglecting your hobbies and interests \\
               worried &                         feeling anxious or worried \\
            neglecting &              neglecting your hobbies and interests \\
                 aches &                        unexplained aches and pains \\
       low self esteem &                                                    \\
              low mood &                                continuous low mood \\
              interest & neglecting your hobbies and interests;having no... \\
         low sex drive &                     low sex drive (loss of libido) \\
        loss of libido &                     low sex drive (loss of libido) \\
   changes in appetite &                      changes in appetite or weight \\
                 pains &                        unexplained aches and pains \\
 