# Analyse Similarity  - BC2GM

In [1]:
import os,sys, tempfile, pandas as pd, shutil, logging, glob
sys.path.append("src")

In [2]:
base_data_dir = "tmp"

In [3]:
working_dir =  "tempspliiterhajkA"
if os.path.exists(working_dir) : shutil.rmtree(working_dir)
os.makedirs(working_dir, exist_ok=True)

In [4]:
def scores_to_df(scores, details, data_set, data_type, task_type):
    df = pd.DataFrame()
    for k , v in scores.items():
        df[k] = v
        
    for k , v in details.items():
        df[k + "_detail"] = v
    
    df["data_set"] = data_set + " " + data_type
    df["task_type"] = task_type
         
    return df

In [5]:
logging.basicConfig(level="WARN", handlers=[logging.StreamHandler(sys.stdout)],
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [6]:
thresholds = [0,25,50,75,100]
ngrams = [1]
num_parts = 4

### 1. BC2GM

[Biocreative II gene mention](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/) overlap. Please download the test and train files for this task from the BioCreative Website.


In [7]:
bc2gm_train_file = os.path.join(base_data_dir, "train.in")
bc2gm_test_file = os.path.join(base_data_dir, "test.in")

bc2gm_train_eval_file = os.path.join(base_data_dir, "trainGENE.eval")
bc2gm_test_eval_file = os.path.join(base_data_dir, "testGENE.eval")

bc2gm_test_alteval_file = os.path.join(base_data_dir, "testALTGENE.eval")

# This is the predictions
bc2gm_predictions_file = os.path.join(base_data_dir, "bc2gm_result_test_pred.txt")

In [8]:
from bc2_gene_mention import BC2GeneMentionText 


bc2gmrun = BC2GeneMentionText()



### BC2GM : Threshold based simlarity

In [9]:

bc2gm_out_dir_text =  os.path.join(working_dir, "bc2gm_text")
os.makedirs(bc2gm_out_dir_text, exist_ok=True)
result_summary, result_details = bc2gmrun.run_similarity_threshold_splitter("text", bc2gm_train_file,  bc2gm_test_file, bc2gm_out_dir_text, bc2gm_test_eval_file, bc2gm_test_alteval_file,bc2gm_predictions_file ,thresholds,ngrams)
df_bc2_gm_text = pd.DataFrame(result_summary)
df_bc2_gm_text["dataset"] = "BC2GM (text)"



In [10]:
df_bc2_gm_text.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset
0,1,0,25.0,990,19.8,0.744872,0.688389,0.811453,BC2GM (text)
1,1,25,50.0,3706,74.12,0.823754,0.782895,0.869114,BC2GM (text)
2,1,50,75.0,257,5.14,0.870801,0.838308,0.905914,BC2GM (text)
3,1,75,,47,0.94,0.78481,0.794872,0.775,BC2GM (text)
4,-1,0,100.0,5000,100.0,0.817055,0.774819,0.86416,BC2GM (text)


In [11]:
part_4_df = result_details[-1]
part_4_df

Unnamed: 0,text,docid,raw
14,Biol.,BC2GM000365921,BC2GM000365921 Biol.\n
15,Copyright 2000 Academic Press.,BC2GM000462914,BC2GM000462914 Copyright 2000 Academic Press.\n
223,1988).,BC2GM004818954,BC2GM004818954 1988).\n
255,Coronary vasoconstriction caused by endothelin...,BC2GM005484781,BC2GM005484781 Coronary vasoconstriction cause...
416,We also show that in fusions with the DNA bind...,BC2GM008579133,BC2GM008579133 We also show that in fusions wi...
429,Biol.,BC2GM008918951,BC2GM008918951 Biol.\n
445,Acad.,BC2GM009274132,BC2GM009274132 Acad.\n
568,A heterologous promoter construct containing t...,BC2GM011571896,BC2GM011571896 A heterologous promoter constru...
650,Virol.,BC2GM013081581,BC2GM013081581 Virol.\n
748,1995.,BC2GM014978613,BC2GM014978613 1995.\n


In [12]:
pred_df = bc2gmrun._split_predictions(part_4_df, bc2gm_predictions_file)
altgene_df = bc2gmrun._split_predictions(part_4_df, bc2gm_test_alteval_file)
gene_df = bc2gmrun._split_predictions(part_4_df, bc2gm_test_eval_file)


In [13]:
pred_df["pos"]=pred_df["raw"].apply(lambda x: x.split("|")[1])
pred_df.sort_values(by=["docid", "text"])

Unnamed: 0,text,raw,docid,pos
362,endothelin - 1,BC2GM005484781|32 43|endothelin - 1\n,BC2GM005484781,32 43
588,BHV - alpha TIF,BC2GM008579133|82 93|BHV - alpha TIF\n,BC2GM008579133,82 93
587,GAL4,BC2GM008579133|48 51|GAL4\n,BC2GM008579133,48 51
758,AP1,BC2GM011571896|113 115|AP1\n,BC2GM011571896,113 115
759,BZLF1 promoter,BC2GM011571896|131 143|BZLF1 promoter\n,BC2GM011571896,131 143
757,CREB,BC2GM011571896|108 111|CREB\n,BC2GM011571896,108 111
755,Sp1 site,BC2GM011571896|64 70|Sp1 site\n,BC2GM011571896,64 70
756,ZII,BC2GM011571896|104 106|ZII\n,BC2GM011571896,104 106
760,beta - globin,BC2GM011571896|155 165|beta - globin\n,BC2GM011571896,155 165
1310,protein S,BC2GM020782530|4 11|protein S\n,BC2GM020782530,4 11


In [14]:
gene_df["pos"]=gene_df["raw"].apply(lambda x: x.split("|")[1])

gene_df.sort_values(by=["docid", "text"])

Unnamed: 0,text,raw,docid,pos
1724,endothelin-1,BC2GM005484781|32 43|endothelin-1\n,BC2GM005484781,32 43
1513,BHV-alpha TIF,BC2GM008579133|82 93|BHV-alpha TIF\n,BC2GM008579133,82 93
1512,GAL4,BC2GM008579133|48 51|GAL4\n,BC2GM008579133,48 51
3231,AP1,BC2GM011571896|113 115|AP1\n,BC2GM011571896,113 115
3232,BZLF1 promoter,BC2GM011571896|131 143|BZLF1 promoter\n,BC2GM011571896,131 143
3230,CREB,BC2GM011571896|108 111|CREB\n,BC2GM011571896,108 111
3228,Sp1 site,BC2GM011571896|64 70|Sp1 site\n,BC2GM011571896,64 70
3229,ZII,BC2GM011571896|104 106|ZII\n,BC2GM011571896,104 106
3233,beta-globin,BC2GM011571896|155 165|beta-globin\n,BC2GM011571896,155 165
2418,protein S,BC2GM020782530|4 11|protein S\n,BC2GM020782530,4 11


In [15]:
full_df = gene_df.merge(pred_df, on=["docid", "pos"], how="outer", suffixes=["_gt", "_pred"]).merge(part_4_df, on=["docid"])

In [16]:
mismatch_df = full_df [ full_df.text_pred.isna()|full_df.text_gt.isna() ].sort_values(by=["docid"])

In [17]:
mismatch_df.shape

(19, 8)

In [18]:
mismatch_df[mismatch_df.text_gt.isna() ]

Unnamed: 0,text_gt,raw_gt,docid,pos,text_pred,raw_pred,text,raw
35,,,BC2GM025094273,3 11,activated,BC2GM025094273|3 11|activated\n,The activated glucocorticoid receptor forms a ...,BC2GM025094273 The activated glucocorticoid re...
48,,,BC2GM038263382,14 33,type I mechanoreceptor,BC2GM038263382|14 33|type I mechanoreceptor\n,Slowly adapting type I mechanoreceptor dischar...,BC2GM038263382 Slowly adapting type I mechanor...
45,,,BC2GM045755791,68 83,mitogenic pathway,BC2GM045755791|68 83|mitogenic pathway\n,The recruitment of constitutively phosphorylat...,BC2GM045755791 The recruitment of constitutive...
47,,,BC2GM046414753,145 172,human IL - 4 cytokine gene cluster,BC2GM046414753|145 172|human IL - 4 cytokine g...,Analysis of 1 Mb of published sequence from th...,BC2GM046414753 Analysis of 1 Mb of published s...
42,,,BC2GM047470331,114 116,FP6,BC2GM047470331|114 116|FP6\n,The human SHBG proximal promoter was analyzed ...,BC2GM047470331 The human SHBG proximal promote...
41,,,BC2GM047470331,110 112,FP1,BC2GM047470331|110 112|FP1\n,The human SHBG proximal promoter was analyzed ...,BC2GM047470331 The human SHBG proximal promote...
4,,,BC2GM057069955,4 7,dial,BC2GM057069955|4 7|dial\n,Non-dialyzable transfer factor,BC2GM057069955 Non-dialyzable transfer factor\n
30,,,BC2GM066709301,159 160,HU,BC2GM066709301|159 160|HU\n,"Although RAD17, RAD24 and MEC3 are not require...","BC2GM066709301 Although RAD17, RAD24 and MEC3 ..."
31,,,BC2GM066709301,206 207,HU,BC2GM066709301|206 207|HU\n,"Although RAD17, RAD24 and MEC3 are not require...","BC2GM066709301 Although RAD17, RAD24 and MEC3 ..."


In [19]:
mismatch_df[mismatch_df.text_pred.isna() ]

Unnamed: 0,text_gt,raw_gt,docid,pos,text_pred,raw_pred,text,raw
9,capping enzyme,BC2GM045048834|88 100|capping enzyme\n,BC2GM045048834,88 100,,,Mutational analysis of yeast CEG1 demonstrated...,BC2GM045048834 Mutational analysis of yeast CE...
46,human IL-4 cytokine gene,BC2GM046414753|145 165|human IL-4 cytokine gene\n,BC2GM046414753,145 165,,,Analysis of 1 Mb of published sequence from th...,BC2GM046414753 Analysis of 1 Mb of published s...
10,LH,BC2GM058378997|1 2|LH\n,BC2GM058378997,1 2,,,"(LH P < 0.05, LH/FSH P < 0.01).","BC2GM058378997 (LH P < 0.05, LH/FSH P < 0.01).\n"
11,LH,BC2GM058378997|10 11|LH\n,BC2GM058378997,10 11,,,"(LH P < 0.05, LH/FSH P < 0.01).","BC2GM058378997 (LH P < 0.05, LH/FSH P < 0.01).\n"
12,FSH,BC2GM058378997|13 15|FSH\n,BC2GM058378997,13 15,,,"(LH P < 0.05, LH/FSH P < 0.01).","BC2GM058378997 (LH P < 0.05, LH/FSH P < 0.01).\n"
0,Urease,BC2GM063249278|0 5|Urease\n,BC2GM063249278,0 5,,,"Urease activity, judged as the amount of ammon...","BC2GM063249278 Urease activity, judged as the ..."
1,Jack bean meal urease,BC2GM063249278|101 118|Jack bean meal urease\n,BC2GM063249278,101 118,,,"Urease activity, judged as the amount of ammon...","BC2GM063249278 Urease activity, judged as the ..."
15,cAMP-CRP,BC2GM069445885|117 124|cAMP-CRP\n,BC2GM069445885,117 124,,,The reconstituted RNA polymerases containing t...,BC2GM069445885 The reconstituted RNA polymeras...
36,HIV-2 nef,BC2GM087325432|51 58|HIV-2 nef\n,BC2GM087325432,51 58,,,These results demonstrate a specific associati...,BC2GM087325432 These results demonstrate a spe...
37,HIV-1 nef,BC2GM087325432|66 73|HIV-1 nef\n,BC2GM087325432,66 73,,,These results demonstrate a specific associati...,BC2GM087325432 These results demonstrate a spe...


In [20]:
bc2gm_out_dir_anno =  os.path.join(working_dir, "bc2gm_anno")
os.makedirs(bc2gm_out_dir_anno, exist_ok=True)
result_detail = bc2gmrun.run_similarity_threshold_splitter("eval", bc2gm_train_eval_file,  bc2gm_test_eval_file, bc2gm_out_dir_anno, bc2gm_test_eval_file, bc2gm_test_alteval_file, bc2gm_predictions_file ,thresholds,ngrams)
df_bc2_gm_anno = pd.DataFrame(result_detail)
df_bc2_gm_anno["dataset"] = "BC2GM (anno)"

In [21]:
df_bc2_gm_anno.head()

Unnamed: 0,0,1,2,3,4,dataset
0,"{'ngram': 1, 'min': 0, 'max': 25, 'num': 1059,...","{'ngram': 1, 'min': 25, 'max': 50, 'num': 352,...","{'ngram': 1, 'min': 50, 'max': 75, 'num': 1565...","{'ngram': 1, 'min': 75, 'max': None, 'num': 33...","{'ngram': -1, 'min': 0, 'max': 100, 'num': 633...",BC2GM (anno)
1,text ...,text ...,text \ 5 ...,text \ 1 g...,,BC2GM (anno)


In [22]:
pd.set_option('display.max_colwidth', -1)

print(mismatch_df[mismatch_df.text_pred.isna()].sort_values(by=["docid"])
      .to_latex( columns=[ "text_gt","pos","text"], index=False)
      .replace("(text)","")
      .replace("nan","-")
     )
        

\begin{tabular}{lll}
\toprule
                  text\_gt &      pos &                                                                                                                                                                                                           text \\
\midrule
 capping enzyme &  88 100 &  Mutational analysis of yeast CEG1 demonstrated that four of the five conserved motifs are essential for capping enzyme function in vivo. \\
 human IL-4 cytokine gene &  145 165 &  Analysis of 1 Mb of published sequence from the region of conserved synteny on human chromosome 5q31-q33 identified 45 gene candidates, including 35 expressed genes in the human IL-4 cytokine gene cluster. \\
 LH &  1 2 &  (LH P < 0.05, LH/FSH P < 0.01). \\
 LH &  10 11 &  (LH P < 0.05, LH/FSH P < 0.01). \\
 FSH &  13 15 &  (LH P < 0.05, LH/FSH P < 0.01). \\
 Urease &  0 5 &  Urease activity, judged as the amount of ammonia production from urea, could be measured at 25 ng per tube (S/N = 1.5) with

  """Entry point for launching an IPython kernel.
