# Similarity comparison 

In [1]:
import os,sys, tempfile, pandas as pd
sys.path.append("src")

In [2]:
# Scratch directory for this notebook; created up front if it does not exist.
# NOTE(review): the data-file cells below build their paths under the literal
# "tmp" directory, not under working_dir — confirm which location is intended.
working_dir =  "tempv2"
os.makedirs(working_dir, exist_ok=True)

In [3]:
def scores_to_df(scores, details, data_set, data_type, task_type):
    """Collect similarity scores and their details into one labelled DataFrame.

    Args:
        scores: Mapping of column name -> column values; each entry becomes a
            column named after its key.
        details: Mapping of column name -> column values; each entry becomes a
            column named ``<key>_detail``.
        data_set: Data set name used to build the ``data_set`` label.
        data_type: Optional qualifier appended to ``data_set`` (e.g. "text").
            An empty value is skipped so the label has no trailing space.
        task_type: Value stored in the ``task_type`` column for every row.

    Returns:
        A DataFrame with the score columns, the ``*_detail`` columns, and the
        constant ``data_set`` and ``task_type`` label columns.
    """
    df = pd.DataFrame()
    for name, values in scores.items():
        df[name] = values

    for name, values in details.items():
        df[name + "_detail"] = values

    # Join only the non-empty parts: plain concatenation produced labels such
    # as "SST2 " (trailing space) whenever data_type was empty, which then
    # leaked into group-by keys and exported tables.
    df["data_set"] = " ".join(part for part in (data_set, data_type) if part)
    df["task_type"] = task_type

    return df

## 1. BC2GM

Train/test overlap for the [BioCreative II gene mention](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/) task. Please download the train and test files for this task from the BioCreative website.


In [4]:
# BC2GM train/test inputs used by the "text" comparison.
bc2gm_train_file, bc2gm_test_file = (
    os.path.join("tmp", name) for name in ("train.in", "test.in")
)

# BC2GM train/test inputs used by the "eval" comparison.
bc2gm_train_eval_file, bc2gm_test_eval_file = (
    os.path.join("tmp", name) for name in ("trainGENE.eval", "testGENE.eval")
)

In [5]:
from bc2_gene_mention import BC2GeneMentionText 


bc2gmrun = BC2GeneMentionText()

# Train-vs-test comparison in "text" mode.
result_score, result_detail = bc2gmrun.run_similarity_comparer(
    "text", bc2gm_train_file, bc2gm_test_file
)
df_bc2_gm_text = scores_to_df(
    scores=result_score,
    details=result_detail,
    data_set="BC2GM",
    data_type="text",
    task_type="NER",
)

# Train-vs-test comparison in "eval" mode, labelled "anno" in the results.
result_score, result_detail = bc2gmrun.run_similarity_comparer(
    "eval", bc2gm_train_eval_file, bc2gm_test_eval_file
)
df_bc2_gm_eval = scores_to_df(
    scores=result_score,
    details=result_detail,
    data_set="BC2GM",
    data_type="anno",
    task_type="NER",
)

# Stack both result sets; the data_set column tells them apart.
df_bc2_gm = pd.concat(
    [
        df_bc2_gm_text,
        df_bc2_gm_eval,
    ]
)

Exact matches Unigram, 39 / 5000
Exact matches Bigram, 26 / 5000
Exact matches Trigram, 26 / 5000
Exact matches Unigram, 1996 / 6331
Exact matches Bigram, 541 / 6331
Exact matches Trigram, 134 / 6331


In [6]:
# Fixed seed so the same two rows are shown on every Restart & Run All.
df_bc2_gm.sample(n=2, random_state=42)

Unnamed: 0,Unigram,Bigram,Trigram,Unigram_detail,Bigram_detail,Trigram_detail,data_set,task_type
6138,70.710678,0.0,0.0,"(prolactin gene, prolactin)","(prolactin gene, alkaline phosphatases)","(prolactin gene, alkaline phosphatases)",BC2GM anno,NER
2844,48.038446,21.081851,11.785113,(Total body water and distribution space of Na...,(Total body water and distribution space of Na...,(Total body water and distribution space of Na...,BC2GM text,NER


In [7]:
# Summary statistics of the score columns per data_set label, with the
# statistic names stacked into the row index.
df_bc2_gm.groupby(["data_set"]).describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BC2GM anno,count,6331.0,6331.0,6331.0
BC2GM anno,mean,70.766285,19.552406,5.41172
BC2GM anno,std,35.975868,35.854864,20.895407
BC2GM anno,min,0.0,0.0,0.0
BC2GM anno,25%,57.735027,0.0,0.0
BC2GM anno,50%,81.649658,0.0,0.0
BC2GM anno,75%,100.0,25.819889,0.0
BC2GM anno,max,100.0,100.0,100.0
BC2GM text,count,5000.0,5000.0,5000.0
BC2GM text,mean,33.19164,13.122176,4.200074


In [8]:
# Detail entry of the row with the highest Unigram score.
(
    df_bc2_gm_text
    .sort_values(by="Unigram", ascending=False)["Unigram_detail"]
    .iloc[0]
)

('Mutational analysis of yeast CEG1 demonstrated that four of the five conserved motifs are essential for capping enzyme function in vivo.',
 'Mutational analysis of yeast CEG1 demonstrated that four of the five conserved motifs are essential for capping enzyme function in vivo.')

In [9]:
# Detail entry of the row with the highest Trigram score.
(
    df_bc2_gm_text
    .sort_values(by="Trigram", ascending=False)["Trigram_detail"]
    .iloc[0]
)

('A heterologous promoter construct containing three repeats of a consensus Sp1 site, cloned upstream of a single copy of the ZII (CREB/ AP1) element from the BZLF1 promoter linked to the beta-globin TATA box, exhibited phorbol ester inducibility.',
 'A heterologous promoter construct containing three repeats of a consensus Sp1 site, cloned upstream of a single copy of the ZII (CREB/ AP1) element from the BZLF1 promoter linked to the beta-globin TATA box, exhibited phorbol ester inducibility.')

## 2. AIMED (Random)

In [10]:
# Single AIMed input file; it is passed to both the AIMedRandom and the
# AIMedUniqueDoc comparisons below.
aimed_file = os.path.join("tmp", "AIMedFull_preprocessed.json")

In [11]:
from aimed_random import AIMedRandom

# Run the AIMedRandom similarity comparison and label the rows for the summary.
result_score, result_detail = AIMedRandom().run_similarity_comparer(aimed_file)
df_aimed_random = scores_to_df(
    scores=result_score,
    details=result_detail,
    data_set="AIMED (R)",
    data_type="",
    task_type="REL",
)

In [12]:
# Summary statistics of the score columns per data_set label, with the
# statistic names stacked into the row index.
df_aimed_random.groupby(["data_set"]).describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AIMED (R),count,584.0,584.0,584.0
AIMED (R),mean,96.947936,82.294414,73.146194
AIMED (R),std,11.921198,18.519621,21.285885
AIMED (R),min,27.498597,8.247861,0.0
AIMED (R),25%,100.0,77.777778,66.666667
AIMED (R),50%,100.0,86.666667,77.777778
AIMED (R),75%,100.0,93.801712,87.053315
AIMED (R),max,100.0,100.0,100.0


In [13]:
# Detail entry of the row with the highest Unigram score.
(
    df_aimed_random
    .sort_values(by="Unigram", ascending=False)["Unigram_detail"]
    .iloc[0]
)

('Using the yeast two-hybrid system, a genetic assay for studying protein-protein interactions, we have examined and compared the interaction of the PROTEIN1 receptor ( PROTEIN2 ) and the PROTEIN ( PROTEIN ) with their two known substrates PROTEIN and the PROTEIN substrate-1 ( PROTEIN ).',
 'Using the yeast two-hybrid system, a genetic assay for studying protein-protein interactions, we have examined and compared the interaction of the PROTEIN1 receptor ( PROTEIN ) and the PROTEIN2 ( PROTEIN ) with their two known substrates PROTEIN and the PROTEIN substrate-1 ( PROTEIN ).')

In [14]:
# Detail entry of the row with the lowest Unigram score.
(
    df_aimed_random
    .sort_values(by="Unigram", ascending=True)["Unigram_detail"]
    .iloc[0]
)

('While the circulating PROTEIN1 of renal origin is acutely activated to contribute to the accomplishment of short term homeostatic reactions, tissue PROTEIN2 , in paracrine or autocrine ways, exerts local-regional adaptive actions of long duration.',
 'In the end we refer to the relations between circulating PROTEIN of renal origin and the extrarenal PROTEIN1 specific to tissue underlying the fact that the two PROTEIN2 components cooperate and constitute a unitary hormonal system, regulating the major functions of the organism and maintaining the main homeostatic equilibria.')

## 3. AIMED (Unique Document)

In [15]:
from aimed_uniquedoc import AIMedUniqueDoc

# Run the AIMedUniqueDoc similarity comparison and label the rows for the summary.
result_score, result_detail = AIMedUniqueDoc().run_similarity_comparer(aimed_file)
df_aimed_unique = scores_to_df(
    scores=result_score,
    details=result_detail,
    data_set="AIMED (U)",
    data_type="",
    task_type="REL",
)

In [16]:
# Summary statistics of the score columns per data_set label, with the
# statistic names stacked into the row index.
df_aimed_unique.groupby(["data_set"]).describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AIMED (U),count,498.0,498.0,498.0
AIMED (U),mean,67.137026,36.067096,20.770509
AIMED (U),std,15.317615,19.457675,18.481003
AIMED (U),min,28.426762,0.0,0.0
AIMED (U),25%,53.935989,21.629523,8.679295
AIMED (U),50%,68.640647,31.622777,14.629372
AIMED (U),75%,78.920519,44.72136,26.726124
AIMED (U),max,87.017654,80.489529,71.632286


In [17]:
# Detail entry of the row with the highest Unigram score.
(
    df_aimed_unique
    .sort_values(by="Unigram", ascending=False)["Unigram_detail"]
    .iloc[0]
)

('Using antibody-mediated immunofluorescence copatching of epitope-tagged receptors, we provide evidence in live cells for preexisting heteromeric ( PROTEIN / PROTEIN and PROTEIN / PROTEIN1 ) and homomeric ( PROTEIN / PROTEIN , PROTEIN / PROTEIN , PROTEIN / PROTEIN , and also PROTEIN2 / PROTEIN ) oligomers in the absence of ligand.',
 ' PROTEIN1 is a potent, tight-binding inhibitor of Cdks and can inhibit the phosphorylation of PROTEIN by PROTEIN - PROTEIN , PROTEIN - PROTEIN , PROTEIN2 - PROTEIN , and PROTEIN - PROTEIN complexes.')

In [18]:
# Detail entry of the row with the lowest Unigram score.
(
    df_aimed_unique
    .sort_values(by="Unigram", ascending=True)["Unigram_detail"]
    .iloc[0]
)

('In contrast, in normal nasal tissues, PROTEIN1 labeling was only found in the vascular wall, and the expression was weaker--a finding demonstrating that PROTEIN2 is upregulated in nasal polyps.',
 'We found PROTEIN1 mRNA in several peripheral tissues, and detected PROTEIN2 protein on cultured vascular endothelial cells.')

## 4. SST2 Dataset

In [19]:

# SST2 input files, all under "tmp". Only the train and test files are passed
# to the comparison in this notebook.
(
    sst2_train_file,
    sst2_test_file,
    sst2_sentiment_labels_file,
    sst2_dictionary_file,
) = (
    os.path.join("tmp", name)
    for name in ("train.tsv", "test.tsv", "sentiment_labels.txt", "dictionary.txt")
)



In [20]:
from sst2_dataset import SST2Dataset
# Run the SST2 train-vs-test similarity comparison and label the rows for the summary.
result_score, result_detail = SST2Dataset().run_similarity_comparer(
    sst2_train_file, sst2_test_file
)
df_sst2 = scores_to_df(
    scores=result_score,
    details=result_detail,
    data_set="SST2",
    data_type="",
    task_type="CLS",
)

In [21]:
# Summary statistics of the score columns per data_set label, with the
# statistic names stacked into the row index.
df_sst2.groupby(["data_set"]).describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SST2,count,1821.0,1821.0,1821.0
SST2,mean,46.055815,17.378379,1.388049
SST2,std,14.067934,18.009431,6.733712
SST2,min,0.0,0.0,0.0
SST2,25%,36.514837,0.0,0.0
SST2,50%,43.643578,16.666667,0.0
SST2,75%,53.452248,30.151134,0.0
SST2,max,100.0,100.0,70.710678


In [22]:
# Print each element of the detail entry at position 5 (0-based) of the
# Unigram-descending ordering, separated by blank lines.
print(
    *df_sst2.sort_values(by="Unigram", ascending=False)["Unigram_detail"].iloc[5],
    sep='\n\n'
)

still , i thought it could have been more .

well-thought 


In [23]:
# Print each element of the detail entry at position 5 (0-based) of the
# Unigram-ascending ordering, separated by blank lines.
print(
    *df_sst2.sort_values(by="Unigram", ascending=True)["Unigram_detail"].iloc[5],
    sep='\n\n'
)

made by jackasses for jackasses .

hide new secretions from the parental units 


## 5. BC3 Article classification

In [24]:
# BC3 article-classification train/test inputs.
bc3_act_train_file, bc3_act_test_file = (
    os.path.join("tmp", name)
    for name in ("bc3_act_all_records.tsv", "bc3_act_all_records_test.tsv")
)

In [25]:
from bc3_article_classification import BC3ArticleClassification

# Run the BC3 article-classification train-vs-test comparison and label the rows.
result_score, result_detail = BC3ArticleClassification().run_similarity_comparer(
    bc3_act_train_file, bc3_act_test_file
)
df_bc3_act = scores_to_df(
    scores=result_score,
    details=result_detail,
    data_set="BC3ACT",
    data_type="",
    task_type="CLS",
)

In [26]:
# Summary statistics of the BC3ACT score columns.
df_bc3_act.describe()

Unnamed: 0,Unigram,Bigram,Trigram
count,6000.0,6000.0,6000.0
mean,26.760271,6.907863,1.807134
std,9.325118,5.474373,1.728708
min,6.27761,0.0,0.0
25%,20.094071,3.474651,0.835709
50%,25.677218,5.298225,1.476668
75%,31.914209,8.530364,2.352489
max,75.011317,51.197969,18.973666


In [27]:
# Print each element of the detail entry with the highest Trigram score,
# separated by blank lines.
print(
    *df_bc3_act.sort_values(by="Trigram", ascending=False)["Trigram_detail"].iloc[0],
    sep='\n\n'
)

Wnt/beta-catenin and NF-kappaB signaling mechanisms provide central controls in development and disease, but how these pathways intersect is unclear. Using hair follicle induction as a model system, we show that patterning of dermal Wnt/beta-catenin signaling requires epithelial beta-catenin activity. We find that Wnt/beta-catenin signaling is absolutely required for NF-kappaB activation, and that Edar is a direct Wnt target gene. Wnt/beta-catenin signaling is initially activated independently of EDA/EDAR/NF-kappaB activity in primary hair follicle primordia. However, Eda/Edar/NF-kappaB signaling is required to refine the pattern of Wnt/beta-catenin activity, and to maintain this activity at later stages of placode development. We show that maintenance of localized expression of Wnt10b and Wnt10a requires NF-kappaB signaling, providing a molecular explanation for the latter observation, and identify Wnt10b as a direct NF-kappaB target. These data reveal a complex interplay and interdep

In [28]:
# Print each element of the detail entry at position 5 (0-based) of the
# Unigram-ascending ordering, separated by blank lines.
print(
    *df_bc3_act.sort_values(by="Unigram", ascending=True)["Unigram_detail"].iloc[5],
    sep='\n\n'
)

BACKGROUND AND AIMS: Celiac sprue is a life-long disease characterized by an intestinal inflammatory response to dietary gluten. A gluten-free diet is an effective treatment for most patients, but accidental ingestion of gluten is common, leading to incomplete recovery or relapse. Food-grade proteases capable of detoxifying moderate quantities of dietary gluten could mitigate this problem. METHODS: We evaluated the gluten detoxification properties of two food-grade enzymes, aspergillopepsin (ASP) from Aspergillus niger and dipeptidyl peptidase IV (DPPIV) from Aspergillus oryzae. The ability of each enzyme to hydrolyze gluten was tested against synthetic gluten peptides, a recombinant gluten protein, and simulated gastric digests of whole gluten and whole-wheat bread. Reaction products were analyzed by mass spectrometry, HPLC, ELISA with a monoclonal antibody that recognizes an immunodominant gluten epitope, and a T cell proliferation assay. RESULTS: ASP markedly enhanced gluten digesti

## 6. Chemu

In [29]:
# Chemu train/test input directories.
chemu_train_dir, chemu_test_dir = (
    os.path.join("tmp", "chemu", split) for split in ("train", "test")
)

In [30]:
from chemu_gene_mention import ChemuGeneMention

In [31]:

chemurun = ChemuGeneMention()

# Train-vs-test comparison in "text" mode.
result_score, result_detail = chemurun.run_similarity_comparer(
    chemu_train_dir, chemu_test_dir, "text"
)
df_chemu_text = scores_to_df(
    scores=result_score,
    details=result_detail,
    data_set="Chemu",
    data_type="text",
    task_type="NER",
)

# Train-vs-test comparison in "entity" mode, labelled "anno" in the results.
result_score, result_detail = chemurun.run_similarity_comparer(
    chemu_train_dir, chemu_test_dir, "entity"
)
df_chemu_eval = scores_to_df(
    scores=result_score,
    details=result_detail,
    data_set="Chemu",
    data_type="anno",
    task_type="NER",
)

# Stack both result sets; the data_set column tells them apart.
df_chemu = pd.concat(
    [
        df_chemu_text,
        df_chemu_eval,
    ]
)

In [32]:
# Preview the first rows of the combined Chemu results.
df_chemu.head()

Unnamed: 0,Unigram,Bigram,Trigram,Unigram_detail,Bigram_detail,Trigram_detail,data_set,task_type
0,88.16057,73.775566,66.167284,(Example 55\nTo a suspension of 6-(2-amino-5-c...,(Example 55\nTo a suspension of 6-(2-amino-5-c...,(Example 55\nTo a suspension of 6-(2-amino-5-c...,Chemu text,NER
1,59.43823,33.936262,19.367281,"(Step 9: tert-butyl (2R,3S)-2-(2,5-difluorophe...","(Step 9: tert-butyl (2R,3S)-2-(2,5-difluorophe...","(Step 9: tert-butyl (2R,3S)-2-(2,5-difluorophe...",Chemu text,NER
2,79.425707,52.160029,46.756433,"(Example 9\n5-((6S,8R)-7-((1-fluorocyclopropyl...","(Example 9\n5-((6S,8R)-7-((1-fluorocyclopropyl...","(Example 9\n5-((6S,8R)-7-((1-fluorocyclopropyl...",Chemu text,NER
3,50.937163,13.844644,7.317617,"(3,5-Dichloro-4-(5-fluoro-2-methyl-4-oxo-1,7-n...","(3,5-Dichloro-4-(5-fluoro-2-methyl-4-oxo-1,7-n...","(3,5-Dichloro-4-(5-fluoro-2-methyl-4-oxo-1,7-n...",Chemu text,NER
4,57.138872,18.352924,5.902813,(Example 73\nPreparation of 2-chloro-3-fluoro-...,(Example 73\nPreparation of 2-chloro-3-fluoro-...,(Example 73\nPreparation of 2-chloro-3-fluoro-...,Chemu text,NER


In [33]:
# Print each element of the detail entry at position 5 (0-based) of the
# Unigram-ascending ordering, separated by blank lines.
print(
    *df_chemu_text.sort_values(by="Unigram", ascending=True)["Unigram_detail"].iloc[5],
    sep='\n\n'
)

To a solution of 5-Bromo-4-methoxy-pyridin-2-ylamine (53 g, 0.261 mol, 1.0 eq.) in EtOH:H2O=4:1 (500 mL) is added chloro-acetaldehyde (24.589 g, 0.313 mol, 1.2 eq.), then NaHCO3 (26.3 g, 0.313 mol, 1.2 eq.) is added. The resultant mixture is heated to 90° C. for 4 h. After cooling to r.t., the organic solvent is evaporated. The residue is extracted with DCM (200 mL×3). The organic layer are combined, dried over Na2SO4, filtered and concentrated. The crude product is purified by silica gel chromatography (DCM:MeOH=50:1) to afford compound 6-Bromo-7-methoxy-imidazo[1,2-a]pyridine (39 g, 66%) as a brown solid.

Step 1. N-(5-bromo-2-fluoropyridin-3-yl)ethanesulfonamide
To a solution of 5-bromo-2-fluoropyridin-3-amine (3 g, 20 mmol) in pyridine (10 mL) and DCM (20 mL) at room temperature was added ethanesulfonyl chloride (2.2 mL, 24 mmol). After stirring for 30 min, the solvent was evaporated. The resulting residue was diluted with MeOH (4 mL) and partitioned between EtOAc and brine. The or

In [34]:
# Print each element of the detail entry at position 5 (0-based) of the
# Unigram-descending ordering, separated by blank lines.
print(
    *df_chemu_text.sort_values(by="Unigram", ascending=False)["Unigram_detail"].iloc[5],
    sep='\n\n'
)

EXAMPLE 6
5-(3-methoxyphenyl)-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quinazolin-3-yl)-piperidin-1-yl]-ethyl}-1H-pyrimidine-2,4-dione (reaction scheme 1, compound 6)
Similarly to example 1.8, starting from 71 mg (0.5 mmol) of 3-methoxyphenylboronic acid and 150 mg (0.3 mmol) of 5-bromo-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quinazolin-3-yl)-piperidin-1-yl]-ethyl}-1H-pyrimidine-2,4-dione (prepared as described in example 2.5), 120 mg (77%) of 5-(3-methoxyphenyl)-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quinazolin-3-yl)-piperidin-1-yl]-ethyl}-1H-pyrimidine-2,4-dione is obtained in the form of a white solid with a melting point of 251° C.

EXAMPLE 4
5-(3-fluorophenyl)-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quinazolin-3-yl)-piperidin-1-yl]-ethyl}-1H-pyrimidine-2,4-dione (reaction scheme 1, compound 4)
Similarly to example 1.8, starting from 66 mg (0.5 mmol) of 3-fluorophenylboronic acid and 150 mg (0.3 mmol) of 5-bromo-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quina

## Summary

In [35]:
# Stack every per-data-set result frame into one table; rows stay identifiable
# through their data_set and task_type columns.
df_summary = pd.concat([df_sst2, df_aimed_unique,df_aimed_random, df_bc2_gm, df_bc3_act,df_chemu ])

In [36]:
# Summary statistics of the score columns per data_set label across all data
# sets, with the statistic names stacked into the row index.
df_summary.groupby(["data_set"]).describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AIMED (R),count,584.000000,584.000000,584.000000
AIMED (R),mean,96.947936,82.294414,73.146194
AIMED (R),std,11.921198,18.519621,21.285885
AIMED (R),min,27.498597,8.247861,0.000000
AIMED (R),25%,100.000000,77.777778,66.666667
...,...,...,...,...
SST2,min,0.000000,0.000000,0.000000
SST2,25%,36.514837,0.000000,0.000000
SST2,50%,43.643578,16.666667,0.000000
SST2,75%,53.452248,30.151134,0.000000


In [37]:
# Mean score per (task_type, data_set). numeric_only=True limits the
# aggregation to the numeric score columns: the *_detail columns hold
# non-numeric values, and pandas >= 2.0 raises on them instead of silently
# dropping them.
df_summary.groupby(["task_type", "data_set"]).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
task_type,data_set,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CLS,BC3ACT,26.760271,6.907863,1.807134
CLS,SST2,46.055815,17.378379,1.388049
NER,BC2GM anno,70.766285,19.552406,5.41172
NER,BC2GM text,33.19164,13.122176,4.200074
NER,Chemu anno,84.286901,30.670974,6.832106
NER,Chemu text,68.446277,42.391903,31.631991
REL,AIMED (R),96.947936,82.294414,73.146194
REL,AIMED (U),67.137026,36.067096,20.770509


In [38]:
# Export the per-(data_set, task_type) mean scores as a LaTeX table with two
# decimals. numeric_only=True keeps the non-numeric *_detail columns out of
# the aggregation, which pandas >= 2.0 would otherwise reject.
print(
    df_summary.groupby(["data_set", "task_type"])
    .mean(numeric_only=True)
    .to_latex(float_format="{:.2f}".format)
)

\begin{tabular}{llrrr}
\toprule
      &     &  Unigram &  Bigram &  Trigram \\
data\_set & task\_type &          &         &          \\
\midrule
AIMED (R)  & REL &    96.95 &   82.29 &    73.15 \\
AIMED (U)  & REL &    67.14 &   36.07 &    20.77 \\
BC2GM anno & NER &    70.77 &   19.55 &     5.41 \\
BC2GM text & NER &    33.19 &   13.12 &     4.20 \\
BC3ACT  & CLS &    26.76 &    6.91 &     1.81 \\
Chemu anno & NER &    84.29 &   30.67 &     6.83 \\
Chemu text & NER &    68.45 &   42.39 &    31.63 \\
SST2  & CLS &    46.06 &   17.38 &     1.39 \\
\bottomrule
\end{tabular}

