# Similarity comparison 

In [1]:
# Standard library
import os
import sys
import tempfile

# Third party
import pandas as pd

# Put ./src on the module search path; presumably that is where the
# project-local comparer modules imported in later cells live.
sys.path.append("src")

In [2]:
# Working directory for this notebook; exist_ok=True makes re-running the cell safe.
working_dir =  "tempv2"
os.makedirs(working_dir, exist_ok=True)

In [3]:
def scores_to_df(scores, details, data_set, data_type, task_type):
    """Assemble similarity scores and their details into one labelled DataFrame.

    Parameters
    ----------
    scores : dict
        Maps a column name (e.g. "Unigram") to the values for that column.
    details : dict
        Maps a name to per-row detail values; each is stored under
        "<name>_detail".
    data_set, data_type : str
        Joined with a single space to form the "data_set" label. An empty
        data_type therefore leaves a trailing space in the label.
    task_type : object
        Stored unchanged in the "task_type" column of every row.

    Returns
    -------
    pandas.DataFrame
        Score columns first, then detail columns, then "data_set" and
        "task_type".
    """
    frame = pd.DataFrame()

    # Score columns keep their names; detail columns get a "_detail" suffix.
    named_columns = list(scores.items())
    named_columns += [(name + "_detail", values) for name, values in details.items()]
    for column, values in named_columns:
        frame[column] = values

    label = data_set + " " + data_type
    frame["data_set"] = label
    frame["task_type"] = task_type

    return frame

In [4]:
# Directory that holds the input data files referenced by the cells below.
base_data_dir="tmp"

## 1. BC2GM

Train/test overlap for the [BioCreative II gene mention](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/) (BC2GM) task. Please download the train and test files for this task from the BioCreative website.


In [5]:
# Inputs passed to the "text" comparison in the BC2GM run cell.
bc2gm_train_file = os.path.join(base_data_dir, "train.in")
bc2gm_test_file = os.path.join(base_data_dir, "test.in")

# Inputs passed to the "eval" comparison in the BC2GM run cell.
bc2gm_train_eval_file = os.path.join(base_data_dir, "trainGENE.eval")
bc2gm_test_eval_file = os.path.join(base_data_dir, "testGENE.eval")

In [6]:
from bc2_gene_mention import BC2GeneMentionText

bc2gmrun = BC2GeneMentionText()

# "text" mode: compare the train and test input files.
result_score, result_detail = bc2gmrun.run_similarity_comparer(
    "text", bc2gm_train_file, bc2gm_test_file
)
df_bc2_gm_text = scores_to_df(
    result_score, result_detail, data_set="BC2GM", data_type="text", task_type="NER"
)

# "eval" mode: compare the train and test .eval files.
result_score, result_detail = bc2gmrun.run_similarity_comparer(
    "eval", bc2gm_train_eval_file, bc2gm_test_eval_file
)
df_bc2_gm_eval = scores_to_df(
    result_score, result_detail, data_set="BC2GM", data_type="anno", task_type="NER"
)

# Stack both result sets; row labels are not renumbered (ignore_index is left at its default).
df_bc2_gm = pd.concat([df_bc2_gm_text, df_bc2_gm_eval])

Exact matches Unigram, 39 / 5000
Exact matches Bigram, 26 / 5000
Exact matches Trigram, 26 / 5000
Exact matches Unigram, 1996 / 6331
Exact matches Bigram, 541 / 6331
Exact matches Trigram, 134 / 6331


In [7]:
# Peek at two random rows; the fixed seed keeps the displayed sample stable across re-runs.
df_bc2_gm.sample(n=2, random_state=0)

Unnamed: 0,Unigram,Bigram,Trigram,Unigram_detail,Bigram_detail,Trigram_detail,data_set,task_type
4229,44.72136,31.524416,18.257419,(This polypeptide includes the first three zin...,(This polypeptide includes the first three zin...,(This polypeptide includes the first three zin...,BC2GM text,NER
3052,21.821789,8.333333,0.0,(Serum amyloid A (SAA) is a plasma protein whi...,(Serum amyloid A (SAA) is a plasma protein whi...,(Serum amyloid A (SAA) is a plasma protein whi...,BC2GM text,NER


In [8]:
# Summary statistics per data_set label, reshaped so each (data_set, statistic) pair is one row.
df_bc2_gm.groupby("data_set").describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BC2GM anno,count,6331.0,6331.0,6331.0
BC2GM anno,mean,70.766285,19.552406,5.41172
BC2GM anno,std,35.975868,35.854864,20.895407
BC2GM anno,min,0.0,0.0,0.0
BC2GM anno,25%,57.735027,0.0,0.0
BC2GM anno,50%,81.649658,0.0,0.0
BC2GM anno,75%,100.0,25.819889,0.0
BC2GM anno,max,100.0,100.0,100.0
BC2GM text,count,5000.0,5000.0,5000.0
BC2GM text,mean,33.19164,13.122176,4.200074


In [9]:
# The 20 rows of the "eval" comparison with the highest Unigram score.
df_bc2_gm_eval[["Unigram", "Unigram_detail"]].sort_values("Unigram", ascending=False).head(20)

Unnamed: 0,Unigram,Unigram_detail
2708,100.0,"(RNA polymerase II, RNA polymerase II)"
4326,100.0,"(protein tyrosine kinase, protein tyrosine kin..."
6070,100.0,"(Src tyrosine kinases, src tyrosine kinases)"
6069,100.0,"(cyclin-dependent kinase 2, Cyclin-dependent k..."
4648,100.0,"(signal recognition particle, signal recogniti..."
1204,100.0,"(type III collagen, type III collagen)"
565,100.0,"(hepatitis B surface antigen, hepatitis B surf..."
2891,100.0,"(prostate-specific antigen, Prostate specific ..."
5712,100.0,"(nitric oxide synthase, nitric oxide synthase)"
1987,100.0,"(bacterial chloramphenicol acetyltransferase, ..."


In [10]:
# The 10 rows of the "eval" comparison with the lowest Unigram score.
df_bc2_gm_eval[["Unigram", "Unigram_detail"]].sort_values("Unigram", ascending=True).head(10)

Unnamed: 0,Unigram,Unigram_detail
0,0.0,"(bradykinin, alkaline phosphatases)"
4743,0.0,"(UD3, alkaline phosphatases)"
4747,0.0,"(thiolase, alkaline phosphatases)"
4749,0.0,"(TBPH, alkaline phosphatases)"
4751,0.0,"(TM4SF, alkaline phosphatases)"
4753,0.0,"(maleless, alkaline phosphatases)"
4755,0.0,"(CAM5.2, alkaline phosphatases)"
4763,0.0,"(ICP22, alkaline phosphatases)"
1572,0.0,"(FKB2, alkaline phosphatases)"
1571,0.0,"(GRP94, alkaline phosphatases)"


In [11]:
# Show full cell contents. None disables truncation; the former value -1 is
# deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Keep only sentences whose Unigram score exceeds 75. The .copy() makes this an
# independent frame, so the column assignments below do not write into a slice
# of df_bc2_gm_text.
df_bc2_gm_text_high_sim = df_bc2_gm_text.query("Unigram > 75")[["Unigram", "Unigram_detail"]].copy()

# Each detail value is indexable; this notebook labels element 0 "Test" and
# element 1 "Train" (presumably the test sentence and its matched train
# sentence — TODO confirm against the comparer).
df_bc2_gm_text_high_sim["Test"] = df_bc2_gm_text_high_sim["Unigram_detail"].apply(lambda pair: pair[0])
df_bc2_gm_text_high_sim["Train"] = df_bc2_gm_text_high_sim["Unigram_detail"].apply(lambda pair: pair[1])
df_bc2_gm_text_high_sim[["Unigram", "Train", "Test"]]

  """Entry point for launching an IPython kernel.


Unnamed: 0,Unigram,Train,Test
14,100.0,Biol.,Biol.
15,100.0,Copyright 2000 Academic Press.,Copyright 2000 Academic Press.
223,100.0,(1988) J.,1988).
255,100.0,Coronary vasoconstriction caused by endothelin-1 is enhanced by ischemia-reperfusion and by norepinephrine present in concentrations typically observed after neonatal cardiopulmonary bypass.,Coronary vasoconstriction caused by endothelin-1 is enhanced by ischemia-reperfusion and by norepinephrine present in concentrations typically observed after neonatal cardiopulmonary bypass.
416,100.0,"We also show that in fusions with the DNA binding domain of GAL4, full activity requires the entire BHV-alpha TIF, although both amino and carboxyl termini display some activity on their own.","We also show that in fusions with the DNA binding domain of GAL4, full activity requires the entire BHV-alpha TIF, although both amino and carboxyl termini display some activity on their own."
429,100.0,Biol.,Biol.
445,100.0,Acad.,Acad.
568,100.0,"A heterologous promoter construct containing three repeats of a consensus Sp1 site, cloned upstream of a single copy of the ZII (CREB/ AP1) element from the BZLF1 promoter linked to the beta-globin TATA box, exhibited phorbol ester inducibility.","A heterologous promoter construct containing three repeats of a consensus Sp1 site, cloned upstream of a single copy of the ZII (CREB/ AP1) element from the BZLF1 promoter linked to the beta-globin TATA box, exhibited phorbol ester inducibility."
650,100.0,Virol.,Virol.
748,100.0,(1995) J.,1995.


In [12]:
# Summary statistics for the sentences whose Unigram score exceeds 75.
df_bc2_gm_text_high_sim.describe()

Unnamed: 0,Unigram
count,47.0
mean,97.705854
std,6.231332
min,76.454959
25%,100.0
50%,100.0
75%,100.0
max,100.0


In [13]:
# Emit the high-similarity pairs as a LaTeX table, lowest score first,
# with scores formatted to two decimals and the row index omitted.
print(
    df_bc2_gm_text_high_sim[["Unigram", "Test", "Train"]]
    .sort_values(by="Unigram")
    .to_latex(index=False, float_format="{:.2f}".format)
)

\begin{tabular}{rll}
\toprule
 Unigram &                                                                                                                                                                                                                                                                                                        Test &                                                                                                                                                                                                                                                                                                                               Train \\
\midrule
76.45 &  Histological and immunophenotypic studies revealed 12 large cell lymphomas (11 B cell and one T cell), two small noncleaved cell lymphomas (B-cell phenotype), and five low grade B-cell lymphomas (two small lymphocytic and three follicular mixed lymphomas). &  The cases included 35 de novo diffuse aggressive lym

## 2. AIMED (Random)

In [14]:
# Single AIMed input file; both AIMed comparisons below read it.
aimed_file = os.path.join(base_data_dir, "AIMedFull_preprocessed.json")

In [15]:
from aimed_random import AIMedRandom

# The comparer receives only the one file — presumably it derives the random
# split itself (TODO confirm).
result_score, result_detail = AIMedRandom().run_similarity_comparer(aimed_file)
df_aimed_random = scores_to_df(
    result_score, result_detail, data_set="AIMED (R)", data_type="", task_type="REL"
)

In [16]:
# Summary statistics per data_set label, one row per (data_set, statistic) pair.
df_aimed_random.groupby("data_set").describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AIMED (R),count,584.0,584.0,584.0
AIMED (R),mean,96.947936,82.294414,73.146194
AIMED (R),std,11.921198,18.519621,21.285885
AIMED (R),min,27.498597,8.247861,0.0
AIMED (R),25%,100.0,77.777778,66.666667
AIMED (R),50%,100.0,86.666667,77.777778
AIMED (R),75%,100.0,93.801712,87.053315
AIMED (R),max,100.0,100.0,100.0


In [17]:
# The 5 rows with the highest Unigram score.
df_aimed_random[["Unigram", "Unigram_detail"]].sort_values("Unigram", ascending=False).head(5)

Unnamed: 0,Unigram,Unigram_detail
212,100.0,"(Using the yeast two-hybrid system, a genetic assay for studying protein-protein interactions, we have examined and compared the interaction of the PROTEIN1 receptor ( PROTEIN2 ) and the PROTEIN ( PROTEIN ) with their two known substrates PROTEIN and the PROTEIN substrate-1 ( PROTEIN )., Using the yeast two-hybrid system, a genetic assay for studying protein-protein interactions, we have examined and compared the interaction of the PROTEIN1 receptor ( PROTEIN ) and the PROTEIN2 ( PROTEIN ) with their two known substrates PROTEIN and the PROTEIN substrate-1 ( PROTEIN ).)"
478,100.0,"(These data demonstrate that PROTEIN1 but not PROTEIN2 or PROTEIN is directly recruited to the ligand-activated PROTEIN by binding to specific but redundant receptor intracellular domain sequences containing phosphotyrosine., These data demonstrate that PROTEIN1 but not PROTEIN or PROTEIN is directly recruited to the ligand-activated PROTEIN2 by binding to specific but redundant receptor intracellular domain sequences containing phosphotyrosine.)"
209,100.0,"(Previously, a mammalian protein that directly associates with PROTEIN1 -rapamycin has been identified and its encoding gene has been cloned from both human (designated PROTEIN ) [Brown, E.J., Albers, M.W., Shin, T.B., Ichikawa, K., Keith, C.T., Lane, W.S. & Schreiber, S.L. (1994) Nature (London) 369, 756-758] and rat (designated PROTEIN2 ) [Sabatini, D.M., Erdjument-Bromage, H., Lui, M., Tempst, P. & Snyder, S.H. (1994) Cell 78, 35-43]., Previously, a mammalian protein that directly associates with PROTEIN1 -rapamycin has been identified and its encoding gene has been cloned from both human (designated PROTEIN2 ) [Brown, E.J., Albers, M.W., Shin, T.B., Ichikawa, K., Keith, C.T., Lane, W.S. & Schreiber, S.L. (1994) Nature (London) 369, 756-758] and rat (designated PROTEIN ) [Sabatini, D.M., Erdjument-Bromage, H., Lui, M., Tempst, P. & Snyder, S.H. (1994) Cell 78, 35-43].)"
313,100.0,"(Thus, during PROTEIN1 -mediated suppression of cell proliferation, PROTEIN and PROTEIN2 may be important for coordinating cell-cycle progression, DNA replication and repair of damaged DNA., Thus, during PROTEIN -mediated suppression of cell proliferation, PROTEIN1 and PROTEIN2 may be important for coordinating cell-cycle progression, DNA replication and repair of damaged DNA.)"
300,100.0,"(We have found a COOH-terminal 37-kD fragment of PROTEIN1 sufficient to interact with translated PROTEIN and its homologues, suggesting that the PROTEIN2 binding site on PROTEIN occurs on a region that is conserved among the three syntrophin homologues., We have found a COOH-terminal 37-kD fragment of PROTEIN sufficient to interact with translated PROTEIN and its homologues, suggesting that the PROTEIN1 binding site on PROTEIN2 occurs on a region that is conserved among the three syntrophin homologues.)"


In [18]:
# The single row with the lowest Unigram score, shown as a Series.
df_aimed_random[["Unigram", "Unigram_detail"]].sort_values("Unigram", ascending=True).iloc[0]

Unigram           27.4986                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
Unigram_detail    (While the circulating PROTEIN1 of renal origin is acutely activated to contribute to the accomplishment of short term homeostatic reactions, tissue PROTEIN2 , in paracrine or autocrine ways, exerts local-regional adaptive actions of long duration., In the end we refer to the relations between circulating PROTEIN of renal origin and the extrarenal PROTEIN1 specific to tissue underl

## 3. AIMED (Unique Document)

In [19]:
from aimed_uniquedoc import AIMedUniqueDoc

# Same input file as the random-split comparison, run through a different comparer class.
result_score, result_detail = AIMedUniqueDoc().run_similarity_comparer(aimed_file)
df_aimed_unique = scores_to_df(
    result_score, result_detail, data_set="AIMED (U)", data_type="", task_type="REL"
)

In [20]:
# Summary statistics per data_set label, one row per (data_set, statistic) pair.
df_aimed_unique.groupby("data_set").describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AIMED (U),count,498.0,498.0,498.0
AIMED (U),mean,67.137026,36.067096,20.770509
AIMED (U),std,15.317615,19.457675,18.481003
AIMED (U),min,28.426762,0.0,0.0
AIMED (U),25%,53.935989,21.629523,8.679295
AIMED (U),50%,68.640647,31.622777,14.629372
AIMED (U),75%,78.920519,44.72136,26.726124
AIMED (U),max,87.017654,80.489529,71.632286


In [21]:
# Detail value of the row with the highest Unigram score.
df_aimed_unique.sort_values("Unigram", ascending=False)["Unigram_detail"].iloc[0]

('Using antibody-mediated immunofluorescence copatching of epitope-tagged receptors, we provide evidence in live cells for preexisting heteromeric ( PROTEIN / PROTEIN and PROTEIN / PROTEIN1 ) and homomeric ( PROTEIN / PROTEIN , PROTEIN / PROTEIN , PROTEIN / PROTEIN , and also PROTEIN2 / PROTEIN ) oligomers in the absence of ligand.',
 ' PROTEIN1 is a potent, tight-binding inhibitor of Cdks and can inhibit the phosphorylation of PROTEIN by PROTEIN - PROTEIN , PROTEIN - PROTEIN , PROTEIN2 - PROTEIN , and PROTEIN - PROTEIN complexes.')

In [22]:
# Detail value of the row with the lowest Unigram score.
df_aimed_unique.sort_values("Unigram", ascending=True)["Unigram_detail"].iloc[0]

('In contrast, in normal nasal tissues, PROTEIN1 labeling was only found in the vascular wall, and the expression was weaker--a finding demonstrating that PROTEIN2 is upregulated in nasal polyps.',
 'We found PROTEIN1 mRNA in several peripheral tissues, and detected PROTEIN2 protein on cultured vascular endothelial cells.')

## 4. SST2 Dataset

In [23]:

# Train/test files passed to the SST2 comparer in the next code cell.
sst2_train_file = os.path.join(base_data_dir, "train.tsv")
sst2_test_file = os.path.join(base_data_dir, "test.tsv")
# NOTE(review): the two paths below are not referenced by any cell visible in this part of the notebook.
sst2_sentiment_labels_file  =  os.path.join(base_data_dir, "sentiment_labels.txt")
sst2_dictionary_file  =  os.path.join(base_data_dir, "dictionary.txt") 



In [24]:
from sst2_dataset import SST2Dataset

# Compare the SST2 train and test files and label the result rows.
result_score, result_detail = SST2Dataset().run_similarity_comparer(
    sst2_train_file, sst2_test_file
)
df_sst2 = scores_to_df(
    result_score, result_detail, data_set="SST2", data_type="", task_type="CLS"
)

In [25]:
# Summary statistics per data_set label, one row per (data_set, statistic) pair.
df_sst2.groupby("data_set").describe().stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SST2,count,1821.0,1821.0,1821.0
SST2,mean,46.055815,17.378379,1.388049
SST2,std,14.067934,18.009431,6.733712
SST2,min,0.0,0.0,0.0
SST2,25%,36.514837,0.0,0.0
SST2,50%,43.643578,16.666667,0.0
SST2,75%,53.452248,30.151134,0.0
SST2,max,100.0,100.0,70.710678


In [26]:
# The 30 rows with the highest Unigram score.
df_sst2[["Unigram", "Unigram_detail"]].sort_values("Unigram", ascending=False).head(30)

Unnamed: 0,Unigram,Unigram_detail
462,100.0,"(now , if it only had a brain ., the brain )"
1392,100.0,"(go see it and enjoy ., enjoy this )"
1502,100.0,"(is n't it great ?, great )"
47,100.0,"(how did it ever get made ?, did they not )"
138,100.0,"(i loved it !, loved )"
703,100.0,"(still , i thought it could have been more ., well-thought )"
391,100.0,"(but he somehow pulls it off ., pulls off enough )"
972,100.0,"(it 's painful ., 's that painful . )"
937,100.0,"(it 's laughing at us ., laughing )"
776,100.0,"(she must have a very strong back ., strong as always )"


In [27]:
# Print the score and detail of the row at position 9 when sorted by ascending
# Unigram score (i.e. the 10th-lowest), separated by a blank line.
# NOTE(review): why position 9 was chosen is not evident from the notebook.
print(*df_sst2[["Unigram", "Unigram_detail"]].sort_values("Unigram", ascending=True).iloc[9], sep='\n\n')

21.82178902359924

("herzog is obviously looking for a moral to his fable , but the notion that a strong , unified showing among germany and eastern european jews might have changed 20th-century history is undermined by ahola 's inadequate performance .", 'of the unsung heroes of 20th century ')


## 5. BC3 Article classification

In [28]:
# Train/test record files passed to the BC3 comparer in the run cell below.
bc3_act_train_file = os.path.join(base_data_dir, "bc3_act_all_records.tsv")
bc3_act_test_file = os.path.join(base_data_dir, "bc3_act_all_records_test.tsv")

# NOTE(review): the two gold-standard paths below are not referenced by any cell visible in this part of the notebook.
bc3_act_test_eval_file = os.path.join(base_data_dir, "bc3_act_gold_standard_test.tsv")

bc3_act_train_eval_file = os.path.join(base_data_dir, "bc3_act_gold_standard.tsv")



In [29]:
from bc3_article_classification import BC3ArticleClassification
# Comparer instance for the BC3 article-classification data; used by the next code cell.
bc3actrun = BC3ArticleClassification()

In [30]:


# Compare the BC3 train and test record files and label the result rows.
result_score, result_detail = bc3actrun.run_similarity_comparer(
    bc3_act_train_file, bc3_act_test_file
)
df_bc3_act = scores_to_df(
    result_score, result_detail, data_set="BC3ACT", data_type="", task_type="CLS"
)

In [31]:
# Summary statistics of the numeric score columns for the BC3 comparison.
df_bc3_act.describe()

Unnamed: 0,Unigram,Bigram,Trigram
count,6000.0,6000.0,6000.0
mean,26.760271,6.907863,1.807134
std,9.325118,5.474373,1.728708
min,6.27761,0.0,0.0
25%,20.094071,3.474651,0.835709
50%,25.677218,5.298225,1.476668
75%,31.914209,8.530364,2.352489
max,75.011317,51.197969,18.973666


In [32]:
# The 10 rows with the lowest Unigram score.
df_bc3_act[["Unigram", "Unigram_detail"]].sort_values("Unigram", ascending=True).head(10)

Unnamed: 0,Unigram,Unigram_detail
1819,6.27761,"(Pollination services underpin sustainability of restored ecosystems. Yet, outside of agri-environments, effective restoration of pollinator services in ecological restoration has received little attention. This deficiency in the knowledge needed to restore pollinator capability represents a major liability in restoration programs, particularly in regions where specialist invertebrate and vertebrate pollinators exist, such as global biodiversity hotspots. When compounded with the likely negative impacts of climate change on pollination services, the need to understand and manage pollinator services in restoration becomes paramount., Fhit protein is lost in most cancers, its restoration suppresses tumorigenicity, and virus-mediated FHIT gene therapy induces apoptosis and suppresses tumors in preclinical models. We have used protein cross-linking and proteomics methods to characterize a Fhit protein complex involved in triggering Fhit-mediated apoptosis. The complex includes Hsp60 and Hsp10 that mediate Fhit stability and may affect import into mitochondria, where it interacts with ferredoxin reductase, responsible for transferring electrons from NADPH to cytochrome P450 via ferredoxin. Viral-mediated Fhit restoration increases production of intracellular reactive oxygen species, followed by increased apoptosis of lung cancer cells under oxidative stress conditions; conversely, Fhit-negative cells escape apoptosis, carrying serious oxidative DNA damage that may contribute to an increased mutation rate. Characterization of Fhit interacting proteins has identified direct effectors of the Fhit-mediated apoptotic pathway that is lost in most cancers through loss of Fhit.)"
5631,6.507914,"(The Waorani may have the highest rate of homicide of any society known to anthropology. We interviewed 121 Waorani elders of both sexes to obtain genealogical information and recollections of raids in which they and their relatives participated. We also obtained complete raiding histories of 95 warriors. An analysis of the raiding histories, marital trajectories, and reproductive histories of these men reveals that more aggressive warriors have lower indices of reproductive success than their milder brethren. This result contrasts the findings of Chagnon [Chagnon N (1988) Science 239:985-992] for the Yanomamo. We suggest that the spacing of revenge raids may be involved in the explanation of why the consequences of aggressiveness differ between these 2 warlike lowland South American peoples., Until now, Rho proteins were known as GTPases involved in cell polarity and morphogenesis. In a recent issue of Cell, Espinosa and coworkers show that RhoBTB3, a member of this family, is an ATPase involved in endosome-to-Golgi transport.)"
3184,6.929785,"(We compile over 270 wildlife counts of Kenya's wildlife populations conducted over the last 30 years to compare trends in national parks and reserves with adjacent ecosystems and country-wide trends. The study shows the importance of discriminating human-induced changes from natural population oscillations related to rainfall and ecological factors. National park and reserve populations have declined sharply over the last 30 years, at a rate similar to non-protected areas and country-wide trends. The protected area losses reflect in part their poor coverage of seasonal ungulate migrations. The losses vary among parks. The largest parks, Tsavo East, Tsavo West and Meru, account for a disproportionate share of the losses due to habitat change and the difficulty of protecting large remote parks. The losses in Kenya's parks add to growing evidence for wildlife declines inside as well as outside African parks. The losses point to the need to quantify the performance of conservation policies and promote integrated landscape practices that combine parks with private and community-based measures., The recent identification of copy-number variation in the human genome has opened up new avenues for the discovery of positional candidate genes underlying complex genetic disorders, especially in the field of psychiatric disease. One major challenge that remains is pinpointing the susceptibility genes in the multitude of disease-associated loci. This challenge may be tackled by reconstruction of functional gene-networks from the genes residing in these loci. We applied this approach to autism spectrum disorder (ASD), and identified the copy-number changes in the DNA of 105 ASD patients and 267 healthy individuals with Illumina Humanhap300 Beadchips. Subsequently, we used a human reconstructed gene-network, Prioritizer, to rank candidate genes in the segmental gains and losses in our autism cohort. 
This analysis highlighted several candidate genes already known to be mutated in cognitive and neuropsychiatric disorders, including RAI1, BRD1, and LARGE. In addition, the LARGE gene was part of a sub-network of seven genes functioning in glycobiology, present in seven copy-number changes specifically identified in autism patients with limited co-morbidity. Three of these seven copy-number changes were de novo in the patients. In autism patients with a complex phenotype and healthy controls no such sub-network was identified. An independent systematic analysis of 13 published autism susceptibility loci supports the involvement of genes related to glycobiology as we also identified the same or similar genes from those loci. Our findings suggest that the occurrence of genomic gains and losses of genes associated with glycobiology are important contributors to the development of ASD.)"
4840,7.267012,"(The president of the Venezuelan Academy of Physical, Mathematical, and Natural Sciences outlines actions of the Venezuelan government that are interfering with independent scientific research in the country., Memory suppression is investigated with the no-think paradigm, which produces forgetting following repeated practice of not thinking about a memory [Anderson MC, Green C (2001) Nature 410:366-369]. Because the forgotten item is not retrieved even when tested with an independent, semantically related cue, it has been assumed that this forgetting is due to an inhibition process. However, this conclusion is based on a single stage to recall, whereas global memory models, which produce forgetting through a process of interference, include both a sampling and a recovery stage to recall. By assuming that interference exists during recovery, these models can explain cue-independent forgetting. We tested several predictions of this interference explanation of cue-independent forgetting by modifying the think/no-think paradigm. We added a condition where participants quickly pressed enter rather than not thinking. We also manipulated initial memory strength and tested recognition memory. Most importantly, learning to quickly press enter produced as much cue-independent forgetting as no-think instructions. Demonstrating the adequacy of two-stage recall, a simple computational model (SAM-RI) simultaneously captured the original cue, independent cue, and recognition results.)"
5787,7.674278,"(Several cellular chaperones have been shown to affect the propagation of the yeast prions [PSI(+)], [PIN(+)] and [URE3]. Ssa1 and Ssa2 are Hsp70 family chaperones that generally cause pro-[PSI(+)] effects, since dominant-negative mutants of Ssa1 or Ssa2 cure [PSI(+)], and overexpression of Ssa1 enhances de novo [PSI(+)] appearance and prevents curing by excess Hsp104. In contrast, Ssa1 was shown to have anti-[URE3] effects, since overexpression of Ssa1 cures [URE3]. Here we show that excess Ssa1 or Ssa2 can also cure [PSI(+)]. This curing is enhanced in the presence of [PIN(+)]. During curing, Sup35-GFP fluorescent aggregates get bigger and fewer in number, which leads to their being diluted out during cell division, a phenotype that was also observed during the curing of [PSI(+)] by certain variants of [PIN(+)]. The sizes of the detergent-resistant [PSI(+)] prion oligomers increase during [PSI(+)] curing by excess Ssa1. Excess Ssa1 likewise leads to an increase in oligomer sizes of low, medium and very high [PIN(+)] variants. While these phenotypes are also caused by inhibition of Hsp104 or Sis1, the overexpression of Ssa1 did not cause any change in Hsp104 or Sis1 levels., High NaCl rapidly activates p38 MAPK by phosphorylating it, the phosphorylation presumably being regulated by a balance of kinases and phosphatases. Kinases are known, but the phosphatases are uncertain. Our initial purpose was to identify the phosphatases. We find that in HEK293 cells transient overexpression of MAPK phosphatase-1 (MKP-1), a dual-specificity phosphatase, inhibits high NaCl-induced phosphorylation of p38, and that overexpression of a dominant negative mutant of MKP-1 does the opposite. High NaCl lowers MKP-1 activity by increasing reactive oxygen species, which directly inhibit MKP-1, and by reducing binding of MKP-1 to p38. 
Because inhibition of p38 is reported to reduce hypertonicity-induced activation of the osmoprotective transcription factor, TonEBP/OREBP, we anticipated that MKP-1 expression might also. However, overexpression of MKP-1 has no significant effect on Ton EBP/OREBP activity. This paradox is explained by opposing effects of p38alpha and p38delta, both of which are activated by high NaCl and inhibited by MKP-1. Thus, we find that overexpression of p38alpha increases high NaCl-induced TonEBP/OREBP activity, but overexpression of p38delta reduces it. Also, siRNA-mediated knockdown of p38delta enhances the activation of TonEBP/OREBP. We conclude that high NaCl inhibits MKP-1, which contributes to the activation of p38. However, opposing actions of p38alpha and p38delta negate any effect on TonEBP/OREBP activity. Thus, activation of p38 isoforms by hypertonicity does not contribute to activation of TonEBP/OREBP because of opposing effects of p38alpha and p38delta, and effects of inhibitors of p38 depend on which isoform is affected, which can be misleading.)"
2382,7.840151,"(BACKGROUND AND AIMS: Celiac sprue is a life-long disease characterized by an intestinal inflammatory response to dietary gluten. A gluten-free diet is an effective treatment for most patients, but accidental ingestion of gluten is common, leading to incomplete recovery or relapse. Food-grade proteases capable of detoxifying moderate quantities of dietary gluten could mitigate this problem. METHODS: We evaluated the gluten detoxification properties of two food-grade enzymes, aspergillopepsin (ASP) from Aspergillus niger and dipeptidyl peptidase IV (DPPIV) from Aspergillus oryzae. The ability of each enzyme to hydrolyze gluten was tested against synthetic gluten peptides, a recombinant gluten protein, and simulated gastric digests of whole gluten and whole-wheat bread. Reaction products were analyzed by mass spectrometry, HPLC, ELISA with a monoclonal antibody that recognizes an immunodominant gluten epitope, and a T cell proliferation assay. RESULTS: ASP markedly enhanced gluten digestion relative to pepsin, and cleaved recombinant alpha2-gliadin at multiple sites in a non-specific manner. When used alone, neither ASP nor DPPIV efficiently cleaved synthetic immunotoxic gluten peptides. This lack of specificity for gluten was especially evident in the presence of casein, a competing dietary protein. However, supplementation of ASP with DPPIV enabled detoxification of moderate amounts of gluten in the presence of excess casein and in whole-wheat bread. ASP was also effective at enhancing the gluten-detoxifying efficacy of cysteine endoprotease EP-B2 under simulated gastric conditions. CONCLUSIONS: Clinical studies are warranted to evaluate whether a fixed dose ratio combination of ASP and DPPIV can provide near-term relief for celiac patients suffering from inadvertent gluten exposure. 
Due to its markedly greater hydrolytic activity against gluten than endogenous pepsin, food-grade ASP may also augment the activity of therapeutically relevant doses of glutenases such as EP-B2 and certain prolyl endopeptidases., AIM: To determine the prevalence of a new set of anti-glycan and anti-outer membrane protein (anti-OMP) antibodies in a Hungarian cohort of adult Celiac disease (CD) patients. METHODS: 190 consecutive CD patients [M/F: 71/119, age:39.9 (SD:14.1) years], 100 healthy, and 48 gastrointestinal controls were tested for glycan anti-Saccharomyces cerevisiae (gASCA), anti-laminaribioside (ALCA), anti-chitobioside, anti-mannobioside, anti-OMP antibodies and major NOD2/CARD15 mutations. Thirty out of 82 CD patients enrolled at the time of diagnosis were re-evaluated for the same antibodies after longstanding gluten-free diet (GFD). RESULTS: 65.9% of the CD patients were positive for at least one of the tested antibodies at the time of the diagnosis. Except anti-OMP and ALCA, anti-microbial antibodies were exclusively seen in untreated CD; however, the overall sensitivity was low. Any glycan positivity (LR+: 3.13; 95% CI: 2.08-4.73) was associated with an increased likelihood ratio for diagnosing CD. Significant correlation was found between the levels of anti-glycan and anti-endomysial or anti-transglutaminase antibodies. Anti-glycan positivity was lost after longstanding GFD. Anti-glycan antibody titers were associated with symptoms at presentation, but not the presence of NOD2/CARD15 mutations. Patients with severe malabsorption more frequently had multiple antibodies at diagnosis (P = 0.019). CONCLUSION: The presence of anti-glycan antibodies in CD seems to be secondary to the impaired small bowel mucosa which can lead to increased antigen presentation. Furthermore, anti-glycan positivity may be considered an additional marker of CD and dietary adherence.)"
2941,7.9867,"(The synthesis and photophysical evaluation of two enatiomerially pure dimetallic lanthanide luminescent triple-stranded helicates is described. The two systems, formed from the chiral (R,R) ligand 1 and (S,S) ligand 2, were produced as single species in solution, where the excitation of either the naphthalene antennae or the pyridiyl units gave rise to Eu(III) emission in a variety of solvents. Excitation of the antennae also gave rise to circularly polarized Eu(III) luminescence emissions for Eu(2):1(3) and Eu(2):2(3) that were of equal intensity and opposite sign, confirming their enantiomeric nature in solution providing a basis upon which we were able to assign the absolute configurations of Eu(2):1(3) and Eu(2):2(3)., Toll-like receptors (TLRs) initiate immune responses by recognizing pathogen-associated molecules, but the molecular basis for recognition is poorly understood. In particular, it is unclear how receptor-ligand interactions lead to the initiation of downstream signaling. Here, we describe the mechanism by which TLR3 recognizes its ligand, double-stranded RNA (dsRNA), and forms an active signaling complex. We show that dsRNA binds saturably, specifically, and reversibly to a defined ligand-binding site (or sites) on the TLR3 ectodomain (TLR3ecd). Binding affinities increase with both buffer acidity and ligand size. Purified TLR3ecd protein is exclusively monomeric in solution, but through a highly cooperative process, it forms dimers when bound to dsRNA, and multiple TLR3ecd dimers bind to long dsRNA strands. The smallest dsRNA oligonucleotides that form stable complexes with TLR3ecd (40-50 bp) each bind one TLR3ecd dimer, and these are also the smallest oligonucleotides that efficiently activate TLR3 in cells. We conclude that TLR3 assembles on dsRNA as stable dimers and that the minimal signaling unit is one TLR3 dimer.)"
3775,8.125283,"(The Mori-Zwanzig formalism is an effective tool to derive differential equations describing the evolution of a small number of resolved variables. In this paper we present its application to the derivation of generalized Langevin equations and generalized non-Markovian Fokker-Planck equations. We show how long time scales rates and metastable basins can be extracted from these equations. Numerical algorithms are proposed to discretize these equations. An important aspect is the numerical solution of the orthogonal dynamics equation which is a partial differential equation in a high dimensional space. We propose efficient numerical methods to solve this orthogonal dynamics equation. In addition, we present a projection formalism of the Mori-Zwanzig type that is applicable to discrete maps. Numerical applications are presented from the field of Hamiltonian systems., A new approach for the acquisition of static, wideline (14)N NMR powder patterns is outlined. The method involves the use of frequency-swept pulses which serve two simultaneous functions: (1) broad-band excitation of magnetization and (2) signal enhancement via population transfer. The signal enhancement mechanism is described using numerical simulations and confirmed experimentally. This approach, which we call DEISM (Direct Enhancement of Integer Spin Magnetization), allows high-quality (14)N spectra to be acquired at intermediate field strengths in an uncomplicated way and in a fraction of the time required for previously reported methods.)"
2948,8.193083,"(A cavernous hemangioma of the cecum is a rare vascular malformation but is clinically important because of the possibility of massive bleeding. We report a case of a large cavernous hemangioma with pericolic infiltration in the cecum which was removed successfully using minimally invasive surgery., Hypertension, a major cardiovascular risk factor and cause of mortality worldwide, is thought to arise from primary renal abnormalities. However, the etiology of most cases of hypertension remains unexplained. Vascular tone, an important determinant of blood pressure, is regulated by nitric oxide, which causes vascular relaxation by increasing intracellular cGMP and activating cGMP-dependent protein kinase I (PKGI). Here we show that mice with a selective mutation in the N-terminal protein interaction domain of PKGIalpha display inherited vascular smooth muscle cell abnormalities of contraction, abnormal relaxation of large and resistance blood vessels, and increased systemic blood pressure. Renal function studies and responses to changes in dietary sodium in the PKGIalpha mutant mice are normal. These data reveal that PKGIalpha is required for normal VSMC physiology and support the idea that high blood pressure can arise from a primary abnormality of vascular smooth muscle cell contractile regulation, suggesting a new approach to the diagnosis and therapy of hypertension and cardiovascular diseases.)"
1036,8.277087,"(BACKGROUND: Lévy flights are random walks, the step lengths of which come from probability distributions with heavy power-law tails, such that clusters of short steps are connected by rare long steps. Lévy walks maximise search efficiency of mobile foragers. Recently, several studies raised some concerns about the reliability of the statistical analysis used in previous analyses. Further, it is unclear whether Lévy walks represent adaptive strategies or emergent properties determined by the interaction between foragers and resource distribution. Thus two fundamental questions still need to be addressed: the presence of Lévy walks in the wild and whether or not they represent a form of adaptive behaviour. METHODOLOGY/PRINCIPAL FINDINGS: We studied 235 paths of solitary and clustered (i.e. foraging in group) fallow deer (Dama dama), exploiting the same pasture. We used maximum likelihood estimation for discriminating between a power-tailed distribution and the exponential alternative and rank/frequency plots to discriminate between Lévy walks and composite Brownian walks. We showed that solitary deer perform Lévy searches, while clustered animals did not adopt that strategy. CONCLUSION/SIGNIFICANCE: Our demonstration of the presence of Lévy walks is, at our knowledge, the first available which adopts up-to-date statistical methodologies in a terrestrial mammal. Comparing solitary and clustered deer, we concluded that the Lévy walks of solitary deer represent an adaptation maximising encounter rates with forage resources and not an epiphenomenon induced by a peculiar food distribution., BACKGROUND: A better understanding of the size and abundance of open reading frames (ORFS) in whole genomes may shed light on the factors that control genome complexity. Here we examine the statistical distributions of open reading frames (i.e. distribution of start and stop codons) in the fully sequenced genomes of 297 prokaryotes, and 14 eukaryotes. 
METHODOLOGY/PRINCIPAL FINDINGS: By fitting mixture models to data from whole genome sequences we show that the size-frequency distributions for ORFS are strikingly similar across prokaryotic and eukaryotic genomes. Moreover, we show that i) a large fraction (60-80%) of ORF size-frequency distributions can be predicted a priori with a stochastic assembly model based on GC content, and that (ii) size-frequency distributions of the remaining &quot;non-random&quot; ORFs are well-fitted by log-normal or gamma distributions, and similar to the size distributions of annotated proteins. CONCLUSIONS/SIGNIFICANCE: Our findings suggest stochastic processes have played a primary role in the evolution of genome complexity, and that common processes govern the conservation and loss of functional genomics units in both prokaryotes and eukaryotes.)"


In [33]:
# Ten rows of df_bc3_act with the highest Unigram score, keeping only the
# score column and its matching detail column for display.
(
    df_bc3_act[["Unigram", "Unigram_detail"]]
    .sort_values("Unigram", ascending=False)
    .head(10)
)

Unnamed: 0,Unigram,Unigram_detail
5630,75.011317,"(The development of T helper (T(H))17 and regulatory T (T(reg)) cells is reciprocally regulated by cytokines. Transforming growth factor (TGF)-beta alone induces FoxP3(+) T(reg) cells, but together with IL-6 or IL-21 induces T(H)17 cells. Here we demonstrate that IL-9 is a key molecule that affects differentiation of T(H)17 cells and T(reg) function. IL-9 predominantly produced by T(H)17 cells, synergizes with TGF-beta1 to differentiate naïve CD4(+) T cells into T(H)17 cells, while IL-9 secretion by T(H)17 cells is regulated by IL-23. Interestingly, IL-9 enhances the suppressive functions of FoxP3(+) CD4(+) T(reg) cells in vitro, and absence of IL-9 signaling weakens the suppressive activity of nT(regs) in vivo, leading to an increase in effector cells and worsening of experimental autoimmune encephalomyelitis. The mechanism of IL-9 effects on T(H)17 and T(regs) is through activation of STAT3 and STAT5 signaling. Our findings highlight a role of IL-9 as a regulator of pathogenic versus protective mechanisms of immune responses., Th17 cells, CD4(+) T cells that secrete interleukin-17 (IL-17), are pathogenic in autoimmune diseases and their development and expansion is driven by the cytokines IL-6, TGF-beta, IL-21, IL-1, and IL-23. However, there are also innate sources of IL-17. Here, we show that gammadelta T cells express IL-23R and the transcription factor RORgammat and produce IL-17, IL-21, and IL-22 in response to IL-1beta and IL-23, without T cell receptor engagement. IL-17-producing gammadelta T cells were found at high frequency in the brain of mice with experimental autoimmune encephalomyelitis (EAE). gammadelta T cells activated by IL-1beta and IL-23 promoted IL-17 production by CD4(+) T cells and increased susceptibility to EAE, suggesting that gammadelta T cells act in an amplification loop for IL-17 production by Th17 cells. 
Our findings demonstrate that gammadelta T cells activated by IL-1beta and IL-23 are an important source of innate IL-17 and IL-21 and provide an alternative mechanism whereby IL-1 and IL-23 may mediate autoimmune inflammation.)"
3873,74.535183,"(CD1d-restricted invariant NKT (iNKT) cells play crucial roles in various types of immune responses, including autoimmune diseases, infectious diseases and tumor surveillance. The mechanisms underlying their adjuvant functions are well understood. Nevertheless, although IL-4 and IL-10 production characterize iNKT cells able to prevent or ameliorate some autoimmune diseases and inflammatory conditions, the precise mechanisms by which iNKT cells exert immune regulatory function remain elusive. This study demonstrates that the activation of human iNKT cells by their specific ligand alpha-galactosylceramide enhances IL-12p70 while inhibiting the IL-23 production by monocyte-derived dendritic cells, and in turn down-regulating the IL-17 production by memory CD4(+) Th cells. The ability of the iNKT cells to regulate the differential production of IL-12p70/IL-23 is mainly mediated by a remarkable hallmark of their function to produce both Th1 and Th2 cytokines. In particular, the down-regulation of IL-23 is markedly associated with a production of IL-4 and IL-10 from iNKT cells. Moreover, Th2 cytokines, such as IL-4 and IL-13 play a crucial role in defining the biased production of IL-12p70/IL-23 by enhancement of IL-12p70 in synergy with IFN-gamma, whereas inhibition of the IFN-gamma-promoted IL-23 production. Collectively, the results suggest that iNKT cells modify the IL-12p70/IL-23 balance to enhance the IL-12p70-induced cell-mediated immunity and suppress the IL-23-dependent inflammatory pathologies. These results may account for the long-appreciated contrasting beneficial and adverse consequence of ligand activation of iNKT cells., Th17 cells, CD4(+) T cells that secrete interleukin-17 (IL-17), are pathogenic in autoimmune diseases and their development and expansion is driven by the cytokines IL-6, TGF-beta, IL-21, IL-1, and IL-23. However, there are also innate sources of IL-17. 
Here, we show that gammadelta T cells express IL-23R and the transcription factor RORgammat and produce IL-17, IL-21, and IL-22 in response to IL-1beta and IL-23, without T cell receptor engagement. IL-17-producing gammadelta T cells were found at high frequency in the brain of mice with experimental autoimmune encephalomyelitis (EAE). gammadelta T cells activated by IL-1beta and IL-23 promoted IL-17 production by CD4(+) T cells and increased susceptibility to EAE, suggesting that gammadelta T cells act in an amplification loop for IL-17 production by Th17 cells. Our findings demonstrate that gammadelta T cells activated by IL-1beta and IL-23 are an important source of innate IL-17 and IL-21 and provide an alternative mechanism whereby IL-1 and IL-23 may mediate autoimmune inflammation.)"
2623,74.01154,"(IL (interleukin)-4 and IL-13 are key cytokines in the pathogenesis of allergic inflammatory disease. IL-4 and IL-13 share many functional properties as a result of their utilization of a common receptor complex comprising IL-13Ralpha1 (IL-13 receptor alpha-chain 1) and IL-4Ralpha. The second IL-13R (IL-13 receptor) has been identified, namely IL-13Ralpha2. This has been thought to be a decoy receptor due to its short cytoplasmic tail and its high binding affinity for IL-13 but not IL-4. IL-13Ralpha2 exists on the cell membrane, intracellularly and in a soluble form. Recent reports revealed that membrane IL-13Ralpha2 may have some signalling capabilities, and a soluble form of IL-13Ralpha2 can be generated in the presence of environmental allergens such as DerP. Interestingly, IL-13Ralpha2 has also been shown to regulate both IL-13 and IL-4 response in primary airway cells, despite the fact that IL-13Ralpha2 does not bind IL-4. The regulator mechanism is still unclear but the physical association of IL-13Ralpha2 with IL-4Ralpha appears to be a key regulatory step. These results suggest that the cytoplasmic tail of IL-13Ralpha2 may interfere with the association or activation of signalling molecules, such as JAK1 (Janus kinase 1), on IL-4Ralpha and thus prevents downstream signal cascade. The receptor has more complicated functions than a simple decoy receptor. In this review, we discuss newly revealed functions of IL-13Ralpha2., Lack of the IL-1 receptor accessory protein (IL-1RAcP) abrogates responses to IL-33 and IL-1 in the mouse thymoma clone EL-4 D6/76 cells. Reconstitution with full-length IL-1RAcP is sufficient to restore responsiveness to IL-33 and IL-1. IL-33 activates IL-1 receptor-associated kinase-1, cJun-N-terminal kinase, and the NF-kappaB pathway in an IL-1RAcP-dependent manner and results in IL-2 release. 
IL-33 is able to induce the release of proinflammatory cytokines in bone marrow-derived (BMD) mast cells, indicating that IL-33 may have a proinflammatory potential like its relatives IL-1 and IL-18, in addition to its Th2-skewing properties in the adaptive response described previously. Blocking of murine IL-1RAcP with the neutralizing antibody 4C5 inhibits response of mouse thymoma cells and BMD mast cells to IL-33. The interaction of either membrane-bound or soluble forms of IL-1RAcP and IL-33Ralpha-chain depends on the presence of IL-33, as demonstrated by coimmunoprecipitation assays. These data demonstrate that IL-1RAcP is indispensable for IL-33 signaling. Furthermore, they suggest that IL-1RAcP is used by more than one alpha-chain of the IL-1 receptor family and thus may resemble a common beta-chain of that family.)"
4000,71.268446,"(MC2 (ACTH) receptors require MC2 receptor accessory protein (MRAP) to reach the cell surface. In this study, we show that MRAP has the opposite effect on the closely related MC5 receptor. In enzyme-linked immunosorbent assay and microscopy experiments, MC2 receptor was retained in the endoplasmic reticulum in the absence of MRAP and targeted to the plasma membrane with MRAP. MC5 receptor was at the plasma membrane in the absence of MRAP, but trapped intracellularly when expressed with MRAP. Using bimolecular fluorescence complementation, where one fragment of yellow fluorescent protein (YFP) was fused to receptors and another to MRAP, we showed that MC2 receptor-MRAP dimers were present at the plasma membrane, whereas MC5 receptor-MRAP dimers were intracellular. Both MC2 and MC5 receptors co-precipitated with MRAP. MRAP did not alter expression of beta2-adrenergic receptors or co-precipitate with them. To determine if MRAP affects formation of receptor oligomers, we co-expressed MC2 receptors fused to YFP fragments in the presence or absence of MRAP. YFP fluorescence, reporting MC2 receptor homodimers, was readily detectable with or without MRAP. In contrast, MC5 receptor homodimers were visible in the absence of MRAP, but little fluorescence was observed by microscopic analysis when MRAP was co-expressed. Co-precipitation of differentially tagged receptors confirmed that MRAP blocks MC5 receptor dimerization. The regions of MRAP required for its effects on MC2 and MC5 receptors differed. These results establish that MRAP forms stable complexes with two different melanocortin receptors, facilitating surface expression of MC2 receptor but disrupting dimerization and surface localization of MC5 receptor., The melanocortin-2 (MC2) receptor accessory protein (MRAP) is required for trafficking of the G protein-coupled MC2 receptor to the plasma membrane. The mechanism of action and structure of MRAP, which has a single transmembrane domain, are unknown. 
Here, we show that MRAP displays a previously uncharacterized topology. Epitopes on both the N- and C-terminal ends of MRAP were localized on the external face of CHO cells at comparable levels. Using antibodies raised against N- and C-terminal MRAP peptides, we demonstrated that both ends of endogenous MRAP face the outside in adrenal cells. Nearly half of MRAP was glycosylated at the single endogenous N-terminal glycosylation site, and over half was glycosylated when the natural glycosylation site was replaced by one in the C-terminal domain. A mutant MRAP with potential glycosylation sites on both sides of the membrane was singly but not doubly glycosylated, suggesting that MRAP is not monotopic. Coimmunoprecipitation of differentially tagged MRAPs established that MRAP is a dimer. By selectively immunoprecipitating cell surface MRAP in one or the other orientation, we showed that MRAP homodimers are antiparallel and form a stable complex with MC2 receptor. In the absence of MRAP, MC2 receptor was trapped in the endoplasmic reticulum, but with MRAP, the MC2 receptor was glycosylated and localized on the plasma membrane, where it signaled in response to ACTH. MRAP acted specifically, because it did not increase surface expression of other melanocortin, beta2-adrenergic, or TSH-releasing hormone receptors. MRAP is the first eukaryotic membrane protein identified with an antiparallel homodimeric structure.)"
4370,69.897546,"(Using biopsy specimens from patients with B-cell non-Hodgkin's lymphoma, we observed a significantly low frequency of T(H)17 cells, including several samples with no detectable amount of interleukin (IL)-17-producing cells present in the tumor microenvironment. We found that, in the absence of lymphoma B cells, treatment with IL-1beta/IL-6 or lipopolysaccharide (LPS) enhanced IL-17 expression in CD4(+) T cells and this enhancement was attenuated when CD4(+) T cells were cocultured with lymphoma B cells. Blockade of CD27-CD70 or CD28-CD80/86 interactions by anti-CD70 or anti-CD80/86 antibodies restored LPS-mediated induction of IL-17 expression in CD4(+) T cells cocultured with lymphoma B cells. Because a subset of lymphoma B cells express IL-2 and given that IL-2 signaling is critically important in the development of regulatory T (T(reg)) cells, we tested the role of IL-2 signaling in T(H)17 cell development. We found that treatment with anti-IL-2 antibody to interrupt IL-2 signaling significantly inhibited Foxp3 expression in CD4(+) T cells. In contrast, interruption of IL-2 signaling up-regulated IL-17 expression in CD4(+) T cells and restored lymphoma-mediated down-regulation of IL-17-producing cells. Furthermore, the reversal of T(reg) cell activity by LPS or CpG-A resulted in an enhancement of IL-17-producing cells. Taken together, our study indicated that lymphoma B cells play an important role in skewing the balance between T(reg) and T(H)17 cells resulting in the establishment of a profoundly inhibitory tumor microenvironment., Th17 cells, CD4(+) T cells that secrete interleukin-17 (IL-17), are pathogenic in autoimmune diseases and their development and expansion is driven by the cytokines IL-6, TGF-beta, IL-21, IL-1, and IL-23. However, there are also innate sources of IL-17. 
Here, we show that gammadelta T cells express IL-23R and the transcription factor RORgammat and produce IL-17, IL-21, and IL-22 in response to IL-1beta and IL-23, without T cell receptor engagement. IL-17-producing gammadelta T cells were found at high frequency in the brain of mice with experimental autoimmune encephalomyelitis (EAE). gammadelta T cells activated by IL-1beta and IL-23 promoted IL-17 production by CD4(+) T cells and increased susceptibility to EAE, suggesting that gammadelta T cells act in an amplification loop for IL-17 production by Th17 cells. Our findings demonstrate that gammadelta T cells activated by IL-1beta and IL-23 are an important source of innate IL-17 and IL-21 and provide an alternative mechanism whereby IL-1 and IL-23 may mediate autoimmune inflammation.)"
5392,68.417062,"(Interleukin 23 (IL-23) and IL-17 have been linked to the pathogenesis of several chronic inflammatory disorders, including inflammatory bowel disease. Yet as an important function for IL-23 is emerging, the function of IL-17 in inflammatory bowel disease remains unclear. Here we demonstrate IL-17A-mediated protection in the CD45RBhi transfer model of colitis. An accelerated wasting disease elicited by T cells deficient in IL-17A correlated with higher expression of genes encoding T helper type 1-type cytokines in colon tissue. IL-17A also modulated T helper type 1 polarization in vitro. Furthermore, T cells deficient in the IL-17 receptor elicited an accelerated, aggressive wasting disease relative to that elicited by wild-type T cells in recipient mice. Our data demonstrate a protective function for IL-17 and identify T cells as not only the source but also a target of IL-17 in vivo., Th17 cells, CD4(+) T cells that secrete interleukin-17 (IL-17), are pathogenic in autoimmune diseases and their development and expansion is driven by the cytokines IL-6, TGF-beta, IL-21, IL-1, and IL-23. However, there are also innate sources of IL-17. Here, we show that gammadelta T cells express IL-23R and the transcription factor RORgammat and produce IL-17, IL-21, and IL-22 in response to IL-1beta and IL-23, without T cell receptor engagement. IL-17-producing gammadelta T cells were found at high frequency in the brain of mice with experimental autoimmune encephalomyelitis (EAE). gammadelta T cells activated by IL-1beta and IL-23 promoted IL-17 production by CD4(+) T cells and increased susceptibility to EAE, suggesting that gammadelta T cells act in an amplification loop for IL-17 production by Th17 cells. Our findings demonstrate that gammadelta T cells activated by IL-1beta and IL-23 are an important source of innate IL-17 and IL-21 and provide an alternative mechanism whereby IL-1 and IL-23 may mediate autoimmune inflammation.)"
1683,66.71746,"(Lymphocyte recruitment and activation have been implicated in the progression of cerebral ischemia-reperfusion (I/R) injury, but the roles of specific lymphocyte subpopulations and cytokines during stroke remain to be clarified. Here we demonstrate that the infiltration of T cells into the brain, as well as the cytokines interleukin-23 (IL-23) and IL-17, have pivotal roles in the evolution of brain infarction and accompanying neurological deficits. Blockade of T cell infiltration into the brain by the immunosuppressant FTY720 reduced I/R-induced brain damage. The expression of IL-23, which was derived mostly from infiltrated macrophages, increased on day 1 after I/R, whereas IL-17 levels were elevated after day 3, and this induction of IL-17 was dependent on IL-23. These data, together with analysis of mice genetically disrupted for IL-17 and IL-23, suggest that IL-23 functions in the immediate stage of I/R brain injury, whereas IL-17 has an important role in the delayed phase of I/R injury during which apoptotic neuronal death occurs in the penumbra. Intracellular cytokine staining revealed that gammadeltaT lymphocytes, but not CD4(+) helper T cells, were a major source of IL-17. Moreover, depletion of gammadeltaT lymphocytes ameliorated the I/R injury. We propose that T lymphocytes, including gammadeltaT lymphocytes, could be a therapeutic target for mitigating the inflammatory events that amplify the initial damage in cerebral ischemia., Th17 cells, CD4(+) T cells that secrete interleukin-17 (IL-17), are pathogenic in autoimmune diseases and their development and expansion is driven by the cytokines IL-6, TGF-beta, IL-21, IL-1, and IL-23. However, there are also innate sources of IL-17. Here, we show that gammadelta T cells express IL-23R and the transcription factor RORgammat and produce IL-17, IL-21, and IL-22 in response to IL-1beta and IL-23, without T cell receptor engagement. 
IL-17-producing gammadelta T cells were found at high frequency in the brain of mice with experimental autoimmune encephalomyelitis (EAE). gammadelta T cells activated by IL-1beta and IL-23 promoted IL-17 production by CD4(+) T cells and increased susceptibility to EAE, suggesting that gammadelta T cells act in an amplification loop for IL-17 production by Th17 cells. Our findings demonstrate that gammadelta T cells activated by IL-1beta and IL-23 are an important source of innate IL-17 and IL-21 and provide an alternative mechanism whereby IL-1 and IL-23 may mediate autoimmune inflammation.)"
2997,66.225136,"(We report that like other T cells cultured in the presence of transforming growth factor (TGF) beta, Th17 cells also produce interleukin (IL) 9. Th17 cells generated in vitro with IL-6 and TGF-beta as well as purified ex vivo Th17 cells both produced IL-9. To determine if IL-9 has functional consequences in Th17-mediated inflammatory disease, we evaluated the role of IL-9 in the development and progression of experimental autoimmune encephalomyelitis, a mouse model of multiple sclerosis. The data show that IL-9 neutralization and IL-9 receptor deficiency attenuates disease, and this correlates with decreases in Th17 cells and IL-6-producing macrophages in the central nervous system, as well as mast cell numbers in the regional lymph nodes. Collectively, these data implicate IL-9 as a Th17-derived cytokine that can contribute to inflammatory disease., Th17 cells, CD4(+) T cells that secrete interleukin-17 (IL-17), are pathogenic in autoimmune diseases and their development and expansion is driven by the cytokines IL-6, TGF-beta, IL-21, IL-1, and IL-23. However, there are also innate sources of IL-17. Here, we show that gammadelta T cells express IL-23R and the transcription factor RORgammat and produce IL-17, IL-21, and IL-22 in response to IL-1beta and IL-23, without T cell receptor engagement. IL-17-producing gammadelta T cells were found at high frequency in the brain of mice with experimental autoimmune encephalomyelitis (EAE). gammadelta T cells activated by IL-1beta and IL-23 promoted IL-17 production by CD4(+) T cells and increased susceptibility to EAE, suggesting that gammadelta T cells act in an amplification loop for IL-17 production by Th17 cells. Our findings demonstrate that gammadelta T cells activated by IL-1beta and IL-23 are an important source of innate IL-17 and IL-21 and provide an alternative mechanism whereby IL-1 and IL-23 may mediate autoimmune inflammation.)"
1128,65.559946,"(B-cell survival depends on signals induced by B-cell activating factor (BAFF) binding to its receptor (BAFF-R). In mice, mutations in BAFF or BAFF-R cause B-cell lymphopenia and antibody deficiency. Analyzing BAFF-R expression and BAFF-binding to B cells in common variable immunodeficiency (CVID) patients, we identified two siblings carrying a homozygous deletion in the BAFF-R gene. Removing most of the BAFF-R transmembrane part, the deletion precludes BAFF-R expression. Without BAFF-R, B-cell development is arrested at the stage of transitional B cells and the numbers of all subsequent B-cell stages are severely reduced. Both siblings have lower IgG and IgM serum levels but, unlike most CVID patients, normal IgA concentrations. They also did not mount a T-independent immune response against pneumococcal cell wall polysaccharides but only one BAFF-R-deficient sibling developed recurrent infections. Therefore, deletion of the BAFF-R gene in humans causes a characteristic immunological phenotype but it does not necessarily lead to a clinically manifest immunodeficiency., Defects in the expression of either BAFF (B cell activating factor) or BAFF-R impairs B cell development beyond the immature, transitional type-1 stage and thus, prevents the formation of follicular and marginal zone B cells, whereas B-1 B cells remain unaffected. The expression of BAFF-R on all mature B cells might suggest a role for BAFF-R signaling also for their in vivo maintenance. Here, we show that, 14 days following a single injection of an anti-BAFF-R mAb that prevents BAFF binding, both follicular and marginal zone B cell numbers are drastically reduced, whereas B-1 cells are not affected. Injection of control, isotype-matched but non-blocking anti-BAFF-R mAbs does not result in B cell depletion. We also show that this depletion is neither due to antibody-dependent cellular cytotoxicity nor to complement-mediated lysis. 
Moreover, prevention of BAFF binding leads to a decrease in the size of the B cell follicles, an impairment of a T cell dependent humoral immune response and a reduction in the formation of memory B cells. Collectively, these results establish a central role for BAFF-BAFF-R signaling in the in vivo survival and maintenance of both follicular and marginal zone B cell pools.)"
3153,63.458674,"(IL-1 is a potent cytokine that can induce bone erosion in inflammatory sites such as rheumatoid joint regions via activation of osteoclasts. Not only is IL-1 capable of activating osteoclasts, but it is also a key cytokine involved in the differentiation, multinucleation, and survival of osteoclasts. Herein, we show that IL-1 has the potential to drive osteoclast differentiation via a receptor activator of NF-kappaB ligand (RANKL)/RANK-independent mechanism. Although IL-1 has a synergistic effect on RANKL-induced osteoclast formation, IL-1 alone cannot induce osteoclast differentiation from osteoclast precursors (bone marrow-derived macrophages (BMMs)) due to a lack of IL-1 signaling potential in these cells. However, we demonstrate that overexpression of the IL-1RI receptor in BMMs or induction of IL-1RI by c-Fos overexpression enables IL-1 alone to induce the formation of authentic osteoclasts by a RANKL/RANK-independent mechanism. The expression of IL-1RI is up-regulated by RANKL via c-Fos and NFATc1. Furthermore, the addition of IL-1 to IL-1RI overexpressing BMMs (IL-1/IL-1RI) strongly activates NF-kappaB, JNK, p38, and ERK which is a hallmark gene activation profile of osteoclastogenesis. Interestingly, IL-1/IL-1RI does not induce expression of c-Fos or NFATc1 during osteoclast differentiation, although basal levels of c-Fos and NFATc1 seem to be required. Rather, IL-1/IL-1RI strongly activates MITF, which subsequently induces osteoclast-specific genes such as osteoclast-associated receptor and tartrate-resistant acid phosphatase. Together, these results reveal that IL-1 has the potential to induce osteoclast differentiation via activation of microphthalmia transcription factor under specific microenvironmental conditions., Lack of the IL-1 receptor accessory protein (IL-1RAcP) abrogates responses to IL-33 and IL-1 in the mouse thymoma clone EL-4 D6/76 cells. 
Reconstitution with full-length IL-1RAcP is sufficient to restore responsiveness to IL-33 and IL-1. IL-33 activates IL-1 receptor-associated kinase-1, cJun-N-terminal kinase, and the NF-kappaB pathway in an IL-1RAcP-dependent manner and results in IL-2 release. IL-33 is able to induce the release of proinflammatory cytokines in bone marrow-derived (BMD) mast cells, indicating that IL-33 may have a proinflammatory potential like its relatives IL-1 and IL-18, in addition to its Th2-skewing properties in the adaptive response described previously. Blocking of murine IL-1RAcP with the neutralizing antibody 4C5 inhibits response of mouse thymoma cells and BMD mast cells to IL-33. The interaction of either membrane-bound or soluble forms of IL-1RAcP and IL-33Ralpha-chain depends on the presence of IL-33, as demonstrated by coimmunoprecipitation assays. These data demonstrate that IL-1RAcP is indispensable for IL-33 signaling. Furthermore, they suggest that IL-1RAcP is used by more than one alpha-chain of the IL-1 receptor family and thus may resemble a common beta-chain of that family.)"


In [34]:
# Preview the first five rows of df_bc3_act with every column shown.
df_bc3_act.head(n=5)


Unnamed: 0,Unigram,Bigram,Trigram,Unigram_detail,Bigram_detail,Trigram_detail,data_set,task_type
0,24.053512,3.221897,0.0,"(A genetic tool to introduce marker-free deletions is essential for multiple manipulations of genomes. We report a simple and efficient method to create marker-free deletion mutants of Bacillus subtilis through transformation with recombinant PCR products, using the Escherichia coli mazF gene encoding an endoribonuclease that cleaves free mRNAs as a counter-selection tool. Our method will be applicable to any bacterium in which introduction of the mazF cassette into the genome by double crossover homologous recombination is possible., The next-generation sequencing technology coupled with the growing number of genome sequences opens the opportunity to redesign genotyping strategies for more effective genetic mapping and genome analysis. We have developed a high-throughput method for genotyping recombinant populations utilizing whole-genome resequencing data generated by the Illumina Genome Analyzer. A sliding window approach is designed to collectively examine genome-wide single nucleotide polymorphisms for genotype calling and recombination breakpoint determination. Using this method, we constructed a genetic map for 150 rice recombinant inbred lines with an expected genotype calling accuracy of 99.94% and a resolution of recombination breakpoints within an average of 40 kb. In comparison to the genetic map constructed with 287 PCR-based markers for the rice population, the sequencing-based method was approximately 20x faster in data collection and 35x more precise in recombination breakpoint determination. Using the sequencing-based genetic map, we located a quantitative trait locus of large effect on plant height in a 100-kb region containing the rice &quot;green revolution&quot; gene. Through computer simulation, we demonstrate that the method is robust for different types of mapping populations derived from organisms with variable quality of genome sequences and is feasible for organisms with large genome sizes and low polymorphisms. 
With continuous advances in sequencing technologies, this genome-based method may replace the conventional marker-based genotyping approach to provide a powerful tool for large-scale gene discovery and for addressing a wide range of biological questions.)","(A genetic tool to introduce marker-free deletions is essential for multiple manipulations of genomes. We report a simple and efficient method to create marker-free deletion mutants of Bacillus subtilis through transformation with recombinant PCR products, using the Escherichia coli mazF gene encoding an endoribonuclease that cleaves free mRNAs as a counter-selection tool. Our method will be applicable to any bacterium in which introduction of the mazF cassette into the genome by double crossover homologous recombination is possible., Rickettsia prowazekii, the causative agent of epidemic typhus, is an obligately intracytoplasmic bacterium, a lifestyle that imposes significant barriers to genetic manipulation. The key to understanding how this unique bacterium evades host immunity is the mutagenesis of selected genes hypothesized to be involved in virulence. The R. prowazekii pld gene, encoding a protein with phospholipase D activity, has been associated with phagosomal escape. To demonstrate the feasibility of site-directed knockout mutagenesis of rickettsial genes and to generate a nonrevertible vaccine strain, we utilized homologous recombination to generate a pld mutant of the virulent R. prowazekii strain Madrid Evir. Using linear DNA for transformation, a double-crossover event resulted in the replacement of the rickettsial wild-type gene with a partially deleted pld gene. Linear DNA was used to prevent potentially revertible single-crossover events resulting in plasmid insertion. Southern blot and PCR analyses were used to confirm the presence of the desired mutation and to demonstrate clonality. 
While no phenotypic differences were observed between the mutant and wild-type strains when grown in tissue culture, the pld mutant exhibited attenuated virulence in the guinea pig model. In addition, animals immunized with the mutant strain were protected against subsequent challenge with the virulent Breinl strain, suggesting that this transformant could serve as a nonrevertible, attenuated vaccine strain. This study demonstrates the feasibility of generating site-directed rickettsial gene mutants, providing a new tool for understanding rickettsial biology and furthering advances in the prevention of epidemic typhus.)","(A genetic tool to introduce marker-free deletions is essential for multiple manipulations of genomes. We report a simple and efficient method to create marker-free deletion mutants of Bacillus subtilis through transformation with recombinant PCR products, using the Escherichia coli mazF gene encoding an endoribonuclease that cleaves free mRNAs as a counter-selection tool. Our method will be applicable to any bacterium in which introduction of the mazF cassette into the genome by double crossover homologous recombination is possible., Programmed cell death (PCD) is a genetically-controlled disassembly of the cell. In animal systems, the central core execution switch for apoptotic PCD is the activation of caspases (Cysteine-containing Aspartate-specific proteases). Accumulating evidence in recent years suggests the existence of caspase-like activity in plants and its functional involvement in various types of plant PCD, although no functional homologs of animal caspases were identified in plant genome. In this mini-review, we will cover the recent results on the existence of plant caspase-like proteases and introduce major technologies used in detecting the activation of caspase-like proteases during plant PCD.)",BC3ACT,CLS
1,22.905071,3.087832,1.346595,"(Mitochondrial (mt) heteroplasmy in the control region (CR) of the black-faced spoonbill was investigated using LA-PCR. To avoid amplification of transpositioned nuclear genome fragment from mtDNA (numt), PCR product of the almost-complete mitochondrial genome was amplified using primers designed to anneal on the COIII gene. Then nested LA-PCR product was amplified between the cyt b and 12S rRNA genes using the almost-complete mitochondrial genome PCR product as a template. Nucleotide sequencing revealed tandem duplication composed of two units. The first contains cyt b-1, tRNA(Thr)-1, tRNA(Pro)-1, ND6-1, tRNA(Glu)-1 and CR1, and the second consists of cyt b-2, tRNA(Thr)-2, tRNA(Pro)-2, ND6-2, tRNA(Glu)-2 and CR2, followed by tRNA(Phe) and 12S rRNA. The duplicated cyt b-2 sequence coincided with 499 bp at the 3' end of cyt b-1. With the exception of the CR, the other genes in the duplicated sequence were identical to the original corresponding gene. Even though both CR1 and CR2 contain functional blocks, such as a poly-C site, a goose hairpin and a TAS structure in Domain I, the 3' end of CR1 was followed by a 112 bp sequence (non-coding region) that was not found in CR2 or in sequence homology analysis of similar genes. Meanwhile, CR2 ended in a complicated repeat sequence. The 5' franking region in the Domain I (Region A) and the 3' franking region in the Domain I (Region B) of the two CRs evolve in quite different manners: Region A was highly variable between CR1 and CR2 in the same individuals, while Region B was almost identical between them, which indicates concerted evolution., Selenocysteine is the only genetically encoded amino acid in humans whose biosynthesis occurs on its cognate transfer RNA (tRNA). O-Phosphoseryl-tRNA:selenocysteinyl-tRNA synthase (SepSecS) catalyzes the final step of selenocysteine formation by a poorly understood tRNA-dependent mechanism. 
The crystal structure of human tRNA(Sec) in complex with SepSecS, phosphoserine, and thiophosphate, together with in vivo and in vitro enzyme assays, supports a pyridoxal phosphate-dependent mechanism of Sec-tRNA(Sec) formation. Two tRNA(Sec) molecules, with a fold distinct from other canonical tRNAs, bind to each SepSecS tetramer through their 13-base pair acceptor-TPsiC arm (where Psi indicates pseudouridine). The tRNA binding is likely to induce a conformational change in the enzyme's active site that allows a phosphoserine covalently attached to tRNA(Sec), but not free phosphoserine, to be oriented properly for the reaction to occur.)","(Mitochondrial (mt) heteroplasmy in the control region (CR) of the black-faced spoonbill was investigated using LA-PCR. To avoid amplification of transpositioned nuclear genome fragment from mtDNA (numt), PCR product of the almost-complete mitochondrial genome was amplified using primers designed to anneal on the COIII gene. Then nested LA-PCR product was amplified between the cyt b and 12S rRNA genes using the almost-complete mitochondrial genome PCR product as a template. Nucleotide sequencing revealed tandem duplication composed of two units. The first contains cyt b-1, tRNA(Thr)-1, tRNA(Pro)-1, ND6-1, tRNA(Glu)-1 and CR1, and the second consists of cyt b-2, tRNA(Thr)-2, tRNA(Pro)-2, ND6-2, tRNA(Glu)-2 and CR2, followed by tRNA(Phe) and 12S rRNA. The duplicated cyt b-2 sequence coincided with 499 bp at the 3' end of cyt b-1. With the exception of the CR, the other genes in the duplicated sequence were identical to the original corresponding gene. Even though both CR1 and CR2 contain functional blocks, such as a poly-C site, a goose hairpin and a TAS structure in Domain I, the 3' end of CR1 was followed by a 112 bp sequence (non-coding region) that was not found in CR2 or in sequence homology analysis of similar genes. Meanwhile, CR2 ended in a complicated repeat sequence. 
The 5' franking region in the Domain I (Region A) and the 3' franking region in the Domain I (Region B) of the two CRs evolve in quite different manners: Region A was highly variable between CR1 and CR2 in the same individuals, while Region B was almost identical between them, which indicates concerted evolution., Plasmodium falciparum is the major human malaria agent responsible for 200 to 300 million infections and one to three million deaths annually, mainly among African infants. The origin and evolution of this pathogen within the human lineage is still unresolved. A single species, P. reichenowi, which infects chimpanzees, is known to be a close sister lineage of P. falciparum. Here we report the discovery of a new Plasmodium species infecting Hominids. This new species has been isolated in two chimpanzees (Pan troglodytes) kept as pets by villagers in Gabon (Africa). Analysis of its complete mitochondrial genome (5529 nucleotides including Cyt b, Cox I and Cox III genes) reveals an older divergence of this lineage from the clade that includes P. falciparum and P. reichenowi (approximately 21+/-9 Myrs ago using Bayesian methods and considering that the divergence between P. falciparum and P. reichenowi occurred 4 to 7 million years ago as generally considered in the literature). This time frame would be congruent with the radiation of hominoids, suggesting that this Plasmodium lineage might have been present in early hominoids and that they may both have experienced a simultaneous diversification. Investigation of the nuclear genome of this new species will further the understanding of the genetic adaptations of P. falciparum to humans. 
The risk of transfer and emergence of this new species in humans must be now seriously considered given that it was found in two chimpanzees living in contact with humans and its close relatedness to the most virulent agent of malaria.)","(Mitochondrial (mt) heteroplasmy in the control region (CR) of the black-faced spoonbill was investigated using LA-PCR. To avoid amplification of transpositioned nuclear genome fragment from mtDNA (numt), PCR product of the almost-complete mitochondrial genome was amplified using primers designed to anneal on the COIII gene. Then nested LA-PCR product was amplified between the cyt b and 12S rRNA genes using the almost-complete mitochondrial genome PCR product as a template. Nucleotide sequencing revealed tandem duplication composed of two units. The first contains cyt b-1, tRNA(Thr)-1, tRNA(Pro)-1, ND6-1, tRNA(Glu)-1 and CR1, and the second consists of cyt b-2, tRNA(Thr)-2, tRNA(Pro)-2, ND6-2, tRNA(Glu)-2 and CR2, followed by tRNA(Phe) and 12S rRNA. The duplicated cyt b-2 sequence coincided with 499 bp at the 3' end of cyt b-1. With the exception of the CR, the other genes in the duplicated sequence were identical to the original corresponding gene. Even though both CR1 and CR2 contain functional blocks, such as a poly-C site, a goose hairpin and a TAS structure in Domain I, the 3' end of CR1 was followed by a 112 bp sequence (non-coding region) that was not found in CR2 or in sequence homology analysis of similar genes. Meanwhile, CR2 ended in a complicated repeat sequence. 
The 5' franking region in the Domain I (Region A) and the 3' franking region in the Domain I (Region B) of the two CRs evolve in quite different manners: Region A was highly variable between CR1 and CR2 in the same individuals, while Region B was almost identical between them, which indicates concerted evolution., Plasmodium falciparum is the major human malaria agent responsible for 200 to 300 million infections and one to three million deaths annually, mainly among African infants. The origin and evolution of this pathogen within the human lineage is still unresolved. A single species, P. reichenowi, which infects chimpanzees, is known to be a close sister lineage of P. falciparum. Here we report the discovery of a new Plasmodium species infecting Hominids. This new species has been isolated in two chimpanzees (Pan troglodytes) kept as pets by villagers in Gabon (Africa). Analysis of its complete mitochondrial genome (5529 nucleotides including Cyt b, Cox I and Cox III genes) reveals an older divergence of this lineage from the clade that includes P. falciparum and P. reichenowi (approximately 21+/-9 Myrs ago using Bayesian methods and considering that the divergence between P. falciparum and P. reichenowi occurred 4 to 7 million years ago as generally considered in the literature). This time frame would be congruent with the radiation of hominoids, suggesting that this Plasmodium lineage might have been present in early hominoids and that they may both have experienced a simultaneous diversification. Investigation of the nuclear genome of this new species will further the understanding of the genetic adaptations of P. falciparum to humans. The risk of transfer and emergence of this new species in humans must be now seriously considered given that it was found in two chimpanzees living in contact with humans and its close relatedness to the most virulent agent of malaria.)",BC3ACT,CLS
2,27.548932,4.184284,0.924027,"(In mangrove species the past geomorphic changes in coastal regions and reproductive systems are important factors of their distribution and genetic structure of populations. However, very little is known about the levels of genetic variation of Rhiozophora species in Southeast Asia. In this study, we surveyed levels and patterns of genetic variation as well as population structure of two sympatric mangrove species, Rhizophora apiculata and R. mucronata in Thailand, using five nuclear genes and two cpDNA regions. In all investigated DNA regions, nucleotide variation within species was low, while nucleotide divergence between the two species was considerable. The nuclear genes evolved 10 times faster than the cpDNA regions. In both R. apiculata and R. mucronata, significant positive F(IS) values were found, indicating deviation from Hardy-Weinberg proportions and a deficiency of heterozygotes. In both species, we found significant genetic differentiation between populations. However, the pattern of population differentiation (F(ST)) of R. apiculata differed from that of R. mucronata. Our results suggest that the two investigated species have different demographic history, even though they are sympatric and have similar reproductive systems., Whether or not bacteria form coherent evolutionary groups via means of genetic exchange and, hence, elicit distinct species boundaries remains an unsettled issue. A recent report implied that not only may the former be true but also, in fact, the clearly distinct Campylobacter jejuni and Campylobacter coli species may be converging as a consequence of increased interspecies gene flow fostered, presumably, by the recent invasion of an overlapping ecological niche (S. K. Sheppard, N. D. McCarthy, D. Falush, and M. C. Maiden, Science 320:237-239, 2008). 
We have reanalyzed the Campylobacter multilocus sequence typing database used in the previous study and found that the number of interspecies gene transfer events may actually be too infrequent to account, unequivocally, for species convergence. For instance, only 1 to 2% of the 4,507 Campylobacter isolates examined appeared to have imported gene alleles from another Campylobacter species. Furthermore, by analyzing the available Campylobacter genomic sequences, we show that although there seems to be a slightly higher number of exchanged genes between C. jejuni and C. coli relative to other comparable species ( approximately 10% versus 2 to 3% of the total genes in the genome, respectively), the function and spatial distribution in the genome of the exchanged genes are far from random, and hence, inconsistent with the species convergence hypothesis. In fact, the exchanged genes appear to be limited to a few environmentally selected cellular functions. Accordingly, these genes may represent important pathogenic determinants of pathogenic Campylobacter, and convergence of (any) two bacterial species remains to be seen.)","(In mangrove species the past geomorphic changes in coastal regions and reproductive systems are important factors of their distribution and genetic structure of populations. However, very little is known about the levels of genetic variation of Rhiozophora species in Southeast Asia. In this study, we surveyed levels and patterns of genetic variation as well as population structure of two sympatric mangrove species, Rhizophora apiculata and R. mucronata in Thailand, using five nuclear genes and two cpDNA regions. In all investigated DNA regions, nucleotide variation within species was low, while nucleotide divergence between the two species was considerable. The nuclear genes evolved 10 times faster than the cpDNA regions. In both R. apiculata and R. 
mucronata, significant positive F(IS) values were found, indicating deviation from Hardy-Weinberg proportions and a deficiency of heterozygotes. In both species, we found significant genetic differentiation between populations. However, the pattern of population differentiation (F(ST)) of R. apiculata differed from that of R. mucronata. Our results suggest that the two investigated species have different demographic history, even though they are sympatric and have similar reproductive systems., Mexico is developing the basis for genomic medicine to improve healthcare of its population. The extensive study of genetic diversity and linkage disequilibrium structure of different populations has made it possible to develop tagging and imputation strategies to comprehensively analyze common genetic variation in association studies of complex diseases. We assessed the benefit of a Mexican haplotype map to improve identification of genes related to common diseases in the Mexican population. We evaluated genetic diversity, linkage disequilibrium patterns, and extent of haplotype sharing using genomewide data from Mexican Mestizos from regions with different histories of admixture and particular population dynamics. Ancestry was evaluated by including 1 Mexican Amerindian group and data from the HapMap. Our results provide evidence of genetic differences between Mexican subpopulations that should be considered in the design and analysis of association studies of complex diseases. In addition, these results support the notion that a haplotype map of the Mexican Mestizo population can reduce the number of tag SNPs required to characterize common genetic variation in this population. This is one of the first genomewide genotyping efforts of a recently admixed population in Latin America.)","(In mangrove species the past geomorphic changes in coastal regions and reproductive systems are important factors of their distribution and genetic structure of populations. 
However, very little is known about the levels of genetic variation of Rhiozophora species in Southeast Asia. In this study, we surveyed levels and patterns of genetic variation as well as population structure of two sympatric mangrove species, Rhizophora apiculata and R. mucronata in Thailand, using five nuclear genes and two cpDNA regions. In all investigated DNA regions, nucleotide variation within species was low, while nucleotide divergence between the two species was considerable. The nuclear genes evolved 10 times faster than the cpDNA regions. In both R. apiculata and R. mucronata, significant positive F(IS) values were found, indicating deviation from Hardy-Weinberg proportions and a deficiency of heterozygotes. In both species, we found significant genetic differentiation between populations. However, the pattern of population differentiation (F(ST)) of R. apiculata differed from that of R. mucronata. Our results suggest that the two investigated species have different demographic history, even though they are sympatric and have similar reproductive systems., Mexico is developing the basis for genomic medicine to improve healthcare of its population. The extensive study of genetic diversity and linkage disequilibrium structure of different populations has made it possible to develop tagging and imputation strategies to comprehensively analyze common genetic variation in association studies of complex diseases. We assessed the benefit of a Mexican haplotype map to improve identification of genes related to common diseases in the Mexican population. We evaluated genetic diversity, linkage disequilibrium patterns, and extent of haplotype sharing using genomewide data from Mexican Mestizos from regions with different histories of admixture and particular population dynamics. Ancestry was evaluated by including 1 Mexican Amerindian group and data from the HapMap. 
Our results provide evidence of genetic differences between Mexican subpopulations that should be considered in the design and analysis of association studies of complex diseases. In addition, these results support the notion that a haplotype map of the Mexican Mestizo population can reduce the number of tag SNPs required to characterize common genetic variation in this population. This is one of the first genomewide genotyping efforts of a recently admixed population in Latin America.)",BC3ACT,CLS
3,18.624808,7.035975,2.722033,"(The cell envelope is the target for many antibiotics. In Gram-positive bacteria, membrane alterations and dysfunction caused by antibiotics are sensed mainly by two classes of signal transduction systems: the ECF sigma factors and the two-component signal transduction systems (TCSs). Enduracidin is an antibiotic that inhibits the transglycosylation step of peptidoglycan biosynthesis, and is an attractive target for further antibiotic development studies. We assessed transcriptional responses to enduracidin in Bacillus subtilis cells using a high-density tiling chip, and compared the results with responses to bacitracin, which inhibits the lipid II cycle of peptidoglycan synthesis. We exploited the quantitative advantage of the tiling chip to introduce a new criterion, an increase in transcriptional level, in addition to the conventional induction ratio, in order to distinguish genes of biological significance from those with lower induction ratios. Our results indicate that introduction of the new criterion led to unambiguous identification of core transcriptional responses to antibiotics, with a reduction in the number of possible background genes, compared to previous results obtained using gene arrays. We identified 129 genes that were significantly upregulated by enduracidin and/or bacitracin. Notably, we found that inactivation of the LiaRS TCS, which was the system most strongly induced by the two antibiotics, resulted in increased sensitivity to enduracidin, probably through a failure to induce LiaIH proteins. We noted that 33 genes belonging to the SigM regulon were induced by both antibiotics. Consistent with stronger induction of the SigM regulon in enduracidin-treated cells, inactivation of sigM resulted in increased sensitivity to enduracidin. 
In addition, and for the first time, we found that the Spx regulon was induced in cells challenged by enduracidin and bacitracin, suggesting that thiol-oxidative stress occurred in cells treated with antibiotics. These findings contribute to further our understanding of the molecular nature of genetic systems involved in antibiotic resistance., Increased expression and activity of proteins driving cell cycle progression as well as inactivation of endogenous inhibitors of cyclin-dependent kinases (CDKs) enhance the proliferative potential of cells. Escape of cells during malignant transformation from the proper cell cycle control rendering them independent from growth factors provides rationale for therapeutic targeting of CDKs. Exposure of rapidly growing human MCF-7 breast cancer and HeLa cervix cancer cells to roscovitine (ROSC), a selective inhibitor of CDKs, inhibits their proliferation by induction of cell cycle arrest and/or apoptosis. The outcome strongly depends on the intrinsic traits of the tumor cells, on their cell cycle status prior to the onset of treatment and also on ROSC concentration. At lower dose ROSC primarily inhibits the cell cycle-related CDKs resulting in a strong cell cycle arrest. Interestingly, ROSC arrests asynchronously growing cells at the G(2)/M transition irrespective of the status of their restriction checkpoint. However, the exposure of cancer cells synchronized after serum starvation in the late G(1) phase results in a transient G(1) arrest only in cells displaying the intact G(1)/S checkpoint. At higher dosage ROSC triggers apoptosis. In HeLa cells inhibition of the activity of CDK7 and, in consequence, that of RNA polymerase II is a major event that facilitates the initiation of caspase-dependent apoptosis. In contrast, in the caspase-3-deficient MCF-7 breast cancer cells ROSC induces apoptosis by a p53-dependent pathway. 
HIPK2-mediated activation of the p53 transcription factor by phosphorylation at Ser46 results in upregulation of p53AIP1 protein. This protein after de novo synthesis and translocation into the mitochondria promotes depolarization of the mitochondrial membrane.)","(The cell envelope is the target for many antibiotics. In Gram-positive bacteria, membrane alterations and dysfunction caused by antibiotics are sensed mainly by two classes of signal transduction systems: the ECF sigma factors and the two-component signal transduction systems (TCSs). Enduracidin is an antibiotic that inhibits the transglycosylation step of peptidoglycan biosynthesis, and is an attractive target for further antibiotic development studies. We assessed transcriptional responses to enduracidin in Bacillus subtilis cells using a high-density tiling chip, and compared the results with responses to bacitracin, which inhibits the lipid II cycle of peptidoglycan synthesis. We exploited the quantitative advantage of the tiling chip to introduce a new criterion, an increase in transcriptional level, in addition to the conventional induction ratio, in order to distinguish genes of biological significance from those with lower induction ratios. Our results indicate that introduction of the new criterion led to unambiguous identification of core transcriptional responses to antibiotics, with a reduction in the number of possible background genes, compared to previous results obtained using gene arrays. We identified 129 genes that were significantly upregulated by enduracidin and/or bacitracin. Notably, we found that inactivation of the LiaRS TCS, which was the system most strongly induced by the two antibiotics, resulted in increased sensitivity to enduracidin, probably through a failure to induce LiaIH proteins. We noted that 33 genes belonging to the SigM regulon were induced by both antibiotics. 
Consistent with stronger induction of the SigM regulon in enduracidin-treated cells, inactivation of sigM resulted in increased sensitivity to enduracidin. In addition, and for the first time, we found that the Spx regulon was induced in cells challenged by enduracidin and bacitracin, suggesting that thiol-oxidative stress occurred in cells treated with antibiotics. These findings contribute to further our understanding of the molecular nature of genetic systems involved in antibiotic resistance., Two-component signal-transduction systems (TCSs) of bacteria are considered to form an intricate signal network to cope with various environmental stresses. One example of such a network in Escherichia coli is the signal transduction cascade from the EvgS/EvgA system to the PhoQ/PhoP system, where activation of the EvgS/EvgA system promotes expression of PhoP-activated genes. As a factor connecting this signal transduction cascade, we have identified a small inner membrane protein (65 aa), B1500. Expression of the b1500 gene is directly regulated by the EvgS/EvgA system, and b1500 expression from a heterologous promoter simultaneously activated the expression of mgtA and other PhoP regulon genes. This activation was PhoQ/PhoP-dependent and EvgS/EvgA-independent. Furthermore, deletion of b1500 from an EvgS-activated strain suppressed mgtA expression. B1500 is localized in the inner membrane, and bacterial two-hybrid data showed that B1500 formed a complex with the sensor PhoQ. These results indicate that the small membrane protein, B1500, connected the signal transduction between EvgS/EvgA and PhoQ/PhoP systems by directly interacting with PhoQ, thus activating the PhoQ/PhoP system.)","(The cell envelope is the target for many antibiotics. In Gram-positive bacteria, membrane alterations and dysfunction caused by antibiotics are sensed mainly by two classes of signal transduction systems: the ECF sigma factors and the two-component signal transduction systems (TCSs). 
Enduracidin is an antibiotic that inhibits the transglycosylation step of peptidoglycan biosynthesis, and is an attractive target for further antibiotic development studies. We assessed transcriptional responses to enduracidin in Bacillus subtilis cells using a high-density tiling chip, and compared the results with responses to bacitracin, which inhibits the lipid II cycle of peptidoglycan synthesis. We exploited the quantitative advantage of the tiling chip to introduce a new criterion, an increase in transcriptional level, in addition to the conventional induction ratio, in order to distinguish genes of biological significance from those with lower induction ratios. Our results indicate that introduction of the new criterion led to unambiguous identification of core transcriptional responses to antibiotics, with a reduction in the number of possible background genes, compared to previous results obtained using gene arrays. We identified 129 genes that were significantly upregulated by enduracidin and/or bacitracin. Notably, we found that inactivation of the LiaRS TCS, which was the system most strongly induced by the two antibiotics, resulted in increased sensitivity to enduracidin, probably through a failure to induce LiaIH proteins. We noted that 33 genes belonging to the SigM regulon were induced by both antibiotics. Consistent with stronger induction of the SigM regulon in enduracidin-treated cells, inactivation of sigM resulted in increased sensitivity to enduracidin. In addition, and for the first time, we found that the Spx regulon was induced in cells challenged by enduracidin and bacitracin, suggesting that thiol-oxidative stress occurred in cells treated with antibiotics. 
These findings contribute to further our understanding of the molecular nature of genetic systems involved in antibiotic resistance., Two-component signal-transduction systems (TCSs) of bacteria are considered to form an intricate signal network to cope with various environmental stresses. One example of such a network in Escherichia coli is the signal transduction cascade from the EvgS/EvgA system to the PhoQ/PhoP system, where activation of the EvgS/EvgA system promotes expression of PhoP-activated genes. As a factor connecting this signal transduction cascade, we have identified a small inner membrane protein (65 aa), B1500. Expression of the b1500 gene is directly regulated by the EvgS/EvgA system, and b1500 expression from a heterologous promoter simultaneously activated the expression of mgtA and other PhoP regulon genes. This activation was PhoQ/PhoP-dependent and EvgS/EvgA-independent. Furthermore, deletion of b1500 from an EvgS-activated strain suppressed mgtA expression. B1500 is localized in the inner membrane, and bacterial two-hybrid data showed that B1500 formed a complex with the sensor PhoQ. These results indicate that the small membrane protein, B1500, connected the signal transduction between EvgS/EvgA and PhoQ/PhoP systems by directly interacting with PhoQ, thus activating the PhoQ/PhoP system.)",BC3ACT,CLS
4,30.773328,3.112864,0.590549,"(Microscale technologies have emerged as a powerful tool for studying and manipulating biological systems and miniaturizing experiments. However, the lack of software complementing these techniques has made it difficult to apply them for many high-throughput experiments. This work establishes Arraycount, an approach to automatically count cells in microwell arrays. The procedure consists of fluorescent microscope imaging of cells that are seeded in microwells of a microarray system and then analyzing images via computer to recognize the array and count cells inside each microwell. To start counting, green and red fluorescent images (representing live and dead cells, respectively) are extracted from the original image and processed separately. A template-matching algorithm is proposed in which pre-defined well and cell templates are matched against the red and green images to locate microwells and cells. Subsequently, local maxima in the correlation maps are determined and local maxima maps are thresholded. At the end, the software records the cell counts for each detected microwell on the original image in high-throughput. The automated counting was shown to be accurate compared with manual counting, with a difference of approximately 1-2 cells per microwell: based on cell concentration, the absolute difference between manual and automatic counting measurements was 2.5-13%., Elimination of peripheral tumors by adoptively transferred tumor-specific T cells may require killing of cancer cells and tumor stromal cells. Tumor Ags are cross-presented on stromal cells, resulting in direct cytotoxic T cell (CTL) killing of both Ag-expressing cancer cells and stromal cells. Indirect killing of Ag loss variant cells also occurs. We show here that similar processes occur in a brain tumor stromal environment. 
We used murine cancer cell lines that express high or low levels of a peptide Ag, SIYRYYGL (SIY), recognized by transgenic 2C CD8(+) T cells. The two cell lines are killed with equivalent efficiency by 2C T cells in vitro. Following adoptive transfer of 2C T cells into mice with established SIY-Hi or SIY-Lo brain tumors, tumors of both types regressed, but low-Ag-expressing tumors recurred. High-Ag-expressing tumors contained CD11b(+) cells cross-presenting SIY peptide and were completely eliminated by 2C T cells. To further test the role of cross-presentation, RAG1(-/-) H-2(b) mice were infused with H-2(k) tumor cells expressing high levels of SIY peptide. Adoptively transferred 2C T cells are able to kill cross-presenting H-2(b) stromal cells but not H-2(k) tumor cells. In peripheral models, this paradigm led to a small static tumor. In the brain, activated 2C T cells were able to kill cross-presenting CD11b(+) cells and completely eliminate the H-2(k) tumors in most mice. Targeting brain tumor stroma or increasing Ag shedding from tumor cells to enhance cross-presentation may improve the clinical success of T cell adoptive therapies.)","(Microscale technologies have emerged as a powerful tool for studying and manipulating biological systems and miniaturizing experiments. However, the lack of software complementing these techniques has made it difficult to apply them for many high-throughput experiments. This work establishes Arraycount, an approach to automatically count cells in microwell arrays. The procedure consists of fluorescent microscope imaging of cells that are seeded in microwells of a microarray system and then analyzing images via computer to recognize the array and count cells inside each microwell. To start counting, green and red fluorescent images (representing live and dead cells, respectively) are extracted from the original image and processed separately. 
A template-matching algorithm is proposed in which pre-defined well and cell templates are matched against the red and green images to locate microwells and cells. Subsequently, local maxima in the correlation maps are determined and local maxima maps are thresholded. At the end, the software records the cell counts for each detected microwell on the original image in high-throughput. The automated counting was shown to be accurate compared with manual counting, with a difference of approximately 1-2 cells per microwell: based on cell concentration, the absolute difference between manual and automatic counting measurements was 2.5-13%., Current yeast interactome network maps contain several hundred molecular complexes with limited and somewhat controversial representation of direct binary interactions. We carried out a comparative quality assessment of current yeast interactome data sets, demonstrating that high-throughput yeast two-hybrid (Y2H) screening provides high-quality binary interaction information. Because a large fraction of the yeast binary interactome remains to be mapped, we developed an empirically controlled mapping framework to produce a &quot;second-generation&quot; high-quality, high-throughput Y2H data set covering approximately 20% of all yeast binary interactions. Both Y2H and affinity purification followed by mass spectrometry (AP/MS) data are of equally high quality but of a fundamentally different and complementary nature, resulting in networks with different topological and biological properties. Compared to co-complex interactome models, this binary map is enriched for transient signaling interactions and intercomplex connections with a highly significant clustering between essential proteins. Rather than correlating with essentiality, protein connectivity correlates with genetic pleiotropy.)","(Microscale technologies have emerged as a powerful tool for studying and manipulating biological systems and miniaturizing experiments. 
However, the lack of software complementing these techniques has made it difficult to apply them for many high-throughput experiments. This work establishes Arraycount, an approach to automatically count cells in microwell arrays. The procedure consists of fluorescent microscope imaging of cells that are seeded in microwells of a microarray system and then analyzing images via computer to recognize the array and count cells inside each microwell. To start counting, green and red fluorescent images (representing live and dead cells, respectively) are extracted from the original image and processed separately. A template-matching algorithm is proposed in which pre-defined well and cell templates are matched against the red and green images to locate microwells and cells. Subsequently, local maxima in the correlation maps are determined and local maxima maps are thresholded. At the end, the software records the cell counts for each detected microwell on the original image in high-throughput. The automated counting was shown to be accurate compared with manual counting, with a difference of approximately 1-2 cells per microwell: based on cell concentration, the absolute difference between manual and automatic counting measurements was 2.5-13%., BACKGROUND: Flow cytometry technology is widely used in both health care and research. The rapid expansion of flow cytometry applications has outpaced the development of data storage and analysis tools. Collaborative efforts being taken to eliminate this gap include building common vocabularies and ontologies, designing generic data models, and defining data exchange formats. The Minimum Information about a Flow Cytometry Experiment (MIFlowCyt) standard was recently adopted by the International Society for Advancement of Cytometry. This standard guides researchers on the information that should be included in peer reviewed publications, but it is insufficient for data exchange and integration between computational systems. 
The Functional Genomics Experiment (FuGE) formalizes common aspects of comprehensive and high throughput experiments across different biological technologies. We have extended FuGE object model to accommodate flow cytometry data and metadata. METHODS: We used the MagicDraw modelling tool to design a UML model (Flow-OM) according to the FuGE extension guidelines and the AndroMDA toolkit to transform the model to a markup language (Flow-ML). We mapped each MIFlowCyt term to either an existing FuGE class or to a new FuGEFlow class. The development environment was validated by comparing the official FuGE XSD to the schema we generated from the FuGE object model using our configuration. After the Flow-OM model was completed, the final version of the Flow-ML was generated and validated against an example MIFlowCyt compliant experiment description. RESULTS: The extension of FuGE for flow cytometry has resulted in a generic FuGE-compliant data model (FuGEFlow), which accommodates and links together all information required by MIFlowCyt. The FuGEFlow model can be used to build software and databases using FuGE software toolkits to facilitate automated exchange and manipulation of potentially large flow cytometry experimental data sets. Additional project documentation, including reusable design patterns and a guide for setting up a development environment, was contributed back to the FuGE project. CONCLUSION: We have shown that an extension of FuGE can be used to transform minimum information requirements in natural language to markup language in XML. Extending FuGE required significant effort, but in our experiences the benefits outweighed the costs. The FuGEFlow is expected to play a central role in describing flow cytometry experiments and ultimately facilitating data exchange including public flow cytometry repositories currently under development.)",BC3ACT,CLS


In [35]:
# Load the BC3ACT test file and attach one label per row to the similarity
# results so the scores can be grouped by label.
bc3_act_test_df = bc3actrun.load(bc3_act_test_file)
bc3_act_test_labels = bc3actrun.get_labels(bc3_act_test_df, bc3_act_test_eval_file)
# NOTE(review): presumably the labels follow the row order of bc3_act_test_df
# and df_bc3_act shares that order — confirm against get_labels.
df_bc3_act["test_label"] = bc3_act_test_labels

In [36]:
# Sanity check: the abstracts of the loaded test frame must match, row for row,
# the first element of each "Unigram_detail" entry in df_bc3_act.
# NOTE(review): presumably element 0 of a detail tuple is the test-side text
# and element 1 its closest training text — confirm against the comparer.
expected_abstracts = bc3_act_test_df["abstract"].tolist()
compared_abstracts = df_bc3_act["Unigram_detail"].apply(lambda x: x[0]).tolist()
assert expected_abstracts == compared_abstracts, (
    "BC3ACT test abstracts are not aligned with df_bc3_act rows "
    f"({len(expected_abstracts)} abstracts vs {len(compared_abstracts)} result rows); "
    "labels in df_bc3_act['test_label'] cannot be trusted"
)

In [37]:
# Descriptive statistics of the similarity scores, one group per test label,
# stacked so that each statistic becomes its own row.
bc3_act_label_stats = df_bc3_act.groupby("test_label").describe().stack()
bc3_act_label_stats

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
test_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,count,5090.0,5090.0,5090.0
0,mean,26.310835,6.696011,1.725549
0,std,9.248763,5.347394,1.716761
0,min,6.27761,0.0,0.0
0,25%,19.703812,3.289165,0.787847
0,50%,25.162518,5.073287,1.389157
0,75%,31.525002,8.285161,2.268703
0,max,75.011317,41.754463,18.708287
1,count,910.0,910.0,910.0
1,mean,29.274148,8.092841,2.263471


In [38]:
# Same per-label summary as the previous cell, exported as a LaTeX table with
# every float rendered to two decimal places.
bc3_act_label_stats_latex = (
    df_bc3_act.groupby("test_label")
    .describe()
    .stack()
    .to_latex(float_format="{:.2f}".format)
)
print(bc3_act_label_stats_latex)

\begin{tabular}{llrrr}
\toprule
  &     &  Unigram &  Bigram &  Trigram \\
test\_label & {} &          &         &          \\
\midrule
0 & count & 5090.00 & 5090.00 & 5090.00 \\
  & mean & 26.31 & 6.70 & 1.73 \\
  & std & 9.25 & 5.35 & 1.72 \\
  & min & 6.28 & 0.00 & 0.00 \\
  & 25\% & 19.70 & 3.29 & 0.79 \\
  & 50\% & 25.16 & 5.07 & 1.39 \\
  & 75\% & 31.53 & 8.29 & 2.27 \\
  & max & 75.01 & 41.75 & 18.71 \\
1 & count & 910.00 & 910.00 & 910.00 \\
  & mean & 29.27 & 8.09 & 2.26 \\
  & std & 9.36 & 6.00 & 1.73 \\
  & min & 11.14 & 1.52 & 0.00 \\
  & 25\% & 22.69 & 4.51 & 1.17 \\
  & 50\% & 28.31 & 6.25 & 1.88 \\
  & 75\% & 34.32 & 9.38 & 2.84 \\
  & max & 74.01 & 51.20 & 18.97 \\
\bottomrule
\end{tabular}



In [39]:
# Load the BC3ACT training file and store its labels alongside the records.
bc3_act_train_df = bc3actrun.load(bc3_act_train_file)
bc3_act_train_labels = bc3actrun.get_labels(bc3_act_train_df, bc3_act_train_eval_file)
# NOTE(review): presumably get_labels returns one label per row of the frame
# it is given, in the same order — confirm.
bc3_act_train_df["train_labels"] = bc3_act_train_labels

In [40]:
# Number of non-null entries per column for each training label
# (shows the class balance of the training split).
bc3_act_train_df.groupby("train_labels").count()

Unnamed: 0_level_0,title,abstract,id
train_labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1140,1140,1140
1,1140,1140,1140


## Chemu

In [41]:
# Chemu train/test directories, derived from the notebook-wide base_data_dir
# (set near the top of the notebook) instead of a second hard-coded "tmp", so
# that changing the data location in one place moves every dataset.
chemu_data_dir = os.path.join(base_data_dir, "chemu")
chemu_train_dir = os.path.join(chemu_data_dir, "train")
chemu_test_dir = os.path.join(chemu_data_dir, "test")

In [42]:
from chemu_gene_mention import ChemuGeneMention

In [43]:

chemurun = ChemuGeneMention()

# Train-vs-test similarity using the "text" comparison mode.
result_score, result_detail = chemurun.run_similarity_comparer(
    chemu_train_dir, chemu_test_dir, "text"
)
df_chemu_text = scores_to_df(
    result_score, result_detail, data_set="Chemu", data_type="text", task_type="NER"
)

# Same comparison using the "entity" mode; the rows are tagged "anno".
# NOTE(review): presumably "entity" compares the annotated entity mentions
# rather than the raw text — confirm in ChemuGeneMention.
result_score, result_detail = chemurun.run_similarity_comparer(
    chemu_train_dir, chemu_test_dir, "entity"
)
df_chemu_eval = scores_to_df(
    result_score, result_detail, data_set="Chemu", data_type="anno", task_type="NER"
)

# Stack both result sets; the original row indices are kept, so index values
# can repeat across the two parts.
df_chemu = pd.concat((df_chemu_text, df_chemu_eval))

In [44]:
# Preview the first five rows of the combined Chemu results.
df_chemu.head(n=5)

Unnamed: 0,Unigram,Bigram,Trigram,Unigram_detail,Bigram_detail,Trigram_detail,data_set,task_type
0,88.16057,73.775566,66.167284,"(Example 55\nTo a suspension of 6-(2-amino-5-chlorothiazol-4-yl)-3,4-dihydroquinolin-2(1H)-one hydrochloride (0.100 g, 0.408 mmol) and 2-cyclopentyl-4-methyl-1,3-thiazole-5-carboxylic acid (0.095 g, 0.448 mmol), and pyridine (0.15 mL, 1.831 mmol) in acetonitrile (4 mL) in a sealed tube was added propylphosphonic anhydride solution (50 wt % in ethyl acetate, 0.85 mL, 1.431 mmol). The sealed tube was heated to 50° C. for 5 days and the precipitation formed. After cooling, the precipitate was collected by filtration and washed with cold 1:1 acetonitrile/water to give 2-cyclopentyl-4-methyl-N-(4-(2-oxo-1,2,3,4-tetrahydroquinolin-6-yl)thiazol-2-yl)thiazole-5-carboxamide (0.143 g, 80%) as a beige solid., Example 53\nTo a suspension of 6-(2-amino-5-chlorothiazol-4-yl)-3,4-dihydroquinolin-2(1H)-one hydrochloride (0.100 g, 0.408 mmol) and 2-cyclopropyl-4-methyl-1,3-oxazole-5-carboxylic acid (0.072 g, 0.428 mmol), and pyridine (0.15 mL, 1.831 mmol) in acetonitrile (4 mL) in a sealed was added propylphosphonic anhydride solution (50 wt % in ethyl acetate, 0.85 mL, 1.431 mmol). The sealed tube was heated to 50° C. for 5 days and the precipitation formed. After cooling, the precipitate was collected by filtration and washed with cold 1:1 acetonitrile/water to give 2-cyclopropyl-4-methyl-N-(4-(2-oxo-1,2,3,4-tetrahydroquinolin-6-yl)thiazol-2-yl)oxazole-5-carboxamide (0.143 g, 89%) as a beige solid.)","(Example 55\nTo a suspension of 6-(2-amino-5-chlorothiazol-4-yl)-3,4-dihydroquinolin-2(1H)-one hydrochloride (0.100 g, 0.408 mmol) and 2-cyclopentyl-4-methyl-1,3-thiazole-5-carboxylic acid (0.095 g, 0.448 mmol), and pyridine (0.15 mL, 1.831 mmol) in acetonitrile (4 mL) in a sealed tube was added propylphosphonic anhydride solution (50 wt % in ethyl acetate, 0.85 mL, 1.431 mmol). The sealed tube was heated to 50° C. for 5 days and the precipitation formed. 
After cooling, the precipitate was collected by filtration and washed with cold 1:1 acetonitrile/water to give 2-cyclopentyl-4-methyl-N-(4-(2-oxo-1,2,3,4-tetrahydroquinolin-6-yl)thiazol-2-yl)thiazole-5-carboxamide (0.143 g, 80%) as a beige solid., Example 53\nTo a suspension of 6-(2-amino-5-chlorothiazol-4-yl)-3,4-dihydroquinolin-2(1H)-one hydrochloride (0.100 g, 0.408 mmol) and 2-cyclopropyl-4-methyl-1,3-oxazole-5-carboxylic acid (0.072 g, 0.428 mmol), and pyridine (0.15 mL, 1.831 mmol) in acetonitrile (4 mL) in a sealed was added propylphosphonic anhydride solution (50 wt % in ethyl acetate, 0.85 mL, 1.431 mmol). The sealed tube was heated to 50° C. for 5 days and the precipitation formed. After cooling, the precipitate was collected by filtration and washed with cold 1:1 acetonitrile/water to give 2-cyclopropyl-4-methyl-N-(4-(2-oxo-1,2,3,4-tetrahydroquinolin-6-yl)thiazol-2-yl)oxazole-5-carboxamide (0.143 g, 89%) as a beige solid.)","(Example 55\nTo a suspension of 6-(2-amino-5-chlorothiazol-4-yl)-3,4-dihydroquinolin-2(1H)-one hydrochloride (0.100 g, 0.408 mmol) and 2-cyclopentyl-4-methyl-1,3-thiazole-5-carboxylic acid (0.095 g, 0.448 mmol), and pyridine (0.15 mL, 1.831 mmol) in acetonitrile (4 mL) in a sealed tube was added propylphosphonic anhydride solution (50 wt % in ethyl acetate, 0.85 mL, 1.431 mmol). The sealed tube was heated to 50° C. for 5 days and the precipitation formed. 
After cooling, the precipitate was collected by filtration and washed with cold 1:1 acetonitrile/water to give 2-cyclopentyl-4-methyl-N-(4-(2-oxo-1,2,3,4-tetrahydroquinolin-6-yl)thiazol-2-yl)thiazole-5-carboxamide (0.143 g, 80%) as a beige solid., Example 53\nTo a suspension of 6-(2-amino-5-chlorothiazol-4-yl)-3,4-dihydroquinolin-2(1H)-one hydrochloride (0.100 g, 0.408 mmol) and 2-cyclopropyl-4-methyl-1,3-oxazole-5-carboxylic acid (0.072 g, 0.428 mmol), and pyridine (0.15 mL, 1.831 mmol) in acetonitrile (4 mL) in a sealed was added propylphosphonic anhydride solution (50 wt % in ethyl acetate, 0.85 mL, 1.431 mmol). The sealed tube was heated to 50° C. for 5 days and the precipitation formed. After cooling, the precipitate was collected by filtration and washed with cold 1:1 acetonitrile/water to give 2-cyclopropyl-4-methyl-N-(4-(2-oxo-1,2,3,4-tetrahydroquinolin-6-yl)thiazol-2-yl)oxazole-5-carboxamide (0.143 g, 89%) as a beige solid.)",Chemu text,NER
1,59.43823,33.936262,19.367281,"(Step 9: tert-butyl (2R,3S)-2-(2,5-difluorophenyl)-5-oxotetrahydro-2H-pyran-3-ylcarbamate\nTert-butyl (2R,3S)-2-(2,5-difluorophenyl)-5-hydroxyltetrahydro-2H-pyran-3-ylcarbamate (2.33 g, 7.08 mmol) was dissolved in a mixed solution of 24 mL acetonitrile, 4 mL water and 4 mL acetic acid. Thereto was added an aqueous solution (4 mL) of ruthenium chloride hydrate (3.7 mg, 0.0142 mmol), and cooled to 0 °C. Sodium bromate (535 mg, 3.54 mmol) was added, and stirred at low temperature for about 1.5 hours until the raw materials were completely reacted. To the reaction solution was added 120 mL water, stirred at 0 °C overnight, and extracted with dichloromethane. The organic phase was washed with water, dried and concentrated, and then the residue was purified by silica gel column chromatography (petroleum ether/ethyl acetate, 10:1) to give the intermediate 1 tert-butyl (2R,3S)-2-(2,5-difluorophenyl)-5-oxotetrahydro-2H-pyran-3-ylcarbamate (1.71 g) as a white solid in 74% yield., Example 1: (2R,3S,5R)-5-(2,3,4,5,6,7-hexahydropyrrolo[3',4':3,4]pyrazolo[1,5-b][1,2]thiazine -1,1-dioxide-6-yl)-2-(2,5-difluorophenyl)tetrahydro-2H-pyran-3-amine\nStep1: tert-butyl (2R,3S,5R)-5-(2,3,4,5,6,7-hexahydropyrrolo[3',4':3,4]pyrazolo[1,5-b][1,2] thiazine-1,1-dioxide-6-yl)-2-(2,5-difluorophenyl)tetrahydro-2H-pyran-3-ylcarbamate\nTo N,N-dimethylacetamide (2 mL) were added tert-butyl (2R,3S)-2-(2,5-di-fluorophenyl) -5-oxotetrahydro-2H-pyran-3-ylcarbamate (48 mg, 0.146 mmol), 2,3,4,5,6,7-hexahydropyrrolo [3',4':3,4]pyrazolo[1,5-b][1,2]thiazine-1,1-dioxide p-toluenesulfonate (48 mg, 0.13 mmol) and triethylamine (10 mg, 0.1 mmol), and stirred at room temperature for 3 hours. After cooling in an ice bath, sodium triacetoxyborohydride (87 mg, 0.39 mmol) was added, slowly warmed to room temperature and stirred overnight. 
Saturated sodium bicarbonate solution was added, and the resulting mixture was extracted with dichloromethane, washed with saturated brine, dried, concentrated and then purified by silica gel column chromatography to give tert-butyl (2R,3S,5R)-5-(2,3,4,5,6,7-hexahydropyrrolo[3',4':3,4]pyrazolo[1,5-b][1,2]thiazine-1,1-dioxide-6-yl) -2-(2,5-difluorophenyl)tetrahydro-2H-pyran-3-ylcarbamate (48 mg). Yield: 71%.)","(Step 9: tert-butyl (2R,3S)-2-(2,5-difluorophenyl)-5-oxotetrahydro-2H-pyran-3-ylcarbamate\nTert-butyl (2R,3S)-2-(2,5-difluorophenyl)-5-hydroxyltetrahydro-2H-pyran-3-ylcarbamate (2.33 g, 7.08 mmol) was dissolved in a mixed solution of 24 mL acetonitrile, 4 mL water and 4 mL acetic acid. Thereto was added an aqueous solution (4 mL) of ruthenium chloride hydrate (3.7 mg, 0.0142 mmol), and cooled to 0 °C. Sodium bromate (535 mg, 3.54 mmol) was added, and stirred at low temperature for about 1.5 hours until the raw materials were completely reacted. To the reaction solution was added 120 mL water, stirred at 0 °C overnight, and extracted with dichloromethane. 
The organic phase was washed with water, dried and concentrated, and then the residue was purified by silica gel column chromatography (petroleum ether/ethyl acetate, 10:1) to give the intermediate 1 tert-butyl (2R,3S)-2-(2,5-difluorophenyl)-5-oxotetrahydro-2H-pyran-3-ylcarbamate (1.71 g) as a white solid in 74% yield., Example 1: (2R,3S,5R)-5-(2,3,4,5,6,7-hexahydropyrrolo[3',4':3,4]pyrazolo[1,5-b][1,2]thiazine -1,1-dioxide-6-yl)-2-(2,5-difluorophenyl)tetrahydro-2H-pyran-3-amine\nStep1: tert-butyl (2R,3S,5R)-5-(2,3,4,5,6,7-hexahydropyrrolo[3',4':3,4]pyrazolo[1,5-b][1,2] thiazine-1,1-dioxide-6-yl)-2-(2,5-difluorophenyl)tetrahydro-2H-pyran-3-ylcarbamate\nTo N,N-dimethylacetamide (2 mL) were added tert-butyl (2R,3S)-2-(2,5-di-fluorophenyl) -5-oxotetrahydro-2H-pyran-3-ylcarbamate (48 mg, 0.146 mmol), 2,3,4,5,6,7-hexahydropyrrolo [3',4':3,4]pyrazolo[1,5-b][1,2]thiazine-1,1-dioxide p-toluenesulfonate (48 mg, 0.13 mmol) and triethylamine (10 mg, 0.1 mmol), and stirred at room temperature for 3 hours. After cooling in an ice bath, sodium triacetoxyborohydride (87 mg, 0.39 mmol) was added, slowly warmed to room temperature and stirred overnight. Saturated sodium bicarbonate solution was added, and the resulting mixture was extracted with dichloromethane, washed with saturated brine, dried, concentrated and then purified by silica gel column chromatography to give tert-butyl (2R,3S,5R)-5-(2,3,4,5,6,7-hexahydropyrrolo[3',4':3,4]pyrazolo[1,5-b][1,2]thiazine-1,1-dioxide-6-yl) -2-(2,5-difluorophenyl)tetrahydro-2H-pyran-3-ylcarbamate (48 mg). Yield: 71%.)","(Step 9: tert-butyl (2R,3S)-2-(2,5-difluorophenyl)-5-oxotetrahydro-2H-pyran-3-ylcarbamate\nTert-butyl (2R,3S)-2-(2,5-difluorophenyl)-5-hydroxyltetrahydro-2H-pyran-3-ylcarbamate (2.33 g, 7.08 mmol) was dissolved in a mixed solution of 24 mL acetonitrile, 4 mL water and 4 mL acetic acid. Thereto was added an aqueous solution (4 mL) of ruthenium chloride hydrate (3.7 mg, 0.0142 mmol), and cooled to 0 °C. 
Sodium bromate (535 mg, 3.54 mmol) was added, and stirred at low temperature for about 1.5 hours until the raw materials were completely reacted. To the reaction solution was added 120 mL water, stirred at 0 °C overnight, and extracted with dichloromethane. The organic phase was washed with water, dried and concentrated, and then the residue was purified by silica gel column chromatography (petroleum ether/ethyl acetate, 10:1) to give the intermediate 1 tert-butyl (2R,3S)-2-(2,5-difluorophenyl)-5-oxotetrahydro-2H-pyran-3-ylcarbamate (1.71 g) as a white solid in 74% yield., Example 1: (2R,3S,5R)-5-(2,3,4,5,6,7-hexahydropyrrolo[3',4':3,4]pyrazolo[1,5-b][1,2]thiazine -1,1-dioxide-6-yl)-2-(2,5-difluorophenyl)tetrahydro-2H-pyran-3-amine\nStep1: tert-butyl (2R,3S,5R)-5-(2,3,4,5,6,7-hexahydropyrrolo[3',4':3,4]pyrazolo[1,5-b][1,2] thiazine-1,1-dioxide-6-yl)-2-(2,5-difluorophenyl)tetrahydro-2H-pyran-3-ylcarbamate\nTo N,N-dimethylacetamide (2 mL) were added tert-butyl (2R,3S)-2-(2,5-di-fluorophenyl) -5-oxotetrahydro-2H-pyran-3-ylcarbamate (48 mg, 0.146 mmol), 2,3,4,5,6,7-hexahydropyrrolo [3',4':3,4]pyrazolo[1,5-b][1,2]thiazine-1,1-dioxide p-toluenesulfonate (48 mg, 0.13 mmol) and triethylamine (10 mg, 0.1 mmol), and stirred at room temperature for 3 hours. After cooling in an ice bath, sodium triacetoxyborohydride (87 mg, 0.39 mmol) was added, slowly warmed to room temperature and stirred overnight. Saturated sodium bicarbonate solution was added, and the resulting mixture was extracted with dichloromethane, washed with saturated brine, dried, concentrated and then purified by silica gel column chromatography to give tert-butyl (2R,3S,5R)-5-(2,3,4,5,6,7-hexahydropyrrolo[3',4':3,4]pyrazolo[1,5-b][1,2]thiazine-1,1-dioxide-6-yl) -2-(2,5-difluorophenyl)tetrahydro-2H-pyran-3-ylcarbamate (48 mg). Yield: 71%.)",Chemu text,NER
2,79.425707,52.160029,46.756433,"(Example 9\n5-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-N-(1-(3-fluoropropyl)azetidin-3-yl)-4-methoxypyridin-2-amine\nDMF (1 mL) and DIPEA (0.022 ml, 0.13 mmol) were added sequentially to a flask charged with N-(azetidin-3-yl)-5-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-4-methoxypyridin-2-amine (22 mg, 0.050 mmol). 1-Fluoro-3-iodopropane (9 mg, 0.05 mmol) in DMF (0.1 mL) was then added. After 2 hours, the reaction was diluted with saturated aqueous sodium chloride, and the mixture was extracted in EtOAC (3×). The combined organic layers were washed with water and dried over sodium sulfate, filtered and concentrated under reduced pressure. The resulting crude film was purified by flash silica chromatography, elution gradient 2 to 10% MeOH in DCM, to give 5-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-N-(1-(3-fluoropropyl)azetidin-3-yl)-4-methoxypyridin-2-amine (9.0 mg, 36%) as a dry film., Example 19\nPreparation of N-(4-((6S,8R)-7-(2,2-difluoropropyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-3-methoxyphenyl)-1-(3-fluoropropyl)azetidin-3-amine\nDMF (2 mL) and DIPEA (0.074 mL, 0.42 mmol) were added sequentially to a flask charged with N-(4-((6S,8R)-7-(2,2-difluoropropyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-3-methoxyphenyl)azetidin-3-amine (75 mg, 0.17 mmol). 1-Fluoro-3-iodopropane (31.9 mg, 0.17 mmol) in DMF (0.1 mL) was then added, and stirring was continued for 2 hours. The reaction was stopped, diluted with saturated aqueous sodium chloride and the compound was extracted in EtOAC (×3). The combined extracts were washed with water and dried over sodium sulfate, filtered and concentrated under reduced pressure to afford a film. 
This material was purified by flash silica chromatography, eluting with 2 to 10% (methanol containing 1% ammonium hydroxide) in DCM to afford N-(4-((6S,8R)-7-(2,2-difluoropropyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-3-methoxyphenyl)-1-(3-fluoropropyl)azetidin-3-amine (43 mg, 51%).)","(Example 9\n5-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-N-(1-(3-fluoropropyl)azetidin-3-yl)-4-methoxypyridin-2-amine\nDMF (1 mL) and DIPEA (0.022 ml, 0.13 mmol) were added sequentially to a flask charged with N-(azetidin-3-yl)-5-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-4-methoxypyridin-2-amine (22 mg, 0.050 mmol). 1-Fluoro-3-iodopropane (9 mg, 0.05 mmol) in DMF (0.1 mL) was then added. After 2 hours, the reaction was diluted with saturated aqueous sodium chloride, and the mixture was extracted in EtOAC (3×). The combined organic layers were washed with water and dried over sodium sulfate, filtered and concentrated under reduced pressure. The resulting crude film was purified by flash silica chromatography, elution gradient 2 to 10% MeOH in DCM, to give 5-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-N-(1-(3-fluoropropyl)azetidin-3-yl)-4-methoxypyridin-2-amine (9.0 mg, 36%) as a dry film., Example 6\nN-(4-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-3-methoxyphenyl)-1-(3-fluoropropyl)azetidin-3-amine\n1-Fluoro-3-iodopropane (40.9 μL, 0.39 mmol) was added to a solution of N-(4-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-3-methoxyphenyl)azetidin-3-amine (169 mg, 0.39 mmol) and DIPEA (203 μL, 1.16 mmol) in NMP (1.7 mL) at room temperature. After 18 hours, the reaction was concentrated under reduced pressure. 
The resulting residue was purified by reverse phase flash chromatography (C18), eluting with decreasingly polar mixtures of water (containing 0.2% ammonium hydroxide) and MeCN as eluents. Product fractions were combined and lyopholized to afford N-(4-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-3-methoxyphenyl)-1-(3-fluoropropyl)azetidin-3-amine (75 mg, 39%) as clear residue.)","(Example 9\n5-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-N-(1-(3-fluoropropyl)azetidin-3-yl)-4-methoxypyridin-2-amine\nDMF (1 mL) and DIPEA (0.022 ml, 0.13 mmol) were added sequentially to a flask charged with N-(azetidin-3-yl)-5-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-4-methoxypyridin-2-amine (22 mg, 0.050 mmol). 1-Fluoro-3-iodopropane (9 mg, 0.05 mmol) in DMF (0.1 mL) was then added. After 2 hours, the reaction was diluted with saturated aqueous sodium chloride, and the mixture was extracted in EtOAC (3×). The combined organic layers were washed with water and dried over sodium sulfate, filtered and concentrated under reduced pressure. 
The resulting crude film was purified by flash silica chromatography, elution gradient 2 to 10% MeOH in DCM, to give 5-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-N-(1-(3-fluoropropyl)azetidin-3-yl)-4-methoxypyridin-2-amine (9.0 mg, 36%) as a dry film., Example 6\nN-(4-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-3-methoxyphenyl)-1-(3-fluoropropyl)azetidin-3-amine\n1-Fluoro-3-iodopropane (40.9 μL, 0.39 mmol) was added to a solution of N-(4-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-3-methoxyphenyl)azetidin-3-amine (169 mg, 0.39 mmol) and DIPEA (203 μL, 1.16 mmol) in NMP (1.7 mL) at room temperature. After 18 hours, the reaction was concentrated under reduced pressure. The resulting residue was purified by reverse phase flash chromatography (C18), eluting with decreasingly polar mixtures of water (containing 0.2% ammonium hydroxide) and MeCN as eluents. Product fractions were combined and lyopholized to afford N-(4-((6S,8R)-7-((1-fluorocyclopropyl)methyl)-8-methyl-6,7,8,9-tetrahydro-3H-pyrazolo[4,3-f]isoquinolin-6-yl)-3-methoxyphenyl)-1-(3-fluoropropyl)azetidin-3-amine (75 mg, 39%) as clear residue.)",Chemu text,NER
3,50.937163,13.844644,7.317617,"(3,5-Dichloro-4-(5-fluoro-2-methyl-4-oxo-1,7-naphthyridin-1 (4H)-yl)benzonitrile (Intermediate 12)\nStep 1: 3,5-Difluoro-N-methoxy-N-methylisonicotinamide\n3,5-difluoroisonicotinic acid (18.78 g, 118 mmol), N,O-dimethylhydroxylamine hydrochloride (12.09 g, 124 mmol), HATU (47.1 g, 124 mmol) and DIPEA (61.9 ml, 354 mmol) were suspened in DCM (236 ml). The mixture was allowed to stir at room temperature for 24 hours. The mixture was concentrated under reduced pressure. The residue was extracted with EtOAc washing with saturated aqueous NH4Cl solution (2×), water, saturated NaHCO3 and brine. The organic layer was dried over sodium sulfate and concentrated. The crude was purified by chromatography on silica gel using 0-75% EtOAc in heptanes as eluent to give the title compound (18.9 g, 79% yield)., The second step\nTo a solution of Compound 36 (1.14g, 1.97mmol) in DMF (12 mL) were added cesium carbonate (1.29 g, 3.95 mmol) and 1,2-dibromoethane (0.255 mL, 2.96 mmol), and the mixture was stirred at room temperature for 18.5 hours. The reaction mixture was extracted with ethyl acetate after addition of water. The organic layer was washed with water and saturated brine, dried over anhydrous magnesium sulfate, and concentrated under reduced pressure. The resulting residue was purified by silica gel chromatography (hexane-ethyl acetate) to yield Compound 37 (1.13g, yield 95%).)","(3,5-Dichloro-4-(5-fluoro-2-methyl-4-oxo-1,7-naphthyridin-1 (4H)-yl)benzonitrile (Intermediate 12)\nStep 1: 3,5-Difluoro-N-methoxy-N-methylisonicotinamide\n3,5-difluoroisonicotinic acid (18.78 g, 118 mmol), N,O-dimethylhydroxylamine hydrochloride (12.09 g, 124 mmol), HATU (47.1 g, 124 mmol) and DIPEA (61.9 ml, 354 mmol) were suspened in DCM (236 ml). The mixture was allowed to stir at room temperature for 24 hours. The mixture was concentrated under reduced pressure. 
The residue was extracted with EtOAc washing with saturated aqueous NH4Cl solution (2×), water, saturated NaHCO3 and brine. The organic layer was dried over sodium sulfate and concentrated. The crude was purified by chromatography on silica gel using 0-75% EtOAc in heptanes as eluent to give the title compound (18.9 g, 79% yield)., (Example 12)\nThe first step\nTo a solution of Compound 58 (850mg, 1.91mmol) and 3-bromoprop-1-yne (455mg, 3.82mmol) in DMF (5mL) was added 60% sodium hydride (153mg, 3.82mmol) under ice-cooling, and the mixture was stirred for 1 hour at 0 °C. After the reaction mixture was diluted with ethyl acetate, water was added thereto, and the mixture was extracted with ethyl acetate. The organic layer was washed with water, dried over anhydrous sodium sulfate, and concentrated under reduced pressure. The resulting residue was dissolved in ethanol (20 mL) and water (3 mL), ammonium chloride (1.02 g, 19.1 mmol) and iron powder (1.07 g, 19.1 mmol) was added thereto, and the mixture was stirred for 4 hours at 90 °C. After the insoluble materials were removed by filtration, and the filtrate was concentrated under reduced pressure. The resulting residue was extracted with chloroform after addition of chloroform and water. The organic layer was dried over anhydrous sodium sulfate, and concentrated under reduced pressure. The resulting residue was purified by silica gel chromatography (hexane-ethyl acetate) to yield Compound 59 (251 mg, yield 29%).)","(3,5-Dichloro-4-(5-fluoro-2-methyl-4-oxo-1,7-naphthyridin-1 (4H)-yl)benzonitrile (Intermediate 12)\nStep 1: 3,5-Difluoro-N-methoxy-N-methylisonicotinamide\n3,5-difluoroisonicotinic acid (18.78 g, 118 mmol), N,O-dimethylhydroxylamine hydrochloride (12.09 g, 124 mmol), HATU (47.1 g, 124 mmol) and DIPEA (61.9 ml, 354 mmol) were suspened in DCM (236 ml). The mixture was allowed to stir at room temperature for 24 hours. The mixture was concentrated under reduced pressure. 
The residue was extracted with EtOAc washing with saturated aqueous NH4Cl solution (2×), water, saturated NaHCO3 and brine. The organic layer was dried over sodium sulfate and concentrated. The crude was purified by chromatography on silica gel using 0-75% EtOAc in heptanes as eluent to give the title compound (18.9 g, 79% yield)., Synthesis of Compound 13\nTo a mixture of compound 12 (0.45 g, 4.2 mol) and Et3N (8 mL, 0.06 mol) in THF (15 mL), a solution of 2-(tert-Butoxycarbonyloxyimino)-2-phenylacetonitrile (2.1 g, 8.3 mol) in THF (30 mL) was added dropwise at 0° C. Following complete addition, the solution was allowed to warm to room temperature and left to stir for 4 hours. The reaction mixture was concentrated to oil under reduced pressure and CH2Cl2 (50 mL) was added. The mixture was washed with sodium hydroxide (5%, 30 mL) and brine (30 mL). The organic layer was dried over sodium sulfate and concentrated under reduced pressure. The crude product was purified by column chromatography (silica gel, MeOH:CH2Cl2=1:10, v/v) to give compound 13 as a yellow oil (803.1 mg, 60.7%). LC-MS m/z (ES+), 304.22 (M+H)+.)",Chemu text,NER
4,57.138872,18.352924,5.902813,"(Example 73\nPreparation of 2-chloro-3-fluoro-5-nitrobenzoic acid (C206)\nTo a suspension of 2-chloro-3-fluorobenzoic acid (10 g, 57.5 mmol) in concentrated sulfuric acid (62 mL, 1149.5 mmol) was added concentrated nitric acid (4 mL, 86.2 mmol) dropwise at −10° C. and the reaction mixture was stirred between −10° C. and 0° C. for 3 hours. The reaction mixture was slowly poured into a beaker of crushed ice (˜1 L), and the resulting precipitated solid was filtered and washed with water (100 mL). The crude product was recrystallized from hot water to afford the title compound as an off-white solid (7.5 g, 60%): mp 163-165° C.;, Step D: 7-Chloro-6-(difluoromethyl)-4-methyl-2-(trifluoromethyl)pyrazolo[1,5-c]pyridine-3-carboxylic acid. To a solution of ethyl 7-chloro-6-(difluoromethyl)-4-methyl-2-(trifluoromethyl)pyrazolo[1,5-a]pyridine-3-carboxylate (2.6 g, 7.29 mmol) in a mixture of 1,4-dioxane (30 mL) and water (6 mL) was added lithium hydroxide monohydrate (459 mg, 10.9 mmol) and the reaction mixture was stirred at 60° C. for 4 h. To the reaction mixture was added lithium hydroxide monohydrate (459 mg, 10.9 mmol) and the reaction mixture was stirred at 60° C. for 18 h. To the reaction mixture was added lithium hydroxide monohydrate (459 mg, 10.9 mmol) and the reaction mixture was stirred at 60° C. for 2 h. To the reaction mixture was added a fourth portion of lithium hydroxide monohydrate (459 mg, 10.9 mmol) and the reaction mixture was stirred at 60° C. for 2 h. The reaction mixture was concentrated to 6 mL under vacuum. The residue was diluted with water (20 mL) and the mixture was washed with chloroform (1×10 mL). The aqueous layer was acidified to pH 4 by addition of 1 M hydrochloric acid. 
The precipitate was collected and the solid was washed with water (2×20 mL) to give the title compound (1.80 g, 5.48 mmol, 75%) as a white crystalline solid.)","(Example 73\nPreparation of 2-chloro-3-fluoro-5-nitrobenzoic acid (C206)\nTo a suspension of 2-chloro-3-fluorobenzoic acid (10 g, 57.5 mmol) in concentrated sulfuric acid (62 mL, 1149.5 mmol) was added concentrated nitric acid (4 mL, 86.2 mmol) dropwise at −10° C. and the reaction mixture was stirred between −10° C. and 0° C. for 3 hours. The reaction mixture was slowly poured into a beaker of crushed ice (˜1 L), and the resulting precipitated solid was filtered and washed with water (100 mL). The crude product was recrystallized from hot water to afford the title compound as an off-white solid (7.5 g, 60%): mp 163-165° C.;, Synthesis of Compound A-1\nA solution of N-phenyl-[1,1'-biphenyl]-2-amine (0.64 g, 2.61 mmol) in 1,2-dichlorobenzene (5 mL) was bubbled with nitrogen gas for 30 mins, boron tribromide (1M in dichloromethane, 4 mL, 4.00 mmol) was added and the reaction mixture was heated to 180° C. for 3 hours. After cooling the reaction mixture to room temperature (22° C.), diphenylzine (1.27 g, 5.78 mmol) was added to the reaction mixture inside the glovebox. The reaction mixture was heated to 120° C. for 6 hours. The reaction mixture was cooled to room temperature (22° C.), water was added and the reaction mixture was extracted with diethyl ether (×3). The combined organic layer was washed with water and brine solution in this sequence, dried over MgSO4 and concentrated in vacuo. The residue was purified by flash column chromatography with dichloromethane-hexane (1:9 to 2:8) to give Compound A-1 (0.68 g, 80% yield) as white solid.)","(Example 73\nPreparation of 2-chloro-3-fluoro-5-nitrobenzoic acid (C206)\nTo a suspension of 2-chloro-3-fluorobenzoic acid (10 g, 57.5 mmol) in concentrated sulfuric acid (62 mL, 1149.5 mmol) was added concentrated nitric acid (4 mL, 86.2 mmol) dropwise at −10° C. 
and the reaction mixture was stirred between −10° C. and 0° C. for 3 hours. The reaction mixture was slowly poured into a beaker of crushed ice (˜1 L), and the resulting precipitated solid was filtered and washed with water (100 mL). The crude product was recrystallized from hot water to afford the title compound as an off-white solid (7.5 g, 60%): mp 163-165° C.;, Intermediate 1: 5-Bromo-7-chloroindolin-2-one\nTo a cooled (0° C.) solution of 7-chloroindolin-2-one (1.0 g, 6.0 mmol) in TFA (11 mL) was added N-bromosuccinimide (1.0 g, 6.0 mmol) in portions. The reaction mixture was stirred at 0° C. for 6 h. The solvent was removed in vacuo and the residue was diluted with DCM (25 mL) and concentrated, followed by a similar sequence with EtOAc. The crude product was triturated with ethanol to provide the title compound as a white solid (861 mg, 58% yield).)",Chemu text,NER


In [45]:
# Inspect one low-similarity Chemu example: order rows by Unigram score
# (lowest first), keep the first 10, and take the Unigram_detail value of the
# 6th row. That value is unpacked so each element is printed on its own,
# separated by a blank line.
print(
    *(
        df_chemu_text
        .sort_values(by="Unigram", ascending=True)
        .head(10)
        .loc[:, "Unigram_detail"]
        .iloc[5]
    ),
    sep="\n\n",
)

To a solution of 5-Bromo-4-methoxy-pyridin-2-ylamine (53 g, 0.261 mol, 1.0 eq.) in EtOH:H2O=4:1 (500 mL) is added chloro-acetaldehyde (24.589 g, 0.313 mol, 1.2 eq.), then NaHCO3 (26.3 g, 0.313 mol, 1.2 eq.) is added. The resultant mixture is heated to 90° C. for 4 h. After cooling to r.t., the organic solvent is evaporated. The residue is extracted with DCM (200 mL×3). The organic layer are combined, dried over Na2SO4, filtered and concentrated. The crude product is purified by silica gel chromatography (DCM:MeOH=50:1) to afford compound 6-Bromo-7-methoxy-imidazo[1,2-a]pyridine (39 g, 66%) as a brown solid.

Step 1. N-(5-bromo-2-fluoropyridin-3-yl)ethanesulfonamide
To a solution of 5-bromo-2-fluoropyridin-3-amine (3 g, 20 mmol) in pyridine (10 mL) and DCM (20 mL) at room temperature was added ethanesulfonyl chloride (2.2 mL, 24 mmol). After stirring for 30 min, the solvent was evaporated. The resulting residue was diluted with MeOH (4 mL) and partitioned between EtOAc and brine. The or

In [46]:
# Inspect one high-similarity Chemu example: order rows by Unigram score
# (highest first), keep the first 10, and take the Unigram_detail value of the
# 6th row. That value is unpacked so each element is printed on its own,
# separated by a blank line.
print(
    *(
        df_chemu_text
        .sort_values(by="Unigram", ascending=False)
        .head(10)
        .loc[:, "Unigram_detail"]
        .iloc[5]
    ),
    sep="\n\n",
)

EXAMPLE 6
5-(3-methoxyphenyl)-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quinazolin-3-yl)-piperidin-1-yl]-ethyl}-1H-pyrimidine-2,4-dione (reaction scheme 1, compound 6)
Similarly to example 1.8, starting from 71 mg (0.5 mmol) of 3-methoxyphenylboronic acid and 150 mg (0.3 mmol) of 5-bromo-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quinazolin-3-yl)-piperidin-1-yl]-ethyl}-1H-pyrimidine-2,4-dione (prepared as described in example 2.5), 120 mg (77%) of 5-(3-methoxyphenyl)-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quinazolin-3-yl)-piperidin-1-yl]-ethyl}-1H-pyrimidine-2,4-dione is obtained in the form of a white solid with a melting point of 251° C.

EXAMPLE 4
5-(3-fluorophenyl)-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quinazolin-3-yl)-piperidin-1-yl]-ethyl}-1H-pyrimidine-2,4-dione (reaction scheme 1, compound 4)
Similarly to example 1.8, starting from 66 mg (0.5 mmol) of 3-fluorophenylboronic acid and 150 mg (0.3 mmol) of 5-bromo-3-methyl-1-{2-oxo-2-[4-(2-oxo-1,4-dihydro-2H-quina

## Summary

In [47]:
# Stack the per-dataset result frames row-wise into one frame that the summary
# tables below group on. Each frame keeps its original index (no re-indexing).
df_summary = pd.concat(
    [
        df_sst2,
        df_aimed_unique,
        df_aimed_random,
        df_bc2_gm,
        df_bc3_act,
        df_chemu,
    ]
)

In [48]:
# Descriptive statistics (count, mean, std, quartiles, ...) per data set.
# stack() moves the statistic names from the columns into the row index, so
# each row is one (data_set, statistic) pair.
(
    df_summary
    .groupby(["data_set"])
    .describe()
    .stack()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram,test_label
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AIMED (R),count,584.000000,584.000000,584.000000,0.0
AIMED (R),mean,96.947936,82.294414,73.146194,
AIMED (R),std,11.921198,18.519621,21.285885,
AIMED (R),min,27.498597,8.247861,0.000000,
AIMED (R),25%,100.000000,77.777778,66.666667,
...,...,...,...,...,...
SST2,min,0.000000,0.000000,0.000000,
SST2,25%,36.514837,0.000000,0.000000,
SST2,50%,43.643578,16.666667,0.000000,
SST2,75%,53.452248,30.151134,0.000000,


In [49]:
# Mean of every numeric column per (task_type, data_set).
# numeric_only=True is required because df_summary also holds the object-dtype
# "*_detail" columns; on pandas >= 2.0 a bare .mean() raises TypeError on them
# instead of silently dropping them as older versions did.
df_summary.groupby(["task_type", "data_set"]).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram,test_label
task_type,data_set,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CLS,BC3ACT,26.760271,6.907863,1.807134,0.151667
CLS,SST2,46.055815,17.378379,1.388049,
NER,BC2GM anno,70.766285,19.552406,5.41172,
NER,BC2GM text,33.19164,13.122176,4.200074,
NER,Chemu anno,84.286901,30.670974,6.832106,
NER,Chemu text,68.446277,42.391903,31.631991,
REL,AIMED (R),96.947936,82.294414,73.146194,
REL,AIMED (U),67.137026,36.067096,20.770509,


In [50]:
# Emit the per-(data_set, task_type) mean scores as a LaTeX table, with floats
# rendered to two decimal places.
# numeric_only=True restricts the mean to numeric columns; without it the
# object-dtype "*_detail" columns make .mean() raise TypeError on pandas >= 2.0.
print(
    df_summary
    .groupby(["data_set", "task_type"])
    .mean(numeric_only=True)
    .to_latex(float_format="{:.2f}".format)
)

\begin{tabular}{llrrrr}
\toprule
      &     &  Unigram &  Bigram &  Trigram &  test\_label \\
data\_set & task\_type &          &         &          &             \\
\midrule
AIMED (R)  & REL & 96.95 & 82.29 & 73.15 & nan \\
AIMED (U)  & REL & 67.14 & 36.07 & 20.77 & nan \\
BC2GM anno & NER & 70.77 & 19.55 & 5.41 & nan \\
BC2GM text & NER & 33.19 & 13.12 & 4.20 & nan \\
BC3ACT  & CLS & 26.76 & 6.91 & 1.81 & 0.15 \\
Chemu anno & NER & 84.29 & 30.67 & 6.83 & nan \\
Chemu text & NER & 68.45 & 42.39 & 31.63 & nan \\
SST2  & CLS & 46.06 & 17.38 & 1.39 & nan \\
\bottomrule
\end{tabular}

