# Similarity comparison 

In [1]:
# Standard library
import os
import sys
import tempfile

# Third-party
import pandas as pd

# Extend the module search path with the relative "src" directory
# (presumably where the project modules imported by later cells live — confirm).
sys.path.append("src")

In [2]:
# Make sure the working directory exists; an already existing directory is not an error.
# NOTE(review): no other visible cell references working_dir, and the input files
# below are read from "tmp" rather than "tempv2" — confirm this is still needed.
working_dir = "tempv2"
os.makedirs(name=working_dir, exist_ok=True)

In [3]:
def scores_to_df(scores, details, data_set, data_type, task_type):
    """Collect scores and their details into one labelled DataFrame.

    Args:
        scores: Mapping of column name to a column of values (one entry per
            row), e.g. ``{"Unigram": [...], "Bigram": [...]}``.
        details: Mapping of column name to a column of per-row detail values;
            each one is stored under ``<name>_detail``.
        data_set: Data set name, e.g. ``"BC2GM"``.
        data_type: Optional qualifier appended to the data set name, e.g.
            ``"text"``. Pass ``""`` when there is no qualifier.
        task_type: Label stored in the ``task_type`` column.

    Returns:
        A ``pandas.DataFrame`` with one column per entry in ``scores``, one
        ``*_detail`` column per entry in ``details``, and the constant
        ``data_set`` and ``task_type`` label columns.
    """
    df = pd.DataFrame()
    for name, values in scores.items():
        df[name] = values

    for name, values in details.items():
        df[name + "_detail"] = values

    # Join only the non-empty parts: with an empty data_type the plain
    # concatenation produced labels such as "SST2 " (trailing space), which
    # breaks exact-match filtering on the data_set column.
    df["data_set"] = " ".join(part for part in (data_set, data_type) if part)
    df["task_type"] = task_type

    return df

## 1. BC2GM

Train/test overlap for the [BioCreative II gene mention](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/) task. Please download the train and test files for this task from the BioCreative website; the cells below read them from the `tmp` directory.


In [4]:
# BC2GM input files (train/test pair), read from the "tmp" directory.
bc2gm_train_file, bc2gm_test_file = (
    os.path.join("tmp", file_name) for file_name in ("train.in", "test.in")
)

# Matching *.eval files (train/test pair), also read from "tmp".
bc2gm_train_eval_file, bc2gm_test_eval_file = (
    os.path.join("tmp", file_name) for file_name in ("trainGENE.eval", "testGENE.eval")
)

In [5]:
from bc2_gene_mention import BC2GeneMentionText


bc2gmrun = BC2GeneMentionText()

# "text" mode: run the comparer on the *.in files (train file first, then test file).
result_score, result_detail = bc2gmrun.run_similarity_comparer(
    "text",
    bc2gm_train_file,
    bc2gm_test_file,
)
df_bc2_gm_text = scores_to_df(result_score, result_detail, "BC2GM", "text", "NER")

# "eval" mode: same call on the *.eval files; these rows are labelled "anno".
result_score, result_detail = bc2gmrun.run_similarity_comparer(
    "eval",
    bc2gm_train_eval_file,
    bc2gm_test_eval_file,
)
df_bc2_gm_eval = scores_to_df(result_score, result_detail, "BC2GM", "anno", "NER")

# Stack both result frames; each keeps its own row index (no ignore_index).
df_bc2_gm = pd.concat(
    [
        df_bc2_gm_text,
        df_bc2_gm_eval,
    ]
)

Exact matches Unigram, 39 / 5000
Exact matches Bigram, 26 / 5000
Exact matches Trigram, 26 / 5000
Exact matches Unigram, 1998 / 6331
Exact matches Bigram, 544 / 6331
Exact matches Trigram, 135 / 6331


In [6]:
# Preview two randomly chosen rows. The fixed random_state makes the preview
# identical on every "Restart & Run All" instead of changing with each run.
df_bc2_gm.sample(n=2, random_state=0)

Unnamed: 0,Unigram,Bigram,Trigram,Unigram_detail,Bigram_detail,Trigram_detail,data_set,task_type
3748,49.123593,16.012815,11.313708,"(Plasma Pi, tibia breaking strength, and perce...","(Plasma Pi, tibia breaking strength, and perce...","(Plasma Pi, tibia breaking strength, and perce...",BC2GM text,NER
4355,44.992127,17.407766,5.598925,(The predicted vav oncogene protein sequence e...,(The predicted vav oncogene protein sequence e...,(The predicted vav oncogene protein sequence e...,BC2GM text,NER


In [7]:
# Summary statistics per data_set label, stacked so each statistic becomes a row.
(
    df_bc2_gm
    .groupby(["data_set"])
    .describe()
    .stack()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BC2GM anno,count,6331.0,6331.0,6331.0
BC2GM anno,mean,70.558642,19.611408,5.467155
BC2GM anno,std,35.991581,35.818859,20.917909
BC2GM anno,min,0.0,0.0,0.0
BC2GM anno,25%,57.735027,0.0,0.0
BC2GM anno,50%,81.649658,0.0,0.0
BC2GM anno,75%,100.0,28.867513,0.0
BC2GM anno,max,100.0,100.0,100.0
BC2GM text,count,5000.0,5000.0,5000.0
BC2GM text,mean,47.618043,18.005105,9.418352


In [8]:
# Detail entry of the BC2GM text row with the highest Unigram score.
(
    df_bc2_gm_text
    .sort_values(by=["Unigram"], ascending=False)
    .head(n=1)["Unigram_detail"]
    .iloc[0]
)

('Although RAD17, RAD24 and MEC3 are not required for cell cycle arrest when S phase is inhibited by hydroxyurea (HU), they do contribute to the viability of yeast cells grown in the presence of HU, possibly because they are required for the repair of HU-induced DNA damage.',
 'Although RAD17, RAD24 and MEC3 are not required for cell cycle arrest when S phase is inhibited by hydroxyurea (HU), they do contribute to the viability of yeast cells grown in the presence of HU, possibly because they are required for the repair of HU-induced DNA damage.')

In [9]:
# Detail entry of the BC2GM text row with the highest Trigram score.
(
    df_bc2_gm_text
    .sort_values(by=["Trigram"], ascending=False)
    .head(n=1)["Trigram_detail"]
    .iloc[0]
)

('Although RAD17, RAD24 and MEC3 are not required for cell cycle arrest when S phase is inhibited by hydroxyurea (HU), they do contribute to the viability of yeast cells grown in the presence of HU, possibly because they are required for the repair of HU-induced DNA damage.',
 'Although RAD17, RAD24 and MEC3 are not required for cell cycle arrest when S phase is inhibited by hydroxyurea (HU), they do contribute to the viability of yeast cells grown in the presence of HU, possibly because they are required for the repair of HU-induced DNA damage.')

## 2. AIMED (Random)

In [10]:
# AIMed input file, read from the "tmp" directory; both AIMed comparisons below use it.
aimed_file = os.path.join("tmp", "AIMedFull_preprocessed.json")

In [11]:
from aimed_random import AIMedRandom

# Run the AIMedRandom comparer on the AIMed file and label the rows "AIMED (R)" / "REL".
result_score, result_detail = AIMedRandom().run_similarity_comparer(aimed_file)
df_aimed_random = scores_to_df(
    result_score,
    result_detail,
    "AIMED (R)",
    "",
    "REL",
)

In [12]:
# Summary statistics for the AIMED (R) results, one row per statistic.
(
    df_aimed_random
    .groupby(["data_set"])
    .describe()
    .stack()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AIMED (R),count,584.0,584.0,584.0
AIMED (R),mean,97.197474,86.321462,80.099087
AIMED (R),std,10.949881,17.633559,19.662401
AIMED (R),min,38.892223,12.171612,3.872015
AIMED (R),25%,100.0,85.714286,77.777778
AIMED (R),50%,100.0,90.909091,85.54007
AIMED (R),75%,100.0,95.581428,90.625
AIMED (R),max,100.0,100.0,100.0


In [13]:
# Detail entry of the AIMED (R) row with the highest Unigram score.
(
    df_aimed_random
    .sort_values(by=["Unigram"], ascending=False)
    .head(n=1)["Unigram_detail"]
    .iloc[0]
)

('Cross-linking of 125I- PROTEIN and 125I- PROTEIN1 to 293/ PROTEIN2 cells yielded predominant complexes with apparent molecular weights of 211,000 for PROTEIN and 205,000 and 244,000 for PROTEIN , suggesting these complexes contain two or three PROTEIN molecules.',
 'Cross-linking of 125I- PROTEIN and 125I- PROTEIN1 to 293/ PROTEIN cells yielded predominant complexes with apparent molecular weights of 211,000 for PROTEIN and 205,000 and 244,000 for PROTEIN , suggesting these complexes contain two or three PROTEIN2 molecules.')

In [14]:
# Detail entry of the AIMED (R) row with the lowest Unigram score.
(
    df_aimed_random
    .sort_values(by=["Unigram"], ascending=True)
    .head(n=1)["Unigram_detail"]
    .iloc[0]
)

('Evidently, large hydrophobic side chains of Leu13 and Phe36 play pivotal roles in stabilizing PROTEIN1 - PROTEIN2 interactions.',
 'Activation of PROTEIN1 by PROTEIN2 in NIH 3T3 cells and in vitro.')

## 3. AIMED (Unique Document)

In [15]:
from aimed_uniquedoc import AIMedUniqueDoc

# Run the AIMedUniqueDoc comparer on the same AIMed file; rows are labelled "AIMED (U)" / "REL".
result_score, result_detail = AIMedUniqueDoc().run_similarity_comparer(aimed_file)
df_aimed_unique = scores_to_df(
    result_score,
    result_detail,
    "AIMED (U)",
    "",
    "REL",
)

In [16]:
# Summary statistics for the AIMED (U) results, one row per statistic.
(
    df_aimed_unique
    .groupby(["data_set"])
    .describe()
    .stack()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AIMED (U),count,498.0,498.0,498.0
AIMED (U),mean,67.759493,31.843862,17.321522
AIMED (U),std,11.776797,13.327418,9.102501
AIMED (U),min,40.016337,11.009638,0.0
AIMED (U),25%,57.824895,21.709067,10.846523
AIMED (U),50%,65.465367,28.284271,15.430335
AIMED (U),75%,81.144083,38.340066,22.36068
AIMED (U),max,85.53372,66.205112,47.583095


In [17]:
# Detail entry of the AIMED (U) row with the highest Unigram score.
(
    df_aimed_unique
    .sort_values(by=["Unigram"], ascending=False)
    .head(n=1)["Unigram_detail"]
    .iloc[0]
)

('Interaction of the PROTEIN1 subunit of PROTEIN with the PROTEIN receptor ( PROTEIN ) and the PROTEIN2 receptor ( PROTEIN ) was investigated using the two-hybrid system by assessing for his3 and lacZ activation in S. cerevisiae.',
 'Evidence for a differential interaction of PROTEIN and the PROTEIN ( PROTEIN ) with the PROTEIN1 ( PROTEIN2 ) receptor in the yeast two-hybrid system.')

In [18]:
# Detail entry of the AIMED (U) row with the lowest Unigram score.
(
    df_aimed_unique
    .sort_values(by=["Unigram"], ascending=True)
    .head(n=1)["Unigram_detail"]
    .iloc[0]
)

(' PROTEIN1 controls CDK activity, thereby affecting cell-cycle control, whereas PROTEIN2 functions in both DNA replication and repair.',
 'Cotransfection experiments indicate that PROTEIN1 and PROTEIN2 function in a mutually antagonistic manner to control cell cycle progression.')

## 4. SST2 Dataset

In [19]:
# SST2 input files, all read from the "tmp" directory.
(
    sst2_sentences_file,
    sst2_sentiment_labels_file,
    sst2_dictionary_file,
    sst2_datatset_split_file,
) = (
    os.path.join("tmp", file_name)
    for file_name in (
        "datasetSentences.txt",
        "sentiment_labels.txt",
        "dictionary.txt",
        "datasetSplit.txt",
    )
)



In [20]:
from sst2_dataset import SST2Dataset

# Positional order expected by the constructor call below:
# sentences, sentiment labels, dataset split, dictionary.
result_score, result_detail = SST2Dataset(
    sst2_sentences_file,
    sst2_sentiment_labels_file,
    sst2_datatset_split_file,
    sst2_dictionary_file,
).run_similarity_comparer()
df_sst2 = scores_to_df(
    result_score,
    result_detail,
    "SST2",
    "",
    "CLS",
)

Text not found in dictionary: But in Imax 3-D , the clichÃ©s disappear into the vertiginous perspectives opened up by the photography .
Text not found in dictionary: -LRB- But it 's -RRB- worth recommending because of two marvelous performances by Michael Caine and Brendan Fraser .
Text not found in dictionary: JirÃ­ Hubac 's script is a gem .
Text not found in dictionary: You would n't call The Good Girl a date movie -LRB- an anti-date movie is more like it -RRB- , but when it 's good , it 's good and horrid .
Text not found in dictionary: An incendiary , deeply thought-provoking look at one of the most peculiar -LRB- and peculiarly venomous -RRB- bigotries in our increasingly frightening theocracy
Text not found in dictionary: MÃ¼nch 's genuine insight makes the film 's occasional overindulgence forgivable .
Text not found in dictionary: I enjoyed the ride -LRB- bumps and all -RRB- , creamy depth , and ultimate theme .
Text not found in dictionary: As a randy film about sexy people i

Text not found in dictionary: Unfortunately , it appears that -LRB- Jackie -RRB- Chan 's US influence is starting to show in his Hong Kong films .
Text not found in dictionary: An effectively creepy , fear-inducing -LRB- not fear-reducing -RRB- film from Japanese director Hideo Nakata , who takes the superstitious curse on chain letters and actually applies it .
Text not found in dictionary: A subtle and well-crafted -LRB- for the most part -RRB- chiller .
Text not found in dictionary: -LRB- Chaiken 's -RRB- talent lies in an evocative , accurate observation of a distinctive milieu and in the lively , convincing dialogue she creates for her characters .
Text not found in dictionary: By candidly detailing the politics involved in the creation of an extraordinary piece of music , -LRB- Jones -RRB- calls our attention to the inherent conflict between commerce and creativity .
Text not found in dictionary: Audrey Tatou has a knack for picking roles that magnify her outrageous charm , and i

Text not found in dictionary: All right , so it 's not a brilliant piece of filmmaking , but it is a funny -LRB- sometimes hilarious -RRB- comedy with a deft sense of humor about itself , a playful spirit and a game cast .
Text not found in dictionary: -LRB- Danny Huston gives -RRB- an astounding performance that deftly , gradually reveals a real human soul buried beneath a spellbinding serpent 's smirk .
Text not found in dictionary: Thanks to The ChÃ¢teau 's balance of whimsicality , narrative discipline and serious improvisation , almost every relationship and personality in the film yields surprises .
Text not found in dictionary: -LRB- A -RRB- real pleasure in its laid-back way .
Text not found in dictionary: It 's soulful and unslick , and that 's apparently just what -LRB- Aniston -RRB- has always needed to grow into a movie career .
Text not found in dictionary: A jaw-droppingly beautiful work that upends nearly every clichÃ© of Japanese animation while delivering a more than s

Text not found in dictionary: -LRB- Ramsay -RRB- visually transforms the dreary expanse of dead-end distaste the characters inhabit into a poem of art , music and metaphor .
Text not found in dictionary: A coming-of-age film that avoids the cartoonish clichÃ©s and sneering humor of the genre as it provides a fresh view of an old type -- the uncertain girl on the brink of womanhood .
Text not found in dictionary: Because Eight Legged Freaks is partly an homage to Them , Tarantula and other low - budget B-movie thrillers of the 1950s and '60s , the movie is a silly -LRB- but not sophomoric -RRB- romp through horror and hellish conditions .
Text not found in dictionary: Jolting into Charleston rhythms , the story has the sizzle of old news that has finally found the right vent -LRB- accurate ?
Text not found in dictionary: Who cares ? -RRB- .
Text not found in dictionary: Damon brings the proper conviction to his role as -LRB- Jason Bourne -RRB- .
Text not found in dictionary: -LRB- Barry

Text not found in dictionary: Bon appÃ©tit !
Text not found in dictionary: Like the best 60 Minutes exposÃ© , the film -LRB- at 80 minutes -RRB- is actually quite entertaining .
Text not found in dictionary: Nair does n't use -LRB- Monsoon Wedding -RRB- to lament the loss of culture .
Text not found in dictionary: Like Mike is a harmlessly naÃ¯ve slice of b-ball fantasy , fit for filling in during the real NBA 's off-season .
Text not found in dictionary: Massoud 's story is an epic , but also a tragedy , the record of a tenacious , humane fighter who was also the prisoner -LRB- and ultimately the victim -RRB- of history .
Text not found in dictionary: An intelligently made -LRB- and beautifully edited -RRB- picture that at the very least has a spark of life to it -- more than you can say for plenty of movies that flow through the Hollywood pipeline without a hitch .
Text not found in dictionary: -LRB- Fiji diver Rusi Vulakoro and the married couple Howard and Michelle Hall -RRB- show 

Text not found in dictionary: Tells -LRB- the story -RRB- with such atmospheric ballast that shrugging off the plot 's persnickety problems is simply a matter of -LRB- being -RRB- in a shrugging mood .
Text not found in dictionary: Other than the slightly flawed -LRB- and fairly unbelievable -RRB- finale , everything else is top shelf .
Text not found in dictionary: A violent initiation rite for the audience , as much as it is for Angelique , the -LRB- opening -RRB- dance guarantees Karmen 's enthronement among the cinema 's memorable women .
Text not found in dictionary: -LRB- Breheny 's -RRB- lensing of the New Zealand and Cook Island locations captures both the beauty of the land and the people .
Text not found in dictionary: One of -LRB- Jaglom 's -RRB- better efforts -- a wry and sometime bitter movie about love .
Text not found in dictionary: If you can read the subtitles -LRB- the opera is sung in Italian -RRB- and you like ` Masterpiece Theatre ' type costumes , you 'll enjoy t

Text not found in dictionary: The heedless impetuousness of youth is on full , irritating display in -LRB- this -RRB- meandering and pointless French coming-of-age import from writer-director Anne-Sophie Birot .
Text not found in dictionary: A bold -LRB- and lovely -RRB- experiment that will almost certainly bore most audiences into their own brightly colored dreams .
Text not found in dictionary: If you are curious to see the darker side of what 's going on with young TV actors -LRB- Dawson Leery did what ?!? -RRB- , or see some interesting storytelling devices , you might want to check it out , but there 's nothing very attractive about this movie .
Text not found in dictionary: -LRB- Davis -RRB- wants to cause his audience an epiphany , yet he refuses to give us real situations and characters .
Text not found in dictionary: ... unlike -LRB- Scorsese 's Mean Streets -RRB- , Ash Wednesday is essentially devoid of interesting characters or even a halfway intriguing plot .
Text not foun

Text not found in dictionary: It 's like every bad idea that 's ever gone into an after-school special compiled in one place , minus those daytime programs ' slickness and sophistication -LRB- and who knew they even had any ? -RRB- .
Text not found in dictionary: Once -LRB- Kim -RRB- begins to overplay the shock tactics and bait-and-tackle metaphors , you may decide it 's too high a price to pay for a shimmering picture postcard .
Text not found in dictionary: But the power of these -LRB- subjects -RRB- is obscured by the majority of the film that shows a stationary camera on a subject that could be mistaken for giving a public oration , rather than contributing to a film 's narrative .
Text not found in dictionary: Basically a static series of semi-improvised -LRB- and semi-coherent -RRB- raps between the stars .
Text not found in dictionary: The script is n't very good ; not even someone as gifted as Hoffman -LRB- the actor -RRB- can make it work .
Text not found in dictionary: Its w

Text not found in dictionary: Overly stylized with lots of flash black - & - white freeze frames reminiscent of a pseudo-hip luxury car commercial , -LRB- it 's -RRB- at its worst when it 's actually inside the ring .
Text not found in dictionary: The director 's twitchy sketchbook style and adroit perspective shifts grow wearisome amid leaden pacing and indifferent craftsmanship -LRB- most notably wretched sound design -RRB- .
Text not found in dictionary: The parts are better than the whole -LRB- bizarre , funny , tragic - like love in New York -RRB- .
Text not found in dictionary: If this holiday movie is supposed to be a gift , somebody unwrapped it early , took out all the good stuff , and left behind the crap -LRB- literally -RRB- .
Text not found in dictionary: Much of what is meant to be ` inspirational ' and ` uplifting ' is simply distasteful to audiences not already sharing -LRB- the movie 's -RRB- mindset .
Text not found in dictionary: -LRB- Nelson 's -RRB- movie about mor

Text not found in dictionary: A collage of clichÃ©s and a dim echo of allusions to other films .
Text not found in dictionary: ... the good and different idea -LRB- of middle-aged romance -RRB- is not handled well and , except for the fine star performances , there is little else to recommend `` Never Again . ''
Text not found in dictionary: The film is like sitting in a downtown cafÃ© , overhearing a bunch of typical late-twenty-somethings natter on about nothing , and desperately wishing you could change tables .
Text not found in dictionary: `` -LRB- Hopkins -RRB- does n't so much phone in his performance as fax it .
Text not found in dictionary: While the transgressive trappings -LRB- especially the frank sex scenes -RRB- ensure that the film is never dull , Rodrigues 's beast-within metaphor is ultimately rather silly and overwrought , making the ambiguous ending seem goofy rather than provocative .
Text not found in dictionary: It 's mired in a shabby script that piles layer upon

Text not found in dictionary: The plan to make Enough into ` an inspiring tale of survival wrapped in the heart-pounding suspense of a stylish psychological thriller ' has flopped as surely as a soufflÃ© gone wrong .
Text not found in dictionary: -LRB- Creates -RRB- the worst kind of mythologizing , the kind that sacrifices real heroism and abject suffering for melodrama .
Text not found in dictionary: -LRB- Allen 's -RRB- been making piffle for a long while , and Hollywood Ending may be his way of saying that piffle is all that the airhead movie business deserves from him right now .
Text not found in dictionary: After the first 10 minutes , which is worth seeing , the movie sinks into an abyss of clichÃ©s , depression and bad alternative music .
Text not found in dictionary: The film was produced by Jerry Bruckheimer and directed by Joel Schumacher , and reflects the worst of their shallow styles : wildly overproduced , inadequately motivated every step of the way and demographically

Text not found in dictionary: The threat implied in the title PokÃ©mon 4ever is terrifying -- like locusts in a horde these things will keep coming .
Text not found in dictionary: Mushes the college-friends genre -LRB- The Big Chill -RRB- together with the contrivances and overwrought emotion of soap operas .
Text not found in dictionary: A rip-off twice removed , modeled after -LRB- Seagal 's -RRB- earlier copycat Under Siege , sometimes referred to as Die Hard on a boat .
Text not found in dictionary: Does n't deserve a passing grade -LRB- even on a curve -RRB- .
Text not found in dictionary: -LRB- Lee -RRB- treats his audience the same way that Jim Brown treats his women -- as dumb , credulous , unassuming , subordinate subjects .
Text not found in dictionary: Ultimately this is a frustrating patchwork : an uneasy marriage of Louis Begley 's source novel -LRB- About Schmidt -RRB- and an old Payne screenplay .
Text not found in dictionary: Marinated in clichÃ©s and mawkish dialogue .

In [21]:
# Summary statistics for the SST2 results, one row per statistic.
(
    df_sst2
    .groupby(["data_set"])
    .describe()
    .stack()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SST2,count,2125.0,2125.0,2125.0
SST2,mean,45.109967,18.970381,8.229678
SST2,std,10.061671,9.19579,8.270423
SST2,min,0.0,0.0,0.0
SST2,25%,38.332594,13.497638,0.0
SST2,50%,44.54354,17.342199,7.352146
SST2,75%,51.052747,22.645541,11.952286
SST2,max,100.0,100.0,71.443451


In [22]:
# Print the detail entry of the 6th-highest Unigram row (position 5 of the top 10),
# one element per paragraph.
print(
    *(
        df_sst2
        .sort_values(by=["Unigram"], ascending=False)
        .head(n=10)["Unigram_detail"]
        .iloc[5]
    ),
    sep='\n\n',
)

Too simple for its own good .

... too sappy for its own good .


In [23]:
# Print the detail entry of the 6th-lowest Unigram row (position 5 of the bottom 10),
# one element per paragraph.
print(
    *(
        df_sst2
        .sort_values(by=["Unigram"], ascending=True)
        .head(n=10)["Unigram_detail"]
        .iloc[5]
    ),
    sep='\n\n',
)

Woo 's fights have a distinct flair .

When it comes to the battle of Hollywood vs. Woo , it looks like Woo 's a P.O.W.


## 5. BC3 Article classification

In [24]:
# BC3 article-classification input files (train/test pair), read from "tmp".
bc3_act_train_file, bc3_act_test_file = (
    os.path.join("tmp", file_name)
    for file_name in ("bc3_act_all_records.tsv", "bc3_act_all_records_test.tsv")
)

In [25]:
from bc3_article_classification import BC3ArticleClassification

# Run the comparer with the train file first, then the test file; rows are labelled "BC3ACT" / "CLS".
result_score, result_detail = BC3ArticleClassification().run_similarity_comparer(
    bc3_act_train_file,
    bc3_act_test_file,
)
df_bc3_act = scores_to_df(
    result_score,
    result_detail,
    "BC3ACT",
    "",
    "CLS",
)

In [26]:
# Summary statistics of the BC3ACT results (no grouping: this frame holds a single data set).
df_bc3_act.describe()

Unnamed: 0,Unigram,Bigram,Trigram
count,6000.0,6000.0,6000.0
mean,62.347454,12.578808,3.105552
std,8.387323,5.646237,1.671733
min,15.024965,1.976839,0.0
25%,57.267651,8.413155,2.060621
50%,62.79838,11.451434,2.716863
75%,68.174696,15.685311,3.617432
max,85.939584,42.356289,17.414127


In [27]:
# Print the detail entry of the BC3ACT row with the highest Trigram score,
# one element per paragraph.
print(
    *(
        df_bc3_act
        .sort_values(by=["Trigram"], ascending=False)
        .head(n=1)["Trigram_detail"]
        .iloc[0]
    ),
    sep='\n\n',
)

Meiotic recombination is initiated by the formation of numerous DNA double-strand breaks (DSBs) catalysed by the widely conserved Spo11 protein. In Saccharomyces cerevisiae, Spo11 requires nine other proteins for meiotic DSB formation; however, unlike Spo11, few of these are conserved across kingdoms. In order to investigate this recombination step in higher eukaryotes, we took advantage of a high-throughput meiotic mutant screen carried out in the model plant Arabidopsis thaliana. A collection of 55,000 mutant lines was screened, and spo11-like mutations, characterised by a drastic decrease in chiasma formation at metaphase I associated with an absence of synapsis at prophase, were selected. This screen led to the identification of two populations of mutants classified according to their recombination defects: mutants that repair meiotic DSBs using the sister chromatid such as Atdmc1 or mutants that are unable to make DSBs like Atspo11-1. We found that in Arabidopsis thaliana at least

In [28]:
# Print the detail entry of the 6th-lowest Unigram row (position 5 of the bottom 10),
# one element per paragraph.
print(
    *(
        df_bc3_act
        .sort_values(by=["Unigram"], ascending=True)
        .head(n=10)["Unigram_detail"]
        .iloc[5]
    ),
    sep='\n\n',
)

The sour global economy has left many small public firms gasping for air.

The vaccinia virus mRNA capping enzyme is a multifunctional heterodimeric protein associated with the viral polymerase that both catalyses the three steps of mRNA capping and regulates gene transcription. The structure of a subcomplex comprising the C-terminal N7-methyl-transferase (MT) domain of the large D1 subunit, the stimulatory D12 subunit and bound S-adenosyl-homocysteine (AdoHcy) has been determined at 2.7 A resolution and reveals several novel features of the poxvirus capping enzyme. The structure shows for the first time the critical role played by the proteolytically sensitive N-terminus of the MT domain in binding the methyl donor and in catalysis. In addition, the poxvirus enzyme has a completely unique mode of binding of the adenosine moiety of AdoHcy, a feature that could be exploited for design of specific anti-poxviral compounds. The structure of the poxvirus-specific D12 subunit suggests that i

## Summary

In [29]:
# Combine the per-data-set result frames into one frame for the summary tables.
df_summary = pd.concat(
    [
        df_sst2,
        df_aimed_unique,
        df_aimed_random,
        df_bc2_gm,
        df_bc3_act,
    ]
)

In [30]:
# Summary statistics per data_set label across all results, one row per statistic.
(
    df_summary
    .groupby(["data_set"])
    .describe()
    .stack()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
data_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AIMED (R),count,584.0,584.0,584.0
AIMED (R),mean,97.197474,86.321462,80.099087
AIMED (R),std,10.949881,17.633559,19.662401
AIMED (R),min,38.892223,12.171612,3.872015
AIMED (R),25%,100.0,85.714286,77.777778
AIMED (R),50%,100.0,90.909091,85.54007
AIMED (R),75%,100.0,95.581428,90.625
AIMED (R),max,100.0,100.0,100.0
AIMED (U),count,498.0,498.0,498.0
AIMED (U),mean,67.759493,31.843862,17.321522


In [31]:
# Mean score per (task_type, data_set).
# numeric_only=True limits the mean to the numeric score columns: the *_detail
# columns hold non-numeric objects, and pandas >= 2.0 raises a TypeError for
# them instead of silently dropping them as older versions did.
df_summary.groupby(["task_type", "data_set"]).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unigram,Bigram,Trigram
task_type,data_set,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CLS,BC3ACT,62.347454,12.578808,3.105552
CLS,SST2,45.109967,18.970381,8.229678
NER,BC2GM anno,70.558642,19.611408,5.467155
NER,BC2GM text,47.618043,18.005105,9.418352
REL,AIMED (R),97.197474,86.321462,80.099087
REL,AIMED (U),67.759493,31.843862,17.321522


In [32]:
# Emit the mean scores per (data_set, task_type) as a LaTeX table, two decimals per value.
# numeric_only=True is required because the *_detail columns hold non-numeric
# objects; pandas >= 2.0 raises a TypeError for them rather than dropping them.
print(
    df_summary
    .groupby(["data_set", "task_type"])
    .mean(numeric_only=True)
    .to_latex(float_format="{:.2f}".format)
)

\begin{tabular}{llrrr}
\toprule
      &     &  Unigram &  Bigram &  Trigram \\
data\_set & task\_type &          &         &          \\
\midrule
AIMED (R)  & REL &    97.20 &   86.32 &    80.10 \\
AIMED (U)  & REL &    67.76 &   31.84 &    17.32 \\
BC2GM anno & NER &    70.56 &   19.61 &     5.47 \\
BC2GM text & NER &    47.62 &   18.01 &     9.42 \\
BC3ACT  & CLS &    62.35 &   12.58 &     3.11 \\
SST2  & CLS &    45.11 &   18.97 &     8.23 \\
\bottomrule
\end{tabular}

