# Similarity Splitter 

In [1]:
import os,sys, tempfile, pandas as pd, shutil, logging, glob
sys.path.append("src")

In [2]:
# Directory that holds the dataset files and model prediction files read by the cells below.
base_data_dir = "tmp"

In [3]:
# Start every run from an empty scratch directory: drop any leftover from a
# previous run, then create it again.
working_dir = "tempspliiterhajkA"
if os.path.exists(working_dir):
    shutil.rmtree(working_dir)
os.makedirs(working_dir, exist_ok=True)

In [4]:
def scores_to_df(scores, details, data_set, data_type, task_type):
    """Collect score and detail columns into one DataFrame tagged with its origin.

    Args:
        scores: mapping of column name -> column values; each entry becomes a
            column under its own name.
        details: mapping of column name -> column values; each entry becomes a
            column named ``<name>_detail``.
        data_set: first part of the value written to the ``data_set`` column.
        data_type: second part of that value, appended after a single space.
        task_type: value written to the ``task_type`` column.

    Returns:
        A DataFrame holding the score columns, then the detail columns, then
        the ``data_set`` and ``task_type`` columns.
    """
    frame = pd.DataFrame()

    # Score columns keep their names; detail columns get the "_detail" suffix.
    named_columns = list(scores.items())
    named_columns += [(name + "_detail", values) for name, values in details.items()]
    for column_name, column_values in named_columns:
        frame[column_name] = column_values

    origin_label = data_set + " " + data_type
    frame["data_set"] = origin_label
    frame["task_type"] = task_type

    return frame

In [5]:
# Route log records of level WARN and above to stdout, with timestamp, logger name and level.
logging.basicConfig(level="WARN", handlers=[logging.StreamHandler(sys.stdout)],
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [6]:
# Similarity cut-offs passed to the threshold splitters; the result tables report one
# row per consecutive pair (0-25, 25-50, 50-75, 75-...).
thresholds = [0,25,50,75,100]
# n-gram sizes passed to the threshold splitters (unigrams only).
ngrams = [1]
# Number of parts for the sorted split; 4 gives the quartiles shown in the summary.
num_parts = 4

## 1. BC2GM

Train/test overlap for the [BioCreative II gene mention](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/) task. Please download the test and train files for this task from the BioCreative website.


In [7]:
# Train/test input files handed to the splitter in "text" mode.
bc2gm_train_file = os.path.join(base_data_dir, "train.in")
bc2gm_test_file = os.path.join(base_data_dir, "test.in")

# Annotation (.eval) files; the train/test pair is what the splitter compares in "eval" mode.
bc2gm_train_eval_file = os.path.join(base_data_dir, "trainGENE.eval")
bc2gm_test_eval_file = os.path.join(base_data_dir, "testGENE.eval")

# Alternative annotation file for the test set, passed to the splitter together with the test .eval file.
bc2gm_test_alteval_file = os.path.join(base_data_dir, "testALTGENE.eval")

# Model predictions for the test set.
bc2gm_predictions_file = os.path.join(base_data_dir, "bc2gm_result_test_pred.txt")

In [8]:
# Project-local module; presumably resolved through the sys.path.append("src") in the first cell.
from bc2_gene_mention import BC2GeneMentionText 


# Runner whose splitter methods are called by the BC2GM cells below.
bc2gmrun = BC2GeneMentionText()



### BC2GM: Threshold-based similarity

In [9]:

bc2gm_out_dir_text = os.path.join(working_dir, "bc2gm_text")
os.makedirs(bc2gm_out_dir_text, exist_ok=True)

# Threshold split in "text" mode: train file vs. test file, scored against the
# test annotations with the given predictions.
result_detail = bc2gmrun.run_similarity_threshold_splitter(
    "text",
    bc2gm_train_file,
    bc2gm_test_file,
    bc2gm_out_dir_text,
    bc2gm_test_eval_file,
    bc2gm_test_alteval_file,
    bc2gm_predictions_file,
    thresholds,
    ngrams,
)
df_bc2_gm_text = pd.DataFrame(result_detail).assign(dataset="BC2GM (text)")

In [10]:
# Preview the per-bucket scores of the text-based threshold split.
df_bc2_gm_text.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset
0,1,0,25.0,990,19.8,0.744872,0.688389,0.811453,BC2GM (text)
1,1,25,50.0,3706,74.12,0.823754,0.782895,0.869114,BC2GM (text)
2,1,50,75.0,257,5.14,0.870801,0.838308,0.905914,BC2GM (text)
3,1,75,,47,0.94,0.78481,0.794872,0.775,BC2GM (text)
4,-1,0,100.0,5000,100.0,0.817055,0.774819,0.86416,BC2GM (text)


In [11]:
# The annotation-based run gets its own output directory, separate from the
# "bc2gm_text" directory used by the text-based run.
bc2gm_out_dir_anno = os.path.join(working_dir, "bc2gm_anno")
os.makedirs(bc2gm_out_dir_anno, exist_ok=True)

# Threshold split in "eval" mode: train annotations vs. test annotations,
# scored with the same test annotations and predictions as the text-based run.
result_detail = bc2gmrun.run_similarity_threshold_splitter(
    "eval",
    bc2gm_train_eval_file,
    bc2gm_test_eval_file,
    bc2gm_out_dir_anno,
    bc2gm_test_eval_file,
    bc2gm_test_alteval_file,
    bc2gm_predictions_file,
    thresholds,
    ngrams,
)
df_bc2_gm_anno = pd.DataFrame(result_detail)
df_bc2_gm_anno["dataset"] = "BC2GM (anno)"

In [12]:
# Preview the per-bucket scores of the annotation-based threshold split.
df_bc2_gm_anno.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset
0,1,0,25.0,1059,16.727215,0.844237,0.843861,0.844613,BC2GM (anno)
1,1,25,50.0,352,5.559943,0.808743,0.8,0.81768,BC2GM (anno)
2,1,50,75.0,1565,24.719634,0.834215,0.813616,0.855885,BC2GM (anno)
3,1,75,,3355,52.993208,0.854116,0.831631,0.877852,BC2GM (anno)
4,-1,0,100.0,6331,100.0,0.817055,0.774819,0.86416,BC2GM (anno)


### BC2GM: Sorted-based similarity

In [13]:

bc2gm_out_dir_text = os.path.join(working_dir, "bc2gm_text")
os.makedirs(bc2gm_out_dir_text, exist_ok=True)

# Sorted split in "text" mode into `num_parts` parts; only the score table is kept.
result_score, result_detail = bc2gmrun.run_similarity_parts_splitter(
    "text",
    bc2gm_train_file,
    bc2gm_test_file,
    bc2gm_out_dir_text,
    bc2gm_test_eval_file,
    bc2gm_test_alteval_file,
    bc2gm_predictions_file,
    num_parts,
)
df_bc2_gm_text_parts_score = pd.DataFrame(result_score).assign(dataset="BC2GM (text)")

Exact matches Unigram, 39 / 5000
Exact matches Bigram, 26 / 5000
Exact matches Trigram, 26 / 5000


In [14]:
# Preview the per-part scores of the sorted split.
df_bc2_gm_text_parts_score.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset
0,1,0.0,26.311741,1250,25.0,0.754003,0.698113,0.81962,BC2GM (text)
1,1,26.311741,31.622777,1250,25.0,0.798122,0.745178,0.859164,BC2GM (text)
2,1,31.622777,38.332594,1250,25.0,0.821266,0.782819,0.863685,BC2GM (text)
3,1,38.348249,100.0,1250,25.0,0.858576,0.830023,0.889164,BC2GM (text)
4,-1,0.0,100.0,5000,100.0,0.817055,0.774819,0.86416,BC2GM (text)


## 2. SST2 Dataset

In [15]:
# SST2 dataset files, all located under the shared data directory
# (base_data_dir) like the prediction file below.
sst2_sentences_file = os.path.join(base_data_dir, "datasetSentences.txt")
sst2_sentiment_labels_file = os.path.join(base_data_dir, "sentiment_labels.txt")
sst2_dictionary_file = os.path.join(base_data_dir, "dictionary.txt")
sst2_datatset_split_file = os.path.join(base_data_dir, "datasetSplit.txt")


# Model predictions for SST2.
sst2_predictions_file = os.path.join(base_data_dir, "sst2-output.csv")

In [16]:
# Project-local module; presumably resolved through the sys.path.append("src") in the first cell.
from sst2_dataset import SST2Dataset
# This runner receives its dataset files at construction time, so the splitter calls below only pass the predictions.
sst2_run=SST2Dataset(sst2_sentences_file, sst2_sentiment_labels_file, sst2_datatset_split_file, sst2_dictionary_file)


































### SST2: Threshold-based similarity

In [17]:

# Create the SST2 output directory itself (not the BC2GM one).
# NOTE(review): the directory is not passed to the splitter call below — confirm it is still needed.
sst2_out_dir_text = os.path.join(working_dir, "sst2_text")
os.makedirs(sst2_out_dir_text, exist_ok=True)

# Threshold split of the SST2 predictions; the dataset files were given to the runner's constructor.
result_detail = sst2_run.run_similarity_threshold_splitter(sst2_predictions_file, thresholds, ngrams)
df_sst2_text = pd.DataFrame(result_detail)
df_sst2_text["dataset"] = "SST2 (text)"

In [18]:
# Preview the per-bucket scores of the SST2 threshold split.
df_sst2_text.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,accuracy,dataset
0,1,0,25.0,269,12.658824,0.857143,0.902439,0.816176,0.862454,SST2 (text)
1,1,25,50.0,1565,73.647059,0.839974,0.871274,0.810845,0.84345,SST2 (text)
2,1,50,75.0,277,13.035294,0.851711,0.910569,0.8,0.859206,SST2 (text)
3,1,75,,14,0.658824,0.857143,0.857143,0.857143,0.857143,SST2 (text)
4,-1,0,100.0,2125,100.0,0.843735,0.879919,0.810409,0.848,SST2 (text)


### SST2: Sorted-based similarity

In [19]:
# Re-assigns the value already set in the configuration cell (4 parts, i.e. quartiles).
num_parts = 4

In [20]:

# Use the SST2 output directory in the SST2 section instead of the BC2GM one.
# NOTE(review): the directory is not passed to the splitter call below — confirm it is still needed.
sst2_out_dir_text = os.path.join(working_dir, "sst2_text")
os.makedirs(sst2_out_dir_text, exist_ok=True)

# Sorted split of the SST2 predictions into `num_parts` parts; only the score table is kept.
result_score, result_detail = sst2_run.run_similarity_parts_splitter(sst2_predictions_file, num_parts)
df_sst2_text_parts_score = pd.DataFrame(result_score)
df_sst2_text_parts_score["dataset"] = "SST2 (text)"

In [21]:
# Preview the per-part scores of the SST2 sorted split.
df_sst2_text_parts_score.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,accuracy,dataset
0,1,0.0,28.347335,532,25.035294,0.834615,0.871486,0.800738,0.838346,SST2 (text)
1,1,28.347335,33.333333,532,25.035294,0.838346,0.892,0.79078,0.838346,SST2 (text)
2,1,33.333333,42.163702,532,25.035294,0.841509,0.864341,0.819853,0.842105,SST2 (text)
3,1,42.163702,100.0,529,24.894118,0.861856,0.893162,0.832669,0.873346,SST2 (text)
4,-1,0.0,100.0,2125,100.0,0.843735,0.879919,0.810409,0.848,SST2 (text)


## 3. BC3 Article classification

In [22]:
# BC3 article-classification record files, located under the shared data
# directory (base_data_dir) like the other BC3 files below.
bc3_act_train_file = os.path.join(base_data_dir, "bc3_act_all_records.tsv")
bc3_act_test_file = os.path.join(base_data_dir, "bc3_act_all_records_test.tsv")


# Gold-standard files for the train and test records.
bc3_act_train_eval_file = os.path.join(base_data_dir, "bc3_act_gold_standard.tsv")
bc3_act_test_eval_file = os.path.join(base_data_dir, "bc3_act_gold_standard_test.tsv")


# Model predictions for the BC3 test records.
bc3_act_predictions_file = os.path.join(base_data_dir, "bc3act-output.csv")

In [23]:
# Project-local module; presumably resolved through the sys.path.append("src") in the first cell.
from bc3_article_classification import BC3ArticleClassification

# Runner whose splitter methods are called by the BC3 cells below.
bc3actrun = BC3ArticleClassification()


### BC3 Article classification: Threshold-based similarity

In [24]:

# Create the BC3 output directory itself (not the BC2GM one).
# NOTE(review): the directory is not passed to the splitter call below — confirm it is still needed.
bc3act_out_dir_text = os.path.join(working_dir, "bc3act_text")
os.makedirs(bc3act_out_dir_text, exist_ok=True)

# Threshold split: train records vs. test records, scored against the test gold standard.
result_detail = bc3actrun.run_similarity_threshold_splitter(
    bc3_act_train_file,
    bc3_act_test_file,
    bc3_act_test_eval_file,
    bc3_act_predictions_file,
    thresholds,
    ngrams,
)
df_bc3_act_text = pd.DataFrame(result_detail)
df_bc3_act_text["dataset"] = "BC3ACT (text)"

  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Preview the per-bucket scores of the BC3 threshold split.
df_bc3_act_text.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,accuracy,dataset
0,1,0,25.0,2817,46.95,0.56427,0.430233,0.81962,0.858005,BC3ACT (text)
1,1,25,50.0,3062,51.033333,0.598513,0.46,0.856383,0.788374,BC3ACT (text)
2,1,50,75.0,120,2.0,0.630137,0.534884,0.766667,0.775,BC3ACT (text)
3,1,75,,1,0.016667,0.0,0.0,0.0,0.0,BC3ACT (text)
4,-1,0,100.0,6000,100.0,0.587107,0.451061,0.840659,0.820667,BC3ACT (text)


### BC3 Article classification: Sorted-based similarity

In [26]:
# Re-assigns the value already set in the configuration cell (4 parts, i.e. quartiles).
num_parts = 4

In [27]:

# Use the BC3 output directory in the BC3 section instead of the BC2GM one.
# NOTE(review): the directory is not passed to the splitter call below — confirm it is still needed.
bc3act_out_dir_text = os.path.join(working_dir, "bc3act_text")
os.makedirs(bc3act_out_dir_text, exist_ok=True)

# Sorted split into `num_parts` parts; only the score table is kept.
result_score, result_detail = bc3actrun.run_similarity_parts_splitter(
    bc3_act_train_file,
    bc3_act_test_file,
    bc3_act_test_eval_file,
    bc3_act_predictions_file,
    num_parts,
)
df_bc3_act_text_parts_score = pd.DataFrame(result_score)
df_bc3_act_text_parts_score["dataset"] = "BC3ACT (text)"

In [28]:
# Preview the per-part scores of the BC3 sorted split.
df_bc3_act_text_parts_score.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,accuracy,dataset
0,1,6.27761,20.090702,1500,25.0,0.575758,0.445312,0.814286,0.888,BC3ACT (text)
1,1,20.095193,25.676454,1500,25.0,0.559322,0.423077,0.825,0.826667,BC3ACT (text)
2,1,25.677981,31.913278,1500,25.0,0.602067,0.46507,0.85348,0.794667,BC3ACT (text)
3,1,31.917,75.011317,1500,25.0,0.598109,0.460838,0.851852,0.773333,BC3ACT (text)
4,-1,0.0,100.0,6000,100.0,0.587107,0.451061,0.840659,0.820667,BC3ACT (text)


## Summary

In [29]:
def _as_percentage_points(x):
    """Render a fraction as percentage points with two decimals, dropping the '%' sign."""
    return "{:.2%}".format(x).replace("%","")


def _one_decimal(x):
    """Render a number with exactly one decimal place."""
    return "{:.1f}".format(x)


# Per-column formatters for the LaTeX tables. Columns without an entry here
# (e.g. "accuracy") are rendered by `float_formatter` instead.
formatters = dict.fromkeys(("f-score", "precision", "recall"), _as_percentage_points)
formatters.update({"min": _one_decimal, "max": _one_decimal, "percent": _one_decimal})


def float_formatter(x):
    """Default float rendering: two decimal places."""
    return "{:.2f}".format(x)

### Summary Threshold

In [30]:
# Stack the threshold results of all datasets. BC2GM has no "accuracy" column,
# so its rows carry NaN there after the concat.
df_threshold_summary = pd.concat(
    [
        df_bc2_gm_text,
        df_bc3_act_text,
        df_sst2_text,
    ]
)

In [31]:
# Show the combined threshold results per dataset; the ngram == -1 row covers the full 0-100 range.
df_threshold_summary.sort_values(by=["dataset", "ngram", "min", "max"])

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset,accuracy
4,-1,0,100.0,5000,100.0,0.817055,0.774819,0.86416,BC2GM (text),
0,1,0,25.0,990,19.8,0.744872,0.688389,0.811453,BC2GM (text),
1,1,25,50.0,3706,74.12,0.823754,0.782895,0.869114,BC2GM (text),
2,1,50,75.0,257,5.14,0.870801,0.838308,0.905914,BC2GM (text),
3,1,75,,47,0.94,0.78481,0.794872,0.775,BC2GM (text),
4,-1,0,100.0,6000,100.0,0.587107,0.451061,0.840659,BC3ACT (text),0.820667
0,1,0,25.0,2817,46.95,0.56427,0.430233,0.81962,BC3ACT (text),0.858005
1,1,25,50.0,3062,51.033333,0.598513,0.46,0.856383,BC3ACT (text),0.788374
2,1,50,75.0,120,2.0,0.630137,0.534884,0.766667,BC3ACT (text),0.775
3,1,75,,1,0.016667,0.0,0.0,0.0,BC3ACT (text),0.0


In [32]:
# Export the threshold summary as a LaTeX table. Sorting and column selection
# are independent here because every sort key is among the selected columns.
print(
    df_threshold_summary
    .sort_values(by=["dataset", "ngram", "min", "max"])
    .loc[:, ["dataset", "ngram", "min", "max", "num", "f-score", "precision", "recall", "accuracy"]]
    .to_latex(float_format=float_formatter, formatters=formatters, index=False)
)

\begin{tabular}{lrrrrrrrr}
\toprule
       dataset &  ngram &  min &   max &   num & f-score & precision & recall &  accuracy \\
\midrule
  BC2GM (text) &     -1 &  0.0 & 100.0 &  5000 &   81.71 &     77.48 &  86.42 &       nan \\
  BC2GM (text) &      1 &  0.0 &  25.0 &   990 &   74.49 &     68.84 &  81.15 &       nan \\
  BC2GM (text) &      1 & 25.0 &  50.0 &  3706 &   82.38 &     78.29 &  86.91 &       nan \\
  BC2GM (text) &      1 & 50.0 &  75.0 &   257 &   87.08 &     83.83 &  90.59 &       nan \\
  BC2GM (text) &      1 & 75.0 &   nan &    47 &   78.48 &     79.49 &  77.50 &       nan \\
 BC3ACT (text) &     -1 &  0.0 & 100.0 &  6000 &   58.71 &     45.11 &  84.07 &      0.82 \\
 BC3ACT (text) &      1 &  0.0 &  25.0 &  2817 &   56.43 &     43.02 &  81.96 &      0.86 \\
 BC3ACT (text) &      1 & 25.0 &  50.0 &  3062 &   59.85 &     46.00 &  85.64 &      0.79 \\
 BC3ACT (text) &      1 & 50.0 &  75.0 &   120 &   63.01 &     53.49 &  76.67 &      0.78 \\
 BC3ACT (text) &      1 &

### Summary Split by quartile

In [33]:
# Stack the sorted-split results of all datasets. BC2GM has no "accuracy"
# column, so its rows carry NaN there after the concat.
df_quartile_summary = pd.concat(
    [
        df_bc2_gm_text_parts_score,
        df_bc3_act_text_parts_score,
        df_sst2_text_parts_score,
    ]
)

In [34]:
# Show the combined sorted-split results per dataset; the ngram == -1 row covers the full 0-100 range.
df_quartile_summary .sort_values(by=["dataset", "ngram","min","max" ])

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset,accuracy
4,-1,0.0,100.0,5000,100.0,0.817055,0.774819,0.86416,BC2GM (text),
0,1,0.0,26.311741,1250,25.0,0.754003,0.698113,0.81962,BC2GM (text),
1,1,26.311741,31.622777,1250,25.0,0.798122,0.745178,0.859164,BC2GM (text),
2,1,31.622777,38.332594,1250,25.0,0.821266,0.782819,0.863685,BC2GM (text),
3,1,38.348249,100.0,1250,25.0,0.858576,0.830023,0.889164,BC2GM (text),
4,-1,0.0,100.0,6000,100.0,0.587107,0.451061,0.840659,BC3ACT (text),0.820667
0,1,6.27761,20.090702,1500,25.0,0.575758,0.445312,0.814286,BC3ACT (text),0.888
1,1,20.095193,25.676454,1500,25.0,0.559322,0.423077,0.825,BC3ACT (text),0.826667
2,1,25.677981,31.913278,1500,25.0,0.602067,0.46507,0.85348,BC3ACT (text),0.794667
3,1,31.917,75.011317,1500,25.0,0.598109,0.460838,0.851852,BC3ACT (text),0.773333


In [35]:
# Export the quartile summary as a LaTeX table. The sort must come before the
# column selection because "ngram" is a sort key but is not exported.
# Missing accuracy values are shown as "-".
print(
    df_quartile_summary
    .query("ngram in (1,-1)")
    .sort_values(by=["dataset", "ngram", "min", "max"])
    .loc[:, ["dataset", "min", "max", "f-score", "precision", "recall", "accuracy"]]
    .fillna("-")
    .to_latex(float_format=float_formatter, formatters=formatters, index=False)
)

\begin{tabular}{lrrrrrl}
\toprule
       dataset &  min &   max & f-score & precision & recall & accuracy \\
\midrule
  BC2GM (text) &  0.0 & 100.0 &   81.71 &     77.48 &  86.42 &        - \\
  BC2GM (text) &  0.0 &  26.3 &   75.40 &     69.81 &  81.96 &        - \\
  BC2GM (text) & 26.3 &  31.6 &   79.81 &     74.52 &  85.92 &        - \\
  BC2GM (text) & 31.6 &  38.3 &   82.13 &     78.28 &  86.37 &        - \\
  BC2GM (text) & 38.3 & 100.0 &   85.86 &     83.00 &  88.92 &        - \\
 BC3ACT (text) &  0.0 & 100.0 &   58.71 &     45.11 &  84.07 &     0.82 \\
 BC3ACT (text) &  6.3 &  20.1 &   57.58 &     44.53 &  81.43 &     0.89 \\
 BC3ACT (text) & 20.1 &  25.7 &   55.93 &     42.31 &  82.50 &     0.83 \\
 BC3ACT (text) & 25.7 &  31.9 &   60.21 &     46.51 &  85.35 &     0.79 \\
 BC3ACT (text) & 31.9 &  75.0 &   59.81 &     46.08 &  85.19 &     0.77 \\
   SST2 (text) &  0.0 & 100.0 &   84.37 &     87.99 &  81.04 &     0.85 \\
   SST2 (text) &  0.0 &  28.3 &   83.46 &     87.15 &  80