# Similarity Splitter 

In [1]:
import os,sys, tempfile, pandas as pd, shutil, logging, glob
sys.path.append("src")

In [2]:
# Directory holding the downloaded dataset files and the model predictions read below.
base_data_dir = "tmp"

In [3]:
# Scratch directory for generated outputs; it is wiped and recreated on every run.
working_dir = "tempspliiterhajkA"
if os.path.exists(working_dir):
    shutil.rmtree(working_dir)
os.makedirs(working_dir, exist_ok=True)

In [4]:
def scores_to_df(scores, details, data_set, data_type, task_type):
    """Assemble score and detail columns into one labelled DataFrame.

    Args:
        scores: Mapping of column name to column values; each entry becomes
            a column under its own key.
        details: Mapping of column name to column values; each entry becomes
            a column named ``<key>_detail``.
        data_set: Joined with ``data_type`` (space-separated) to fill the
            ``data_set`` column.
        data_type: Second half of the ``data_set`` label.
        task_type: Value stored in the ``task_type`` column.

    Returns:
        A pandas DataFrame with the score columns first, then the detail
        columns, then ``data_set`` and ``task_type``.
    """
    def _named_columns():
        # Scores go in first, then the suffixed detail columns.
        yield from scores.items()
        for key, values in details.items():
            yield key + "_detail", values

    frame = pd.DataFrame()
    for column, values in _named_columns():
        frame[column] = values

    # Scalar labels are broadcast to every row already in the frame.
    frame["data_set"] = data_set + " " + data_type
    frame["task_type"] = task_type

    return frame

In [5]:
# Send WARN-and-above log records to stdout, prefixed with timestamp, logger name and level.
logging.basicConfig(
    level="WARN",
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
)

## 1. BC2GM

Train/test overlap analysis for the [BioCreative II gene mention](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-ii/) task. Please download the train and test files for this task from the BioCreative website.


In [6]:
# BC2GM sentence files for the train and test splits.
bc2gm_train_file, bc2gm_test_file = (
    os.path.join(base_data_dir, name) for name in ("train.in", "test.in")
)

# GENE.eval files for each split (presumably the gold annotations; confirm
# against the BioCreative distribution).
bc2gm_train_eval_file, bc2gm_test_eval_file = (
    os.path.join(base_data_dir, name) for name in ("trainGENE.eval", "testGENE.eval")
)

# ALTGENE.eval file for the test split.
bc2gm_test_alteval_file = os.path.join(base_data_dir, "testALTGENE.eval")

# Model predictions for the test split.
bc2gm_predictions_file = os.path.join(base_data_dir, "bc2gm_result_test_pred.txt")

In [7]:
from bc2_gene_mention import BC2GeneMentionText

# Runner object plus the settings shared by the threshold-based cells below.
bc2gmrun = BC2GeneMentionText()
thresholds = list(range(0, 101, 25))  # 0, 25, 50, 75, 100
ngrams = [1]


### BC2GM: Threshold-based similarity

In [8]:

# Run the threshold splitter in "text" mode on the BC2GM train/test sentence
# files and tabulate whatever it returns.
bc2gm_out_dir_text = os.path.join(working_dir, "bc2gm_text")
os.makedirs(bc2gm_out_dir_text, exist_ok=True)
result_detail = bc2gmrun.run_similarity_threshold_splitter(
    "text",
    bc2gm_train_file,
    bc2gm_test_file,
    bc2gm_out_dir_text,
    bc2gm_test_eval_file,
    bc2gm_test_alteval_file,
    bc2gm_predictions_file,
    thresholds,
    ngrams,
)
df_bc2_gm_text = pd.DataFrame(result_detail)
df_bc2_gm_text["dataset"] = "BC2GM (text)"

In [9]:
# Preview the per-threshold rows for the text-based split.
df_bc2_gm_text.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset
0,1,0,25,84,1.68,0.7,0.538462,1.0,BC2GM (text)
1,1,25,50,2910,58.2,0.816769,0.770923,0.868414,BC2GM (text)
2,1,50,75,1956,39.12,0.818354,0.780656,0.859877,BC2GM (text)
3,1,75,100,16,0.32,0.727273,0.8,0.666667,BC2GM (text)
4,-1,0,100,5000,100.0,0.817055,0.774819,0.86416,BC2GM (text)


In [10]:
# Run the threshold splitter in "eval" mode, comparing the train and test
# GENE.eval files instead of the sentence text.
# This run gets its own output directory so it cannot overwrite or pick up
# files written by the text-mode run in "bc2gm_text".
bc2gm_out_dir_anno = os.path.join(working_dir, "bc2gm_anno")
os.makedirs(bc2gm_out_dir_anno, exist_ok=True)
result_detail = bc2gmrun.run_similarity_threshold_splitter(
    "eval",
    bc2gm_train_eval_file,
    bc2gm_test_eval_file,
    bc2gm_out_dir_anno,
    bc2gm_test_eval_file,
    bc2gm_test_alteval_file,
    bc2gm_predictions_file,
    thresholds,
    ngrams,
)
df_bc2_gm_anno = pd.DataFrame(result_detail)
df_bc2_gm_anno["dataset"] = "BC2GM (anno)"

In [11]:
# Preview the per-threshold rows for the annotation-based split.
df_bc2_gm_anno.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset
0,1,0,25,1056,16.679829,0.844543,0.843792,0.845296,BC2GM (anno)
1,1,25,50,396,6.254936,0.806846,0.797101,0.816832,BC2GM (anno)
2,1,50,75,1547,24.435318,0.834171,0.813953,0.855418,BC2GM (anno)
3,1,75,100,978,15.447797,0.863286,0.840774,0.887036,BC2GM (anno)
4,-1,0,100,6331,100.0,0.817055,0.774819,0.86416,BC2GM (anno)


### BC2GM: Sort-based similarity

In [12]:
# Number of equal-sized parts for the sorted split below (4 gives quartiles).
num_parts = 4

In [13]:

# Run the parts splitter in "text" mode with `num_parts` parts. It returns a
# score table and a detail object; only the scores are tabulated in this cell.
bc2gm_out_dir_text = os.path.join(working_dir, "bc2gm_text")
os.makedirs(bc2gm_out_dir_text, exist_ok=True)
result_score, result_detail = bc2gmrun.run_similarity_parts_splitter(
    "text",
    bc2gm_train_file,
    bc2gm_test_file,
    bc2gm_out_dir_text,
    bc2gm_test_eval_file,
    bc2gm_test_alteval_file,
    bc2gm_predictions_file,
    num_parts,
)
df_bc2_gm_text_parts_score = pd.DataFrame(result_score)
df_bc2_gm_text_parts_score["dataset"] = "BC2GM (text)"

Exact matches Unigram, 39 / 5000
Exact matches Bigram, 26 / 5000
Exact matches Trigram, 26 / 5000


In [14]:
# Preview the per-part rows for the sorted split.
df_bc2_gm_text_parts_score.head()

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset
0,1,0.0,40.325254,1250,25.0,0.798133,0.741866,0.863636,BC2GM (text)
1,1,40.327957,47.140452,1250,25.0,0.820708,0.777716,0.868731,BC2GM (text)
2,1,47.140452,54.554473,1250,25.0,0.826751,0.790136,0.866924,BC2GM (text)
3,1,54.554473,100.0,1250,25.0,0.81687,0.780074,0.857309,BC2GM (text)
4,-1,0.0,100.0,5000,100.0,0.817055,0.774819,0.86416,BC2GM (text)


## 2. SST2 Dataset

In [15]:
# SST2 input files, resolved against the shared `base_data_dir` (rather than a
# hard-coded "tmp") so all datasets move together when the data directory changes.
sst2_sentences_file = os.path.join(base_data_dir, "datasetSentences.txt")
sst2_sentiment_labels_file = os.path.join(base_data_dir, "sentiment_labels.txt")
sst2_dictionary_file = os.path.join(base_data_dir, "dictionary.txt")
# The variable name keeps its existing spelling because a later cell reads it.
sst2_datatset_split_file = os.path.join(base_data_dir, "datasetSplit.txt")



In [16]:
from sst2_dataset import SST2Dataset

# Build the dataset from the four SST2 files; the cell displays the resulting object.
SST2Dataset(
    sst2_sentences_file,
    sst2_sentiment_labels_file,
    sst2_datatset_split_file,
    sst2_dictionary_file,
)


































<sst2_dataset.SST2Dataset at 0x122dbce90>

## 3. BC3 Article classification

In [17]:
# BC3 article-classification record files, resolved against the shared
# `base_data_dir` (rather than a hard-coded "tmp") so all datasets use one data directory.
bc3_act_train_file = os.path.join(base_data_dir, "bc3_act_all_records.tsv")
bc3_act_test_file = os.path.join(base_data_dir, "bc3_act_all_records_test.tsv")

In [18]:
from bc3_article_classification import BC3ArticleClassification

# NOTE(review): the instance is only displayed, not stored or run against
# bc3_act_train_file / bc3_act_test_file. This looks like a placeholder; confirm.
BC3ArticleClassification()


<bc3_article_classification.BC3ArticleClassification at 0x11f1a1d10>

## Summary

In [19]:
# Metric columns are rendered as percentages with two decimals and no "%" sign.
# The same formatter is registered under each metric column name.
formatters = dict.fromkeys(
    ("f-score", "precision", "recall"),
    lambda value: "{:.2%}".format(value).replace("%",""),
)
# Every other float column is rendered with two decimals.
float_formatter = "{:.2f}".format

### Summary Threshold

In [20]:
# NOTE(review): only the text-based frame is included here; df_bc2_gm_anno is
# computed earlier in the notebook but left out of this summary. Confirm that is intended.
df_threshold_summary = pd.concat([df_bc2_gm_text])

In [21]:
# Display the combined threshold summary table.
df_threshold_summary

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset
0,1,0,25,84,1.68,0.7,0.538462,1.0,BC2GM (text)
1,1,25,50,2910,58.2,0.816769,0.770923,0.868414,BC2GM (text)
2,1,50,75,1956,39.12,0.818354,0.780656,0.859877,BC2GM (text)
3,1,75,100,16,0.32,0.727273,0.8,0.666667,BC2GM (text)
4,-1,0,100,5000,100.0,0.817055,0.774819,0.86416,BC2GM (text)


In [22]:
# Emit the table as LaTeX source: metric columns go through `formatters`,
# the remaining float columns through `float_formatter`.
print(df_threshold_summary.to_latex(float_format=float_formatter, formatters=formatters))

\begin{tabular}{lrrrrrrrrl}
\toprule
{} &  ngram &  min &  max &   num &  percent & f-score & precision & recall &       dataset \\
\midrule
0 &      1 &    0 &   25 &    84 &     1.68 &   70.00 &     53.85 & 100.00 &  BC2GM (text) \\
1 &      1 &   25 &   50 &  2910 &    58.20 &   81.68 &     77.09 &  86.84 &  BC2GM (text) \\
2 &      1 &   50 &   75 &  1956 &    39.12 &   81.84 &     78.07 &  85.99 &  BC2GM (text) \\
3 &      1 &   75 &  100 &    16 &     0.32 &   72.73 &     80.00 &  66.67 &  BC2GM (text) \\
4 &     -1 &    0 &  100 &  5000 &   100.00 &   81.71 &     77.48 &  86.42 &  BC2GM (text) \\
\bottomrule
\end{tabular}



### Summary Split by quartile

In [23]:
# Collect the per-part score frames into one summary (currently only the BC2GM text split).
df_quartile_summary = pd.concat([df_bc2_gm_text_parts_score])

In [24]:
# Display the combined quartile summary table.
df_quartile_summary

Unnamed: 0,ngram,min,max,num,percent,f-score,precision,recall,dataset
0,1,0.0,40.325254,1250,25.0,0.798133,0.741866,0.863636,BC2GM (text)
1,1,40.327957,47.140452,1250,25.0,0.820708,0.777716,0.868731,BC2GM (text)
2,1,47.140452,54.554473,1250,25.0,0.826751,0.790136,0.866924,BC2GM (text)
3,1,54.554473,100.0,1250,25.0,0.81687,0.780074,0.857309,BC2GM (text)
4,-1,0.0,100.0,5000,100.0,0.817055,0.774819,0.86416,BC2GM (text)


In [25]:
# Emit the table as LaTeX source: metric columns go through `formatters`,
# the remaining float columns through `float_formatter`.
print(df_quartile_summary.to_latex( float_format=float_formatter, formatters=formatters))

\begin{tabular}{lrrrrrrrrl}
\toprule
{} &  ngram &   min &    max &   num &  percent & f-score & precision & recall &       dataset \\
\midrule
0 &      1 &  0.00 &  40.33 &  1250 &    25.00 &   79.81 &     74.19 &  86.36 &  BC2GM (text) \\
1 &      1 & 40.33 &  47.14 &  1250 &    25.00 &   82.07 &     77.77 &  86.87 &  BC2GM (text) \\
2 &      1 & 47.14 &  54.55 &  1250 &    25.00 &   82.68 &     79.01 &  86.69 &  BC2GM (text) \\
3 &      1 & 54.55 & 100.00 &  1250 &    25.00 &   81.69 &     78.01 &  85.73 &  BC2GM (text) \\
4 &     -1 &  0.00 & 100.00 &  5000 &   100.00 &   81.71 &     77.48 &  86.42 &  BC2GM (text) \\
\bottomrule
\end{tabular}

