# Analyse Tokeniser 

In [1]:
# Local directory whose files are handed to BertTokenizer.from_pretrained.
model_dir = "./tests/temp/biobert"

# S3 bucket and the train / test / validation files that get downloaded and tokenised.
bucket = "aegovan-data"
trainfile = f"s3://{bucket}/processed_dataset/train_multiclass.json"
testfile = f"s3://{bucket}/processed_dataset/test_multiclass.json"
valfile = f"s3://{bucket}/processed_dataset/val_multiclass.json"

# Name of the text column whose token counts are measured.
column = "normalised_abstract"


In [2]:
# NOTE(review): no cell visible in this notebook reads data_file; presumably it is the
# ensemble-prediction file used by the prediction-analysis section — confirm.
data_file = "test_ensemble.json"


In [3]:
from pytorch_pretrained_bert import BertTokenizer


# Build the tokeniser from the files in model_dir. do_lower_case=False is passed explicitly
# (presumably because the model vocabulary is cased — confirm).
tokeniser = BertTokenizer.from_pretrained(model_dir, do_lower_case=False)

In [4]:
import boto3

def download_single_file(bucket_name_path, local_path):
    """Download a single S3 object to ``local_path``.

    ``bucket_name_path`` has the form ``<bucket>/<key>``, optionally preceded by a
    ``<scheme>://`` prefix such as ``s3://``. Everything before the first ``/``
    (after the prefix) is the bucket name and everything after it is the object
    key. When there is no ``/`` at all, the whole string is the bucket name and
    the key falls back to ``"/"``.
    """
    # Drop the "<scheme>://" prefix when one is present.
    _, scheme_sep, after_scheme = bucket_name_path.partition("://")
    location = after_scheme if scheme_sep else bucket_name_path

    # Split "<bucket>/<key>" on the first slash.
    bucket_name, slash, key = location.partition("/")
    if not slash:
        key = "/"

    s3 = boto3.resource('s3')
    s3.Bucket(bucket_name).download_file(key, local_path)



In [5]:
import json
import pandas as pd


def get_counts(input_file, column):
    """Tokenise ``column`` of a JSON dataset and record the token count of every row.

    Args:
        input_file: Path (or anything else ``pd.read_json`` accepts) of the dataset.
        column: Name of the text column to tokenise.

    Returns:
        The loaded DataFrame with two added columns: ``tokens`` (the result of
        ``tokeniser.tokenize`` for that row) and ``token_len`` (its length).
    """
    data = pd.read_json(input_file)

    # Apply on the column instead of row-wise over the whole frame: the result is
    # always a Series (also when the frame is empty) and no per-row objects are built.
    data["tokens"] = data[column].apply(tokeniser.tokenize)
    data["token_len"] = data["tokens"].apply(len)
    return data

def get_counts_unique(input_file, column):
    """Tokenise every distinct value of ``column`` and record its token count.

    Args:
        input_file: Path (or anything else ``pd.read_json`` accepts) of the dataset.
        column: Name of the text column to de-duplicate and tokenise.

    Returns:
        A new DataFrame with one row per distinct value of ``column`` and the
        columns ``tokens`` and ``token_len``. The text itself is not included.
    """
    data = pd.read_json(input_file)

    unique_texts = pd.Series(data[column].unique(), name=column)

    # Column-wise apply always yields a Series (also for empty input), and the
    # result frame is built in one constructor call instead of growing an empty one.
    tokens = unique_texts.apply(tokeniser.tokenize)
    return pd.DataFrame({"tokens": tokens, "token_len": tokens.apply(len)})

In [6]:
import os

def download_and_get_counts(s3_file):
    """Fetch ``s3_file`` into the working directory and compute its token counts.

    The file is stored in ``.`` under the last ``/``-separated component of
    ``s3_file``. The module-level ``column`` selects the text column.

    Returns:
        ``(per_row, per_unique)``: the results of ``get_counts`` and
        ``get_counts_unique`` for the downloaded file.
    """
    file_name = s3_file.rsplit("/", 1)[-1]
    local_file = os.path.join(".", file_name)
    download_single_file(s3_file, local_file)

    return get_counts(local_file, column), get_counts_unique(local_file, column)
    


In [7]:
# Download and tokenise the training split, then summarise the token count of every row
# (count, mean, std, min, max and the 0%..90% percentiles in steps of 10).
df_train, df_train_unique = download_and_get_counts(trainfile)

df_train["token_len"].describe(percentiles=[pct / 100 for pct in range(0, 100, 10)])

count    2841.000000
mean      379.406899
std       101.442386
min       142.000000
0%        142.000000
10%       246.000000
20%       281.000000
30%       314.000000
40%       348.000000
50%       380.000000
60%       415.000000
70%       446.000000
80%       477.000000
90%       514.000000
max       740.000000
Name: token_len, dtype: float64

In [8]:
# Same summary for the training split, one row per distinct abstract.
df_train_unique["token_len"].describe(percentiles=[pct / 100 for pct in range(0, 100, 10)])

count    358.000000
mean     351.662011
std      101.722808
min      142.000000
0%       142.000000
10%      228.000000
20%      257.400000
30%      283.000000
40%      314.800000
50%      342.500000
60%      371.200000
70%      411.000000
80%      438.000000
90%      487.300000
max      740.000000
Name: token_len, dtype: float64

In [9]:
# Download and tokenise the test split; summarise token counts per distinct abstract.
df_test, df_test_unique = download_and_get_counts(testfile)
df_test_unique["token_len"].describe(percentiles=[pct / 100 for pct in range(0, 100, 10)])

count     90.000000
mean     327.222222
std       95.787426
min      127.000000
0%       127.000000
10%      211.800000
20%      247.800000
30%      279.200000
40%      297.600000
50%      323.000000
60%      339.800000
70%      369.000000
80%      398.400000
90%      439.900000
max      618.000000
Name: token_len, dtype: float64

In [10]:
# Token-count summary over every row of the test split.
df_test["token_len"].describe(percentiles=[pct / 100 for pct in range(0, 100, 10)])

count    720.000000
mean     362.105556
std       96.966708
min      127.000000
0%       127.000000
10%      243.000000
20%      282.000000
30%      317.000000
40%      333.000000
50%      363.000000
60%      375.000000
70%      400.000000
80%      431.000000
90%      486.100000
max      618.000000
Name: token_len, dtype: float64

In [11]:
# Download and tokenise the validation split; summarise token counts per distinct abstract.
df_val, df_val_unique = download_and_get_counts(valfile)
df_val_unique["token_len"].describe(percentiles=[pct / 100 for pct in range(0, 100, 10)])

count     50.000000
mean     317.340000
std      105.959486
min       57.000000
0%        57.000000
10%      213.700000
20%      229.600000
30%      259.400000
40%      288.400000
50%      311.500000
60%      332.800000
70%      347.600000
80%      374.000000
90%      479.800000
max      586.000000
Name: token_len, dtype: float64

In [12]:
# Token-count summary over every row of the validation split.
df_val["token_len"].describe(percentiles=[pct / 100 for pct in range(0, 100, 10)])

count    366.000000
mean     340.803279
std       95.829898
min       57.000000
0%        57.000000
10%      226.000000
20%      258.000000
30%      294.000000
40%      315.000000
50%      343.000000
60%      344.000000
70%      353.000000
80%      378.000000
90%      487.000000
max      586.000000
Name: token_len, dtype: float64

In [13]:
# Side-by-side token-length summary of the de-duplicated train / test / validation splits.
# NOTE(review): the prediction-analysis cells further down also read a frame called `df`,
# but they expect "actual", "ensemble_predicted" and "token_len" columns, which this summary
# table does not have. The cell that loads those predictions is not visible in the notebook.
df = pd.DataFrame({
    name: frame["token_len"].describe(percentiles=[pct / 100 for pct in range(0, 100, 10)])
    for name, frame in (
        ("train_unique", df_train_unique),
        ("test_unique", df_test_unique),
        ("val_unique", df_val_unique),
    )
})

In [14]:
# Emit the summary table, rounded to two decimals, as LaTeX source. print() is used so the
# raw markup (not its repr) appears in the output.
print(df.round(2).to_latex())


\begin{tabular}{lrrr}
\toprule
{} &  train\_unique &  test\_unique &  val\_unique \\
\midrule
count &        358.00 &        90.00 &       50.00 \\
mean  &        351.66 &       327.22 &      317.34 \\
std   &        101.72 &        95.79 &      105.96 \\
min   &        142.00 &       127.00 &       57.00 \\
0\%    &        142.00 &       127.00 &       57.00 \\
10\%   &        228.00 &       211.80 &      213.70 \\
20\%   &        257.40 &       247.80 &      229.60 \\
30\%   &        283.00 &       279.20 &      259.40 \\
40\%   &        314.80 &       297.60 &      288.40 \\
50\%   &        342.50 &       323.00 &      311.50 \\
60\%   &        371.20 &       339.80 &      332.80 \\
70\%   &        411.00 &       369.00 &      347.60 \\
80\%   &        438.00 &       398.40 &      374.00 \\
90\%   &        487.30 &       439.90 &      479.80 \\
max   &        740.00 &       618.00 &      586.00 \\
\bottomrule
\end{tabular}



### Prediction analysis by sequence length

In [None]:
from sklearn.metrics import f1_score, accuracy_score,precision_score, recall_score

def get_scores(df, actual, predicted, labels=None):
    """Return macro-averaged F1, precision and recall of ``predicted`` against ``actual``.

    Args:
        df: DataFrame holding both label columns.
        actual: Name of the ground-truth label column.
        predicted: Name of the predicted-label column.
        labels: Optional list of labels to average over, forwarded to all three
            scikit-learn metrics. ``None`` leaves the choice to scikit-learn.

    Returns:
        Tuple ``(f1, precision, recall)``.
    """
    y_true, y_pred = df[actual], df[predicted]

    # Use the same label subset for all three metrics. Previously only f1_score
    # received ``labels``, so precision and recall were averaged over a different
    # set of classes than F1 whenever ``labels`` was given.
    shared = dict(average='macro', sample_weight=None, labels=labels)

    f1 = f1_score(y_true, y_pred, **shared)
    p = precision_score(y_true, y_pred, **shared)
    r = recall_score(y_true, y_pred, **shared)

    return f1, p, r

def plot_confusion_matrix(df, actual, predicted, save_file=None):
    """Plot a normalised and a raw-count confusion matrix for two label columns.

    Args:
        df: DataFrame holding both label columns.
        actual: Name of the ground-truth label column.
        predicted: Name of the predicted-label column.
        save_file: Optional path passed to ``plt.savefig`` before the plots are shown.
    """
    import matplotlib.pyplot as plt
    import scikitplot as skplt

    # Normalised matrix first, then the raw counts.
    for normalise in (True, False):
        skplt.metrics.plot_confusion_matrix(
            df[actual],
            df[predicted],
            normalize=normalise,
            figsize=(4, 4),
            x_tick_rotation=90,
        )

    # NOTE(review): savefig is called once, after both plots were drawn, so presumably
    # only the last (raw-count) figure ends up in save_file — confirm this is intended.
    if save_file:
        plt.savefig(save_file, bbox_inches="tight")

    plt.show()


In [None]:
# Every label that occurs in "actual" except "other". The excluded label is written as a
# set literal: set("other") would build the character set {'o', 't', 'h', 'e', 'r'} and
# leave "other" in the result. sorted() makes the order reproducible between runs.
positive_labels = sorted(set(df["actual"].unique().tolist()) - {"other"})

In [None]:
# (rows, columns) of the full, unfiltered frame.
df.shape

In [None]:
# Macro F1 / precision / recall over all labels, for rows with more than 510 tokens.
# NOTE(review): the counting cells select the long group with "token_len >= 510" and the
# short group is "token_len < 510", so rows with exactly 510 tokens are scored in neither
# group — confirm the intended boundary.
get_scores(df.query("token_len > 510"), "actual", "ensemble_predicted")



In [None]:
# Confusion matrices for rows with more than 510 tokens; saved to len_long_confusion.png.
plot_confusion_matrix(
    df.query("token_len > 510"),
    "actual",
    "ensemble_predicted",
    "len_long_confusion.png",
)


In [None]:
# Scores for rows with more than 510 tokens, with positive_labels passed as the label subset.
get_scores(df.query("token_len > 510"), "actual", "ensemble_predicted", positive_labels)



In [None]:
# Confusion matrices for rows with fewer than 510 tokens; saved to len_short_confusion.png.
# NOTE(review): the matching long-sequence plot filters on "token_len > 510", so rows with
# exactly 510 tokens appear in neither plot — confirm the intended boundary.
plot_confusion_matrix(
    df.query("token_len < 510"),
    "actual",
    "ensemble_predicted",
    "len_short_confusion.png",
)



In [None]:
# Count every (actual, predicted) pair among rows with more than 510 tokens.
# The columns are selected with a list: selecting several columns from a groupby with a
# bare tuple (["a", "b"] without the inner list) is rejected by pandas >= 2.0.
df.query("token_len > 510").groupby(["actual", "ensemble_predicted"])[["actual", "ensemble_predicted"]].count()

In [None]:
# (rows, columns) of the full, unfiltered frame.
df.shape

In [None]:
# Number of distinct abstracts with at least 510 tokens.
# NOTE(review): the scoring cells for the long group filter on "token_len > 510", so this
# count can include abstracts (exactly 510 tokens) that were not scored there — confirm.
df.query("token_len >= 510")["normalised_abstract"].nunique()

In [None]:
# Number of distinct abstracts with fewer than 510 tokens.
df.query("token_len < 510")["normalised_abstract"].nunique()

In [None]:
# (rows, columns) of the subset with at least 510 tokens.
# NOTE(review): uses ">= 510" whereas the scoring cells for the long group use "> 510" — confirm.
df.query("token_len >= 510").shape

In [None]:
# Macro F1 / precision / recall over all labels, for rows with fewer than 510 tokens.
get_scores(df.query("token_len < 510"), "actual", "ensemble_predicted")

In [None]:
# Scores for rows with fewer than 510 tokens, with positive_labels passed as the label subset.
get_scores(df.query("token_len < 510"), "actual", "ensemble_predicted", positive_labels)

In [None]:
# Macro F1 / precision / recall over all labels on the full frame, as the baseline for
# the length-filtered scores.
get_scores(df, "actual", "ensemble_predicted")