# Analyse Tokeniser 

In [1]:
# Path handed to BertTokenizer.from_pretrained when the tokeniser is loaded.
model_dir = "./tests/temp/biobert"

# S3 bucket and the three dataset splits stored under processed_dataset/.
bucket = "aegovan-data"
trainfile = "s3://" + bucket + "/processed_dataset/train_multiclass.json"
testfile = "s3://" + bucket + "/processed_dataset/test_multiclass.json"
valfile = "s3://" + bucket + "/processed_dataset/val_multiclass.json"

# Name of the text column that is tokenised and measured.
column = "normalised_abstract"


In [2]:
# NOTE(review): data_file is not referenced by any other visible cell; presumably it is
# the input for the prediction-analysis section further down — confirm and load it there.
data_file="test_ensemble.json"


In [3]:
from pytorch_pretrained_bert import BertTokenizer


# Load the tokeniser from model_dir. do_lower_case=False is passed explicitly
# (presumably because the vocabulary is cased — confirm against the model files).
tokeniser = BertTokenizer.from_pretrained(model_dir, do_lower_case=False)

In [4]:
import boto3

def download_single_file(bucket_name_path, local_path):
    """Download a single S3 object to a local file.

    Args:
        bucket_name_path: Object location, either ``s3://bucket/key`` or
            ``bucket/key``. Everything up to and including the first ``://``
            is dropped before the bucket and key are split.
        local_path: Destination path passed to boto3's ``download_file``.

    Raises:
        ValueError: If the location contains no bucket name or no object key.
    """
    # Drop the scheme prefix (e.g. "s3://") if present.
    _, scheme_sep, remainder = bucket_name_path.partition("://")
    location = remainder if scheme_sep else bucket_name_path

    # The first "/" separates the bucket name from the object key.
    bucket_name, _, key = location.partition("/")
    if not bucket_name or not key:
        # Fail here with a clear message instead of sending a meaningless
        # key such as "/" to S3 and surfacing an opaque service error.
        raise ValueError(
            "Expected an S3 location of the form 's3://bucket/key', got {!r}".format(bucket_name_path)
        )

    s3 = boto3.resource('s3')
    s3.Bucket(bucket_name).download_file(key, local_path)



In [5]:
import json
import pandas as pd


def get_counts(input_file, column):
    """Tokenise one text column of a JSON data set and record token counts.

    Uses the module-level ``tokeniser``.

    Args:
        input_file: JSON input accepted by ``pd.read_json``.
        column: Name of the text column to tokenise.

    Returns:
        The loaded DataFrame with two added columns: ``tokens`` (the
        tokeniser output for each row) and ``token_len`` (its length).
    """
    data = pd.read_json(input_file)

    # Apply on the column itself rather than row-wise over the whole frame:
    # only one column is needed, and a Series.apply always yields one value
    # per row, including for an empty frame.
    data["tokens"] = data[column].apply(tokeniser.tokenize)
    data["token_len"] = data["tokens"].apply(len)
    return data

def get_counts_unique(input_file, column):
    """Tokenise the distinct values of one text column and record token counts.

    Uses the module-level ``tokeniser``.

    Args:
        input_file: JSON input accepted by ``pd.read_json``.
        column: Name of the text column whose unique values are tokenised.

    Returns:
        A new DataFrame with one row per unique value of ``column`` and two
        columns: ``tokens`` (the tokeniser output) and ``token_len`` (its length).
    """
    data = pd.read_json(input_file)

    # De-duplicate first so each distinct text is tokenised exactly once.
    unique_texts = pd.Series(data[column].unique())

    # Column-wise apply keeps one list of tokens per text and also works
    # when there are no rows at all.
    tokens = unique_texts.apply(tokeniser.tokenize)
    return pd.DataFrame({"tokens": tokens, "token_len": tokens.apply(len)})

In [14]:
import os

def download_and_get_counts(s3_file):
    """Fetch an S3 data set into the working directory and tokenise it.

    The object is saved under its own file name in the current directory,
    then processed for the module-level ``column``.

    Returns:
        A pair ``(per_row, per_unique_text)``: the results of ``get_counts``
        and ``get_counts_unique`` on the downloaded file.
    """
    file_name = s3_file.rsplit("/", 1)[-1]
    local_file = os.path.join(".", file_name)
    download_single_file(s3_file, local_file)

    return get_counts(local_file, column), get_counts_unique(local_file, column)
    


In [10]:
# Download the training split and tokenise it: once for every row, once per unique text.
df_train, df_train_unique  = download_and_get_counts(trainfile)

# Token-length summary over all training rows, with percentiles 0%, 10%, ..., 90%.
df_train["token_len"].describe(percentiles=[i/100 for i in range(0,100,10)])

count    3223.000000
mean      375.686007
std       102.796268
min        57.000000
0%         57.000000
10%       253.000000
20%       281.000000
30%       311.000000
40%       339.000000
50%       368.000000
60%       395.000000
70%       430.000000
80%       475.000000
90%       513.000000
max       795.000000
Name: token_len, dtype: float64

In [11]:
# Same summary, restricted to the unique training texts (each text counted once).
df_train_unique["token_len"].describe(percentiles=[i/100 for i in range(0,100,10)])

count    459.000000
mean     348.372549
std      105.209868
min       57.000000
0%        57.000000
10%      228.000000
20%      258.000000
30%      280.400000
40%      311.000000
50%      333.000000
60%      362.000000
70%      396.200000
80%      432.000000
90%      488.200000
max      795.000000
Name: token_len, dtype: float64

In [15]:
# Download and tokenise the test split, then summarise token lengths of its unique texts.
df_test, df_test_unique  = download_and_get_counts(testfile)
df_test_unique["token_len"].describe(percentiles=[i/100 for i in range(0,100,10)])

count    116.000000
mean     338.974138
std      103.532432
min      175.000000
0%       175.000000
10%      223.000000
20%      247.000000
30%      275.000000
40%      297.000000
50%      317.500000
60%      344.000000
70%      381.500000
80%      417.000000
90%      513.000000
max      618.000000
Name: token_len, dtype: float64

In [16]:
# Token-length summary over all test rows (texts that occur in several rows count several times).
df_test["token_len"].describe(percentiles=[i/100 for i in range(0,100,10)])

count    894.000000
mean     378.592841
std      110.356765
min      175.000000
0%       175.000000
10%      236.000000
20%      283.600000
30%      302.000000
40%      338.000000
50%      354.000000
60%      392.000000
70%      431.000000
80%      520.000000
90%      533.000000
max      618.000000
Name: token_len, dtype: float64

In [19]:
# Download and tokenise the validation split, then summarise token lengths of its unique texts.
df_val, df_val_unique  = download_and_get_counts(valfile)
df_val_unique["token_len"].describe(percentiles=[i/100 for i in range(0,100,10)])

count     65.000000
mean     349.815385
std      102.990426
min      142.000000
0%       142.000000
10%      215.000000
20%      254.400000
30%      298.800000
40%      320.200000
50%      347.000000
60%      362.400000
70%      402.800000
80%      438.400000
90%      478.400000
max      586.000000
Name: token_len, dtype: float64

In [20]:
# Token-length summary over all validation rows.
df_val["token_len"].describe(percentiles=[i/100 for i in range(0,100,10)])

count    420.000000
mean     357.330952
std       90.513505
min      142.000000
0%       142.000000
10%      237.000000
20%      284.000000
30%      315.000000
40%      330.000000
50%      358.000000
60%      363.000000
70%      398.000000
80%      438.000000
90%      464.000000
max      586.000000
Name: token_len, dtype: float64

In [24]:
# One column of token-length statistics per split, computed on unique texts only.
# NOTE(review): later cells read "actual", "token_len" and "normalised_abstract"
# from a frame also called `df`; this summary table has none of those columns —
# confirm which frame those cells expect and give one of the two a distinct name.
df = pd.DataFrame({
    split_name: split_frame["token_len"].describe(percentiles=[i/100 for i in range(0,100,10)])
    for split_name, split_frame in (
        ("train_unique", df_train_unique),
        ("test_unique", df_test_unique),
        ("val_unique", df_val_unique),
    )
})

In [32]:
# Print the summary table, rounded to 2 decimals, as LaTeX source for copy-pasting.
print(df.round(2).to_latex())


\begin{tabular}{lrrr}
\toprule
{} &  train\_unique &  test\_unique &  val\_unique \\
\midrule
count &        459.00 &       116.00 &       65.00 \\
mean  &        348.37 &       338.97 &      349.82 \\
std   &        105.21 &       103.53 &      102.99 \\
min   &         57.00 &       175.00 &      142.00 \\
0\%    &         57.00 &       175.00 &      142.00 \\
10\%   &        228.00 &       223.00 &      215.00 \\
20\%   &        258.00 &       247.00 &      254.40 \\
30\%   &        280.40 &       275.00 &      298.80 \\
40\%   &        311.00 &       297.00 &      320.20 \\
50\%   &        333.00 &       317.50 &      347.00 \\
60\%   &        362.00 &       344.00 &      362.40 \\
70\%   &        396.20 &       381.50 &      402.80 \\
80\%   &        432.00 &       417.00 &      438.40 \\
90\%   &        488.20 &       513.00 &      478.40 \\
max   &        795.00 &       618.00 &      586.00 \\
\bottomrule
\end{tabular}



### Prediction analysis by sequence length

In [None]:
from sklearn.metrics import f1_score, accuracy_score,precision_score, recall_score

def get_scores(df, actual, predicted, labels=None):
    """Compute macro-averaged F1, precision and recall for two label columns.

    Args:
        df: DataFrame holding the true and predicted labels.
        actual: Name of the column with the true labels.
        predicted: Name of the column with the predicted labels.
        labels: Optional list of labels to include in the macro average.
            It is forwarded to all three metrics; ``None`` leaves the
            choice of labels to scikit-learn.

    Returns:
        A tuple ``(f1, precision, recall)``.
    """
    y_true, y_pred = df[actual], df[predicted]

    # The same label subset has to reach every metric. Restricting only F1
    # would average precision and recall over a different set of classes,
    # so the three numbers would not describe the same thing.
    shared_args = dict(average='macro', sample_weight=None, labels=labels)

    f1 = f1_score(y_true, y_pred, **shared_args)
    p = precision_score(y_true, y_pred, **shared_args)
    r = recall_score(y_true, y_pred, **shared_args)

    return f1, p, r

def plot_confusion_matrix(df, actual, predicted, save_file=None):
    """Draw a normalised and a raw-count confusion matrix for two label columns.

    Args:
        df: DataFrame holding the true and predicted labels.
        actual: Name of the column with the true labels.
        predicted: Name of the column with the predicted labels.
        save_file: Optional output path; when truthy, ``plt.savefig`` is
            called with it after both matrices have been drawn.
    """
    import matplotlib.pyplot as plt
    import scikitplot as skplt

    y_true = df[actual]
    y_pred = df[predicted]

    # Normalised matrix first, raw counts second.
    for normalise in (True, False):
        skplt.metrics.plot_confusion_matrix(
            y_true, y_pred, normalize=normalise, figsize=(4, 4), x_tick_rotation=90
        )

    # NOTE(review): savefig writes pyplot's current figure only — presumably the
    # second (raw-count) plot; confirm that this is the one wanted on disk.
    if save_file:
        plt.savefig(save_file, bbox_inches="tight")

    plt.show()


In [None]:
# Every label that occurs in "actual" except the class "other".
# The label is removed with a one-element set literal: set("other") would build
# the character set {'o', 't', 'h', 'e', 'r'} and leave "other" in the list.
# NOTE(review): `df` must be the frame with an "actual" column here; the last visible
# assignment to `df` is the describe-summary table — confirm the missing load step.
positive_labels = list(set(df["actual"].unique().tolist()) - {"other"})

In [None]:
# Number of (rows, columns) in the frame being analysed.
df.shape

In [None]:
# Macro F1 / precision / recall on long inputs only.
# NOTE(review): 510 is presumably the 512-token model limit minus two special tokens — confirm.
get_scores ( df.query("token_len > 510"), "actual" , "ensemble_predicted")



In [None]:
# Confusion matrices for the rows with more than 510 tokens; the figure is saved to len_long_confusion.png.
plot_confusion_matrix(df.query("token_len > 510"), "actual" , "ensemble_predicted", "len_long_confusion.png")


In [None]:
# Scores on long inputs again, this time passing positive_labels as the label subset.
get_scores ( df.query("token_len > 510"), "actual" , "ensemble_predicted", positive_labels)



In [None]:
# Confusion matrices for the rows with fewer than 510 tokens; the figure is saved to len_short_confusion.png.
# NOTE(review): the long-input cells filter on token_len > 510, so rows with exactly
# 510 tokens fall into neither subset — confirm the intended boundary.
plot_confusion_matrix(df.query("token_len < 510"), "actual" , "ensemble_predicted", "len_short_confusion.png")



In [None]:
# Count rows per (actual, predicted) label pair among the inputs with more than 510 tokens.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# column names is deprecated in pandas and no longer accepted by recent versions.
df.query("token_len > 510").groupby(["actual", "ensemble_predicted"])[["actual", "ensemble_predicted"]].count()

In [None]:
# Number of (rows, columns) in the frame, shown again for comparison with the subset sizes below.
df.shape

In [None]:
# Number of distinct abstracts among rows with at least 510 tokens.
# NOTE(review): this uses >= 510 while the score and plot cells for long inputs use > 510 — confirm.
df.query("token_len >= 510")["normalised_abstract"].nunique()

In [None]:
# Number of distinct abstracts among rows with fewer than 510 tokens.
df.query("token_len < 510")["normalised_abstract"].nunique()

In [None]:
# (rows, columns) of the subset with at least 510 tokens.
# NOTE(review): this uses >= 510 while the score and plot cells for long inputs use > 510 — confirm.
df.query("token_len >= 510").shape

In [None]:
# Macro F1 / precision / recall on the rows with fewer than 510 tokens.
# NOTE(review): rows with exactly 510 tokens are excluded here and by the "> 510" cells — confirm.
get_scores ( df.query("token_len < 510"), "actual" , "ensemble_predicted")

In [None]:
# Scores on the rows with fewer than 510 tokens, passing positive_labels as the label subset.
get_scores ( df.query("token_len < 510"), "actual" , "ensemble_predicted", positive_labels)

In [None]:
# Macro F1 / precision / recall over every row, regardless of token length.
get_scores ( df, "actual" , "ensemble_predicted")