In [None]:
import struct
import os
import numpy as np
import pandas as pd
import random
import plotly.graph_objects as go
import math

In [None]:
# Absolute paths of the binary ".comparisons" files analysed in this notebook.
# Each constant points at the output of one experiment (the q-value threshold,
# margin and model variant are encoded in the file name).
# NOTE(review): these are machine-specific absolute paths; the notebook will
# only run where these drives are mounted.
COMP_FILE = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng/sample_embeddings_001825.comparisons"
COMP_FILE_Q001 = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_001825.comparisons"
COMP_FILE_Q001_04 = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01_margin0.4/sample_embeddings_q0.01_margin0.4_002281.comparisons"
COMP_FILE_Q0001 = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_002281.comparisons"
COMP_FILE_Q001_LSTM40_3LAYERS = "/media/eduseiti/bigdata02/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_lstm40_3layers_002281.comparisons"

# Variants whose file names carry the "big" tag.
COMP_FILE_Q001_BIG = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.01/sample_embeddings_q0.01_big_002281.comparisons"
COMP_FILE_Q0001_BIG = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_002281.comparisons"

COMP_FILE_Q0001_BIG_LSTM40_3LAYER = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_002281.comparisons"

# Same "big" / lstm40_3layer variant, with different margin values in the file name.
COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04 = "/media/eduseiti/Seagate Expansion Drive1/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_margin0.4_002281.comparisons"
# NOTE(review): this path uses "Seagate Expansion Drive" while the others use
# "Seagate Expansion Drive1" -- confirm the mount point is intentional.
COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05 = "/media/eduseiti/Seagate Expansion Drive/eduseiti/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001/sample_embeddings_q0.001_big_lstm40_3layer_margin0.5_002281.comparisons"

COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048 = "/mnt/f633ac7c-3153-4566-a009-229a0ae5f8a1/unicamp/doutorado/bootstrap.pytorch/data/linfeng_q0.001_margin0.48/sample_embeddings_q0.001_big_lstm40_3layer_margin0.48_002281.comparisons"


# `struct` format of one record in a comparisons file:
#   B (unsigned char), I (unsigned int), B (unsigned char), I (unsigned int), d (double).
# There is no byte-order prefix, so the platform's native byte order, sizes and
# alignment (padding) apply.
# plot_comparissons_histogram labels the five decoded values as
# file_1, scannr_1, file_2, scannr_2 and cosine_similarity.
STRUCT_FIELDS = "BIBId"


In [None]:
def decode_comparisons_file(comparisons_filename):
    """Decode a binary comparisons file into a 2-D NumPy array.

    The file is read as a sequence of fixed-size records laid out according to
    the module-level ``STRUCT_FIELDS`` format; every record becomes one row.
    Records whose field 4 is NaN are reported on stdout (and still kept).

    Parameters
    ----------
    comparisons_filename : str or path-like
        Path of the binary file to decode.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(n_records, n_fields)``, where
        ``n_fields`` is the number of values in one unpacked record. An empty
        file yields shape ``(0, n_fields)`` so the result can always be wrapped
        in a DataFrame with one column per field.

    Raises
    ------
    struct.error
        If the file ends with a partial (truncated) record.
    """
    # Compile the format once instead of re-parsing it for every record.
    record_struct = struct.Struct(STRUCT_FIELDS)
    record_size = record_struct.size

    # Number of values per record, derived from the format itself by unpacking
    # a zero-filled buffer of the right size.
    n_fields = len(record_struct.unpack(bytes(record_size)))

    comparisons = []

    with open(comparisons_filename, "rb") as inputFile:
        while True:
            record = inputFile.read(record_size)

            if not record:
                break

            if len(record) < record_size:
                # Same exception type the unpack call would raise, but with a
                # message that says what actually went wrong.
                raise struct.error(
                    "truncated record in {}: got {} bytes, expected {}".format(
                        comparisons_filename, len(record), record_size))

            unpacked = record_struct.unpack(record)
            comparisons.append(unpacked)

            # Field 4 is the double of the current record format.
            if math.isnan(unpacked[4]):
                print("nan: {}".format(record))

    print("Decoded {} comparisons from {}".format(len(comparisons), comparisons_filename))

    # The reshape is a no-op for non-empty input; for an empty file it turns
    # the shape (0,) array into (0, n_fields).
    return np.array(comparisons, dtype=np.float64).reshape(-1, n_fields)

In [None]:
def plot_comparissons_histogram(comparisons_filename, sample_fraction=0.1, bins=1000):
    """Decode a comparisons file, summarise it and plot a similarity histogram.

    The file is decoded with ``decode_comparisons_file`` and wrapped in a
    DataFrame. Descriptive statistics of the ``cosine_similarity`` column
    (percentiles from 0% to 95% in 5% steps) are printed, then a histogram of a
    random sample of that column is drawn as a red plotly bar chart.

    Parameters
    ----------
    comparisons_filename : str or path-like
        Path of the binary comparisons file.
    sample_fraction : float, optional
        Fraction of the rows randomly sampled (without replacement, using the
        ``random`` module) for the histogram. Defaults to 0.1.
    bins : int, optional
        Number of histogram bins. Defaults to 1000.

    Returns
    -------
    tuple
        ``(comparisons_df, cosSim_histogram, cosSim_bin_edges)``: the full
        DataFrame, the histogram counts and the bin edges from
        ``numpy.histogram``.
    """
    comparisons = decode_comparisons_file(comparisons_filename)
    comparisons_df = pd.DataFrame(comparisons, columns = ["file_1", "scannr_1", "file_2", "scannr_2", "cosine_similarity"])

    print(comparisons_df['cosine_similarity'].describe(percentiles=list(np.round(np.arange(0.0, 1.0, 0.05), 2))))

    # Draw the sample by position; random.sample keeps the draw controllable
    # through random.seed().
    n_rows = len(comparisons_df)
    sampled_rows = random.sample(range(n_rows), int(n_rows * sample_fraction))

    # The decoder can yield NaN similarities, and np.histogram raises
    # ValueError when its input contains NaN, so drop them after sampling.
    sampled_similarities = comparisons_df['cosine_similarity'].iloc[sampled_rows].dropna()

    cosSim_histogram, cosSim_bin_edges = np.histogram(sampled_similarities.to_numpy(), bins)

    fig = go.Figure()

    # Each bar is placed at the right edge of its bin.
    fig.add_trace(go.Bar(y=cosSim_histogram,
                         x=cosSim_bin_edges[1:],
                         marker_color='red'))

    # Label the figure so it can be read on its own.
    fig.update_layout(title=os.path.basename(str(comparisons_filename)),
                      xaxis_title="cosine similarity",
                      yaxis_title="count")

    fig.show()

    return comparisons_df, cosSim_histogram, cosSim_bin_edges

### Similarities sample (10%) histogram of clustering using all distances

In [None]:
# Plot COMP_FILE; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_hist, _ = plot_comparissons_histogram(COMP_FILE)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.01

In [None]:
# Plot COMP_FILE_Q001; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q001_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.001

In [None]:
# Plot COMP_FILE_Q0001; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q0001_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and margin 0.4

In [None]:
# Plot COMP_FILE_Q001_04; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q001_04_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001_04)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 and an LSTM40 3-layer model

In [None]:
# Plot COMP_FILE_Q001_LSTM40_3LAYERS; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q001_lstm40_3layers_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001_LSTM40_3LAYERS)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.01 with bigger training dataset

In [None]:
# Plot COMP_FILE_Q001_BIG; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q001_big_hist, _ = plot_comparissons_histogram(COMP_FILE_Q001_BIG)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset

In [None]:
# Plot COMP_FILE_Q0001_BIG; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q0001_big_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model

In [None]:
# Plot COMP_FILE_Q0001_BIG_LSTM40_3LAYER; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q0001_big_lstm40_3layer_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG_LSTM40_3LAYER)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model trained with margin 0.4

In [None]:
# Plot COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q0001_big_lstm40_3layer_margin04_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M04)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model trained with margin 0.5

In [None]:
# Plot COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q0001_big_lstm40_3layer_margin05_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M05)

### Similarities sample (10%) histogram of clustering using only identifications with q < 0.001 with bigger training dataset and LSTM40 3-layer model trained with margin 0.48

In [None]:
# Plot COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048; keep only the histogram counts (the DataFrame and the bin edges go to the throwaway name `_`).
_, embeddings_q0001_big_lstm40_3layer_margin048_hist, _ = plot_comparissons_histogram(COMP_FILE_Q0001_BIG_LSTM40_3LAYER_M048)