In [1]:
from __future__ import absolute_import
from __future__ import print_function
import tensorflow as tf
from tensorflow import keras as keras
import numpy as np
import random
from keras.models import Model
from keras.layers import Input, Flatten, Dense, Lambda, Reshape
from keras.layers import AveragePooling1D,Conv1D
from keras import backend as K
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold 
from scipy.spatial.distance import euclidean






In [2]:
# Threshold values for all metrics (0 to 1 with 0.05 increments)
threshold_values = np.arange(0, 1.05, 0.05)
print(f"Threshold values: {threshold_values}")
print(f"Total threshold values: {len(threshold_values)}")

Threshold values: [0.   0.05 0.1  0.15 0.2  0.25 0.3  0.35 0.4  0.45 0.5  0.55 0.6  0.65
 0.7  0.75 0.8  0.85 0.9  0.95 1.  ]
Total threshold values: 21


In [3]:
#Siamese Network Implementation 
num_classes = 9
epochs = 20
 
def euclid_dis(vects):
  x,y = vects
  sum_square = K.sum(K.square(x-y), axis=1, keepdims=True)
  return K.sqrt(K.maximum(sum_square, K.epsilon()))
 
def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)
 
def contrastive_loss(y_true, y_pred):
    y_true=tf.cast(y_true, tf.float32)
    y_pred=tf.cast(y_pred, tf.float32)
    margin = 1
    square_pred = K.square(y_pred)
    margin_square = K.square(K.maximum(margin - y_pred, 0))
    return K.mean(y_true * square_pred + (1 - y_true) * margin_square)


def create_pairs_new3(x, digit_indices):
    pairs = []
    labels = []
    
    for d in range(num_classes):
        n=min([len(digit_indices[d])]) -1
        for i in range(n):
            randomIndiceTrue=-1
            while True:
                if randomIndiceTrue != i:
                    break
                else:
                    randomIndiceTrue = random.randrange(0, min([len(digit_indices[d])]))
            
            z1, z2 = digit_indices[d][i], digit_indices[d][randomIndiceTrue]
            pairs += [[x[z1], x[z2]]]
            
            inc = random.randrange(1, num_classes)
            dn = (d + inc) % num_classes
            
            randomIndiceFalse = random.randrange(0, min([len(digit_indices[dn])]))
            
            z1, z2 = digit_indices[d][i], digit_indices[dn][randomIndiceFalse]
            pairs += [[x[z1], x[z2]]]
            labels += [1,0]
    return np.array(pairs), np.array(labels)



def create_base_net_new2(input_shape):
    height=439
    depth=1
    num_classes=9
    input = Input(shape = input_shape)
    x = Reshape((height, depth))(input)
    x = Conv1D(12, 3, strides=1, padding='valid', activation='relu')(x)
    x = AveragePooling1D(3)(x)
    x = Conv1D(8, 3, strides=1, padding='valid', activation='relu')(x)
    x = AveragePooling1D(3)(x)
    x = Conv1D(6, 3, strides=1, padding='valid', activation='relu')(x)
    x = Flatten()(x)
    x = Dense(num_classes, activation = 'relu')(x)
    model = Model(input, x)
    model.summary()
    return model


def get_f1(y_true, y_pred): 
    pred = K.cast(y_pred < 0.5, y_true.dtype)
    true_positives = K.sum(K.round(K.clip(y_true * pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall)
    return f1_val

def compute_f1(y_true, y_pred):
    pred = y_pred.ravel() < 0.5
    pred=pred*1
    TN,FP,FN,TP=confusion_matrix(y_true, pred).ravel()

    recall = TP/(TP+FN)
    precision=TP/(TP+FP)

    f1_val = 2*(precision*recall)/(precision+recall)
    return f1_val

    

In [4]:
def compute_cosine_f1(t_pairs, t_y, cosine_threshold=0.4):
    cTP=0
    cFP=0
    cFN=0
    cTN=0
    for i in range(np.shape(t_y)[0]):
        cos_result=cosine_similarity(t_pairs[i, 0].reshape(1, -1), t_pairs[i, 1].reshape(1, -1))
        if t_y[i]==1 and cos_result>=cosine_threshold:
            cTP=cTP+1
        elif t_y[i]!=1 and cos_result>=cosine_threshold:
            cFP=cFP+1
        elif t_y[i]==1 and cos_result<cosine_threshold:
            cFN=cFN+1
        elif t_y[i]!=1 and cos_result<cosine_threshold:
            cTN=cTN+1
        
    recall = cTP/(cTP+cFN) if (cTP+cFN) > 0 else 0
    
    precision=cTP/(cTP+cFP) if (cTP+cFP) > 0 else 0
    
    if (precision+recall) == 0:
        f1_cosine_val = 0
    else:
        f1_cosine_val = 2*(precision*recall)/(precision+recall)
    return f1_cosine_val



In [5]:
def manhattan_distance(point1, point2):
    return np.sum(np.abs(point1 - point2))

def min_max_normalize(data):
    min_val = np.min(data)
    max_val = np.max(data)
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data

def compute_manhattan_f1(t_pairs, t_y, manhattan_threshold=0.85):
    cTP = 0
    cFP = 0
    cFN = 0
    cTN = 0
    
    # Max-Min Normalizasyonu
    t_pairs_normalized = np.array([min_max_normalize(pair) for pair in t_pairs])
    
    for i in range(np.shape(t_y)[0]):
        result = manhattan_distance(t_pairs_normalized[i, 0], t_pairs_normalized[i, 1])
        if t_y[i] == 1 and result >= manhattan_threshold:
            cTP += 1
        elif t_y[i] != 1 and result >= manhattan_threshold:
            cFP += 1
        elif t_y[i] == 1 and result < manhattan_threshold:
            cFN += 1
        elif t_y[i] != 1 and result < manhattan_threshold:
            cTN += 1
        
    recall = cTP / (cTP + cFN) if (cTP + cFN) > 0 else 0
    precision = cTP / (cTP + cFP) if (cTP + cFP) > 0 else 0
    
    if (precision + recall) == 0:
        f1_manhattan_val = 0
    else:
        f1_manhattan_val = 2 * (precision * recall) / (precision + recall)
    
    return f1_manhattan_val


In [6]:
def euclidean_distance(point1, point2):
    return np.linalg.norm(point1 - point2)

def min_max_normalize(data):
    min_val = np.min(data)
    max_val = np.max(data)
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data

def compute_euclidean_f1(t_pairs, t_y, euclidean_threshold=0.7):
    cTP = 0
    cFP = 0
    cFN = 0
    cTN = 0
    
    # Max-Min Normalizasyonu
    t_pairs_normalized = np.array([min_max_normalize(pair) for pair in t_pairs])
    
    for i in range(np.shape(t_y)[0]):
        result = euclidean_distance(t_pairs_normalized[i, 0], t_pairs_normalized[i, 1])
        if t_y[i] == 1 and result >= euclidean_threshold:
            cTP += 1
        elif t_y[i] != 1 and result >= euclidean_threshold:
            cFP += 1
        elif t_y[i] == 1 and result < euclidean_threshold:
            cFN += 1
        elif t_y[i] != 1 and result < euclidean_threshold:
            cTN += 1
        
    recall = cTP / (cTP + cFN) if (cTP + cFN) > 0 else 0
    precision = cTP / (cTP + cFP) if (cTP + cFP) > 0 else 0
    
    if (precision + recall) == 0:
        f1_euclidean_val = 0
    else:
        f1_euclidean_val = 2 * (precision * recall) / (precision + recall)
    
    return f1_euclidean_val


In [7]:
from scipy.spatial.distance import canberra

def min_max_normalize(data):
    min_val = np.min(data)
    max_val = np.max(data)
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data

def compute_canberra_f1(t_pairs, t_y, canberra_threshold=0.85):
    cTP = 0
    cFP = 0
    cFN = 0
    cTN = 0
    
    for i in range(np.shape(t_y)[0]):
        # Apply Max-Min normalization to each pair
        t_pair_normalized = np.array([min_max_normalize(pair) for pair in t_pairs[i]])
        
        # Calculate Canberra distance after normalization
        result = canberra(t_pair_normalized[0].flatten(), t_pair_normalized[1].flatten())  # Flatten to 1-D arrays
        
        if t_y[i] == 1 and result >= canberra_threshold:
            cTP += 1
        elif t_y[i] != 1 and result >= canberra_threshold:
            cFP += 1
        elif t_y[i] == 1 and result < canberra_threshold:
            cFN += 1
        elif t_y[i] != 1 and result < canberra_threshold:
            cTN += 1
        
    recall = cTP / (cTP + cFN) if (cTP + cFN) > 0 else 0
    precision = cTP / (cTP + cFP) if (cTP + cFP) > 0 else 0
    
    if (precision + recall) == 0:
        f1_canberra_val = 0
    else:
        f1_canberra_val = 2 * (precision * recall) / (precision + recall)
    
    return f1_canberra_val


In [8]:
from scipy.stats import pearsonr

def min_max_normalize(data):
    min_val = np.min(data)
    max_val = np.max(data)
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data

def compute_pearsonr_f1(t_pairs, t_y, pearsonr_threshold=0.3):
    cTP = 0
    cFP = 0
    cFN = 0
    cTN = 0
    
    for i in range(np.shape(t_y)[0]):
        # Apply min-max normalization to each pair
        t_pair_normalized = np.array([min_max_normalize(pair) for pair in t_pairs[i]])
        
        # Flatten the 2-D arrays to 1-D arrays
        t_pair_normalized_1d = t_pair_normalized[0].flatten()
        t_pair_normalized_2d = t_pair_normalized[1].flatten()
        
        # Calculate the Pearson correlation coefficient
        result, _ = pearsonr(t_pair_normalized_1d, t_pair_normalized_2d)
        
        if t_y[i] == 1 and abs(result) >= pearsonr_threshold:
            cTP += 1
        elif t_y[i] != 1 and abs(result) >= pearsonr_threshold:
            cFP += 1
        elif t_y[i] == 1 and abs(result) < pearsonr_threshold:
            cFN += 1
        elif t_y[i] != 1 and abs(result) < pearsonr_threshold:
            cTN += 1
        
    recall = cTP / (cTP + cFN) if (cTP + cFN) > 0 else 0
    precision = cTP / (cTP + cFP) if (cTP + cFP) > 0 else 0
    
    if (precision + recall) == 0:
        f1_pearsonr_val = 0
    else:
        f1_pearsonr_val = 2 * (precision * recall) / (precision + recall)
    
    return f1_pearsonr_val


In [9]:
from scipy.spatial.distance import braycurtis

def min_max_normalize(data):
    min_val = np.min(data)
    max_val = np.max(data)
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data

def compute_bray_curtis_f1(t_pairs, t_y, bray_curtis_threshold=0.7):
    cTP = 0
    cFP = 0
    cFN = 0
    cTN = 0
    
    for i in range(np.shape(t_y)[0]):
        # Apply min-max normalization to each pair
        t_pair_normalized = np.array([min_max_normalize(pair) for pair in t_pairs[i]])
        
        # Flatten the arrays to 1-D
        t_pair_normalized_1d = t_pair_normalized[0].flatten()
        t_pair_normalized_2d = t_pair_normalized[1].flatten()
        
        # Calculate the Bray-Curtis dissimilarity
        result = braycurtis(t_pair_normalized_1d, t_pair_normalized_2d)
        
        if t_y[i] == 1 and result <= bray_curtis_threshold:
            cTP += 1
        elif t_y[i] != 1 and result <= bray_curtis_threshold:
            cFP += 1
        elif t_y[i] == 1 and result > bray_curtis_threshold:
            cFN += 1
        elif t_y[i] != 1 and result > bray_curtis_threshold:
            cTN += 1
        
    recall = cTP / (cTP + cFN) if (cTP + cFN) > 0 else 0
    precision = cTP / (cTP + cFP) if (cTP + cFP) > 0 else 0
    
    if (precision + recall) == 0:
        f1_bray_curtis_val = 0
    else:
        f1_bray_curtis_val = 2 * (precision * recall) / (precision + recall)
    
    return f1_bray_curtis_val


In [10]:
def min_max_normalize(data):
    min_val = np.min(data)
    max_val = np.max(data)
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data

def calculate_jaccard_similarity(set1, set2):
    intersection = np.sum(set1 & set2)
    union = np.sum(set1 | set2)
    if union == 0:
        return 0.0
    return intersection / union

def compute_jaccard_f1(t_pairs, t_y, jaccard_threshold=0.4):
    cTP = 0
    cFP = 0
    cFN = 0
    cTN = 0
    
    for i in range(np.shape(t_y)[0]):
        # Apply min-max normalization to each pair
        t_pair_normalized = np.array([min_max_normalize(pair) for pair in t_pairs[i]])
        
        # Flatten the arrays to 1-D and convert to binary (0 or 1)
        t_pair_normalized_1d = (t_pair_normalized[0] >= 0.5).astype(int)
        t_pair_normalized_2d = (t_pair_normalized[1] >= 0.5).astype(int)
        
        # Calculate the Jaccard similarity
        result = calculate_jaccard_similarity(t_pair_normalized_1d, t_pair_normalized_2d)
        
        if t_y[i] == 1 and result >= jaccard_threshold:
            cTP += 1
        elif t_y[i] != 1 and result >= jaccard_threshold:
            cFP += 1
        elif t_y[i] == 1 and result < jaccard_threshold:
            cFN += 1
        elif t_y[i] != 1 and result < jaccard_threshold:
            cTN += 1
        
    recall = cTP / (cTP + cFN) if (cTP + cFN) > 0 else 0
    precision = cTP / (cTP + cFP) if (cTP + cFP) > 0 else 0
    
    if (precision + recall) == 0:
        f1_jaccard_val = 0
    else:
        f1_jaccard_val = 2 * (precision * recall) / (precision + recall)
    
    return f1_jaccard_val


In [11]:
def min_max_normalize(data):
    min_val = np.min(data)
    max_val = np.max(data)
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data


def compute_hamming_f1(t_pairs, t_y, hamming_threshold=0.9):
    cTP = 0
    cFP = 0
    cFN = 0
    cTN = 0
    
    for i in range(np.shape(t_y)[0]):
        # Apply min-max normalization to each pair
        t_pair_normalized = np.array([min_max_normalize(pair) for pair in t_pairs[i]])
        
        # Flatten the arrays to 1-D and convert to binary (0 or 1)
        t_pair_normalized_1d = (t_pair_normalized[0] >= 0.5).astype(int)
        t_pair_normalized_2d = (t_pair_normalized[1] >= 0.5).astype(int)
        
        # Calculate Hamming distance
        result = np.sum(t_pair_normalized_1d != t_pair_normalized_2d) / len(t_pair_normalized_1d)
        
        if t_y[i] == 1 and result <= hamming_threshold:
            cTP += 1
        elif t_y[i] != 1 and result <= hamming_threshold:
            cFP += 1
        elif t_y[i] == 1 and result > hamming_threshold:
            cFN += 1
        elif t_y[i] != 1 and result > hamming_threshold:
            cTN += 1
        
    recall = cTP / (cTP + cFN) if (cTP + cFN) > 0 else 0
    precision = cTP / (cTP + cFP) if (cTP + cFP) > 0 else 0
    
    if (precision + recall) == 0:
        f1_hamming_val = 0
    else:
        f1_hamming_val = 2 * (precision * recall) / (precision + recall)
    
    return f1_hamming_val


In [12]:
#Dataset
df=pd.read_csv('2_case_study_no_name_new.csv',delimiter=';',header=None)
df
# Table names are anonymized.
# First column: TABLE_UniqueTableNumber_ModuleNumber
# Second column: Class number. Equal to Module Number-1. There are 12 classes in totally.
# Third column: Unique table number

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,432,433,434,435,436,437,438,439,440,441
0,TABLE_1_0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TABLE_2_0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TABLE_3_0,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TABLE_4_0,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TABLE_5_0,0,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,TABLE_86_8,8,86,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86,TABLE_87_8,8,87,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87,TABLE_88_8,8,88,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88,TABLE_89_8,8,89,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Separation X and Y values
Y = df[1]

Y=Y.values

# Define groups for GroupKFold (using unique table numbers from column 2)
groups = df[2].values

#First three columns removed.
X = df
X = X.drop(0,axis=1)
X = X.drop(1,axis=1)
X = X.drop(2,axis=1)

X=X.values

In [14]:
# Siamese Neural Network Training & Validation Step (Process Group 1)


kfold = GroupKFold(n_splits=5)
cvTrainscores = []
cvTestscores = []
cvi=0

cvTrainscoresCosine = []
cvTestscoresCosine = []

cvTrainscoresManhattan = []
cvTestscoresManhattan = []


cvTrainscoresEuclidean = []
cvTestscoresEuclidean = []

cvTrainscoresPearsonR = []
cvTestscoresPearsonR = []

cvTrainscoresBrayCurtis = []
cvTestscoresBrayCurtis = []

cvTrainscoresJaccard = []
cvTestscoresJaccard= []

cvTrainscoresHamming = []
cvTestscoresHamming = []

cvTrainscoresCanberra = []
cvTestscoresCanberra = []


for train, test in kfold.split(X, Y, groups):

    cvi=cvi+1
    print("n-fold: "+str(cvi))
    
    x_train = X[train]
    x_test = X[test]
    y_train = Y[train]
    y_test = Y[test]
    
    x_train = x_train.reshape(x_train.shape[0], 439, 1)
    x_test = x_test.reshape(x_test.shape[0], 439, 1)
    input_shape = (439, 1)
    print(x_train.shape)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    y_train = y_train.astype('float32')
    y_test = y_test.astype('float32')
    input_shape = (439,1)
    
    print(input_shape)
    
    # create training+test positive and negative pairs
    digit_indices = [np.where(y_train == i)[0] for i in range(num_classes)]
    tr_pairs, tr_y = create_pairs_new3(x_train, digit_indices)
    
    digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
    te_pairs, te_y = create_pairs_new3(x_test, digit_indices)
    
    # network definition
    base_network = create_base_net_new2(input_shape)
    
    print(input_shape)

    input_a = Input(shape=input_shape)
    input_b = Input(shape=input_shape)
    
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    
    distance = Lambda(euclid_dis,
                  output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    
    model = Model([input_a, input_b], distance)

    #train
    model.compile(loss=contrastive_loss, optimizer='adam', metrics=[get_f1])

    model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
          batch_size=16,
          epochs=epochs,
          validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y))
    
    # compute final accuracy on training and test sets for n-fold
    p=2
    y_pred = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
    tr_acc = compute_f1(tr_y, y_pred)
    tr_f1_cosine = compute_cosine_f1(tr_pairs,tr_y)  # Uses default threshold
    tr_f1_manhattan = compute_manhattan_f1(tr_pairs,tr_y)  # Uses default threshold
    tr_f1_euclidean = compute_euclidean_f1(tr_pairs,tr_y)  # Uses default threshold
    tr_f1_pearsonr = compute_pearsonr_f1(tr_pairs,tr_y)  # Uses default threshold
    tr_f1_braycurtis = compute_bray_curtis_f1(tr_pairs,tr_y)  # Uses default threshold
    tr_f1_hamming = compute_hamming_f1(tr_pairs,tr_y)  # Uses default threshold
    tr_f1_jaccard = compute_jaccard_f1(tr_pairs,tr_y)  # Uses default threshold
    tr_f1_canberra = compute_canberra_f1(tr_pairs,tr_y)  # Uses default threshold

    y_pred = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
    te_acc = compute_f1(te_y, y_pred)
    te_f1_cosine = compute_cosine_f1(te_pairs,te_y)  # Uses default threshold
    te_f1_manhattan = compute_manhattan_f1(te_pairs,te_y)  # Uses default threshold
    te_f1_euclidean = compute_euclidean_f1(te_pairs,te_y)  # Uses default threshold
    te_f1_pearsonr = compute_pearsonr_f1(te_pairs,te_y)  # Uses default threshold
    te_f1_braycurtis = compute_bray_curtis_f1(te_pairs,te_y)  # Uses default threshold
    te_f1_hamming = compute_hamming_f1(te_pairs,te_y)  # Uses default threshold
    te_f1_jaccard = compute_jaccard_f1(te_pairs,te_y)  # Uses default threshold
    te_f1_canberra = compute_canberra_f1(te_pairs,te_y)  # Uses default threshold

    cvTrainscores.append(100 * tr_acc)
    cvTestscores.append(100 * te_acc)

    cvTrainscoresCosine.append(100 * tr_f1_cosine)
    cvTestscoresCosine.append(100 * te_f1_cosine)

    cvTrainscoresManhattan.append(100 * tr_f1_manhattan)
    cvTestscoresManhattan.append(100 * te_f1_manhattan)

    cvTrainscoresEuclidean.append(100 * tr_f1_euclidean)
    cvTestscoresEuclidean.append(100 * te_f1_euclidean)

    cvTrainscoresPearsonR.append(100 * tr_f1_pearsonr)
    cvTestscoresPearsonR.append(100 * te_f1_pearsonr)

    cvTrainscoresBrayCurtis.append(100 * tr_f1_braycurtis)
    cvTestscoresBrayCurtis.append(100 * te_f1_braycurtis)


    cvTrainscoresJaccard.append(100 * tr_f1_jaccard)
    cvTestscoresJaccard.append(100 * te_f1_jaccard)

    cvTrainscoresHamming.append(100 * tr_f1_hamming)
    cvTestscoresHamming.append(100 * te_f1_hamming)
    
    cvTrainscoresCanberra.append(100 * tr_f1_canberra)
    cvTestscoresCanberra.append(100 * te_f1_canberra)

    
    



n-fold: 1
(72, 439, 1)
(439, 1)


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 439, 1)]          0         
                                                                 
 reshape (Reshape)           (None, 439, 1)            0         
                                                                 
 conv1d (Conv1D)             (None, 437, 12)           48        
                                                                 
 average_pooling1d (Average  (None, 145, 12)           0         
 Pooling1D)                                                      
                                                                 
 conv1d_1 (Conv1D)           (None, 143, 8)            296       
                                                                 
 average_pooling1d_1 (Avera  (None, 47, 8)             0         
 gePooling1D)              

In [15]:
# Print final F1 Score accuracy on training and test sets based on Cross Validation
print("Siamese Cross Validation F1 Result:")
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscores), np.std(cvTrainscores)))
print("Test: %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscores), np.std(cvTestscores)))

print("Cosine Cross Validation F1 Result:")
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresCosine), np.std(cvTrainscoresCosine)))
print("Test: %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresCosine), np.std(cvTestscoresCosine)))

print("Manhahttan Cross Validation F1 Result:")
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresManhattan), np.std(cvTrainscoresManhattan)))
print("Test: %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresManhattan), np.std(cvTestscoresManhattan)))

print("Euclidean Cross Validation F1 Result:")
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresEuclidean), np.std(cvTrainscoresEuclidean)))
print("Test: %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresEuclidean), np.std(cvTestscoresEuclidean)))

print("Bray-Curtis Cross Validation F1 Result:")
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresBrayCurtis), np.std(cvTrainscoresBrayCurtis)))
print("Test: %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresBrayCurtis), np.std(cvTestscoresBrayCurtis)))

print("PearsonR Cross Validation F1 Result:")
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresPearsonR), np.std(cvTrainscoresPearsonR)))
print("Test: %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresPearsonR), np.std(cvTestscoresPearsonR)))

print("Jaccard Cross Validation F1 Result:")
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresJaccard), np.std(cvTrainscoresJaccard)))
print("Test: %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresJaccard), np.std(cvTestscoresJaccard)))

print("Hamming Cross Validation F1 Result:")
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresHamming), np.std(cvTrainscoresHamming)))
print("Test: %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresHamming), np.std(cvTestscoresHamming)))

print("Canberra Cross Validation F1 Result:")
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresCanberra), np.std(cvTrainscoresCanberra)))
print("Test: %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresCanberra), np.std(cvTestscoresCanberra)))


Siamese Cross Validation F1 Result:
Train: 94.40% (+/- 2.41%)
Test: 77.38% (+/- 15.57%)
Cosine Cross Validation F1 Result:
Train: 82.73% (+/- 2.31%)
Test: 80.33% (+/- 8.01%)
Manhahttan Cross Validation F1 Result:
Train: 66.67% (+/- 0.00%)
Test: 66.67% (+/- 0.00%)
Euclidean Cross Validation F1 Result:
Train: 66.67% (+/- 0.00%)
Test: 66.67% (+/- 0.00%)
Bray-Curtis Cross Validation F1 Result:
Train: 83.06% (+/- 0.86%)
Test: 81.22% (+/- 4.55%)
PearsonR Cross Validation F1 Result:
Train: 81.80% (+/- 0.51%)
Test: 81.22% (+/- 4.55%)
Jaccard Cross Validation F1 Result:
Train: 57.58% (+/- 2.61%)
Test: 49.23% (+/- 19.56%)
Hamming Cross Validation F1 Result:
Train: 66.67% (+/- 0.00%)
Test: 66.67% (+/- 0.00%)
Canberra Cross Validation F1 Result:
Train: 66.67% (+/- 0.00%)
Test: 66.67% (+/- 0.00%)


In [29]:
# F1 Score Results Summary and Ranking
import pandas as pd

# Collect all F1 scores
f1_results = {
    'Method': ['Siamese', 'Cosine', 'Manhattan', 'Euclidean', 'Bray-Curtis', 'PearsonR', 'Jaccard', 'Hamming', 'Canberra'],
    'Train_Mean': [np.mean(cvTrainscores), np.mean(cvTrainscoresCosine), np.mean(cvTrainscoresManhattan), 
                   np.mean(cvTrainscoresEuclidean), np.mean(cvTrainscoresBrayCurtis), np.mean(cvTrainscoresPearsonR),
                   np.mean(cvTrainscoresJaccard), np.mean(cvTrainscoresHamming), np.mean(cvTrainscoresCanberra)],
    'Train_Std': [np.std(cvTrainscores), np.std(cvTrainscoresCosine), np.std(cvTrainscoresManhattan),
                  np.std(cvTrainscoresEuclidean), np.std(cvTrainscoresBrayCurtis), np.std(cvTrainscoresPearsonR),
                  np.std(cvTrainscoresJaccard), np.std(cvTrainscoresHamming), np.std(cvTrainscoresCanberra)],
    'Test_Mean': [np.mean(cvTestscores), np.mean(cvTestscoresCosine), np.mean(cvTestscoresManhattan),
                  np.mean(cvTestscoresEuclidean), np.mean(cvTestscoresBrayCurtis), np.mean(cvTestscoresPearsonR),
                  np.mean(cvTestscoresJaccard), np.mean(cvTestscoresHamming), np.mean(cvTestscoresCanberra)],
    'Test_Std': [np.std(cvTestscores), np.std(cvTestscoresCosine), np.std(cvTestscoresManhattan),
                 np.std(cvTestscoresEuclidean), np.std(cvTestscoresBrayCurtis), np.std(cvTestscoresPearsonR),
                 np.std(cvTestscoresJaccard), np.std(cvTestscoresHamming), np.std(cvTestscoresCanberra)]
}

# Create DataFrame
df_results = pd.DataFrame(f1_results)

# Sort by Test Mean F1 Score (descending)
df_sorted = df_results.sort_values('Test_Mean', ascending=False).reset_index(drop=True)

print("="*80)
print("F1 SCORE RANKING (Sorted by Test Performance)")
print("="*80)
print(f"{'Rank':<5} {'Method':<12} {'Train Mean':<12} {'Train Std':<12} {'Test Mean':<12} {'Test Std':<12}")
print("-"*80)

for i, row in df_sorted.iterrows():
    print(f"{i+1:<5} {row['Method']:<12} {row['Train_Mean']:<12.4f} {row['Train_Std']:<12.4f} {row['Test_Mean']:<12.4f} {row['Test_Std']:<12.4f}")

print("\n" + "="*80)
print("TOP 3 METHODS BY TEST F1 SCORE:")
print("="*80)
for i in range(min(3, len(df_sorted))):
    method = df_sorted.iloc[i]['Method']
    test_mean = df_sorted.iloc[i]['Test_Mean']
    test_std = df_sorted.iloc[i]['Test_Std']
    print(f"{i+1}. {method}: {test_mean:.4f} ± {test_std:.4f}")

print("\n" + "="*80)
print("BEST PERFORMING METHOD:")
print("="*80)
best_method = df_sorted.iloc[0]
print(f"Method: {best_method['Method']}")
print(f"Test F1 Score: {best_method['Test_Mean']:.4f} ± {best_method['Test_Std']:.4f}")
print(f"Train F1 Score: {best_method['Train_Mean']:.4f} ± {best_method['Train_Std']:.4f}")


F1 SCORE RANKING (Sorted by Test Performance)
Rank  Method       Train Mean   Train Std    Test Mean    Test Std    
--------------------------------------------------------------------------------
1     PearsonR     84.3510      1.8065       81.7698      7.4289      
2     Cosine       83.2008      2.1579       81.4510      7.2389      
3     Bray-Curtis  84.7506      1.1585       81.0079      7.7594      
4     Siamese      95.3863      0.8330       78.9412      8.5787      
5     Manhattan    66.6667      0.0000       66.6667      0.0000      
6     Euclidean    66.6667      0.0000       66.6667      0.0000      
7     Hamming      66.6667      0.0000       66.6667      0.0000      
8     Canberra     66.6667      0.0000       66.6667      0.0000      
9     Jaccard      56.3202      5.0800       54.5255      12.1794     

TOP 3 METHODS BY TEST F1 SCORE:
1. PearsonR: 81.7698 ± 7.4289
2. Cosine: 81.4510 ± 7.2389
3. Bray-Curtis: 81.0079 ± 7.7594

BEST PERFORMING METHOD:
Method: Pearso

In [30]:
# Threshold Analysis Setup - Fixed for Different Metric Types
import numpy as np
import pandas as pd

# Define different threshold ranges for different metric types
# Similarity metrics (0-1 range): Cosine, PearsonR, BrayCurtis, Jaccard, Hamming
similarity_thresholds = np.arange(0, 1.05, 0.05)

# Distance metrics (0-50 range): Manhattan, Canberra  
distance_thresholds = np.arange(0, 51, 2.5)

# Euclidean distance (0-10 range)
euclidean_thresholds = np.arange(0, 10.5, 0.5)

print(f"Similarity metrics threshold values: {len(similarity_thresholds)} values (0.0-1.0)")
print(f"Distance metrics threshold values: {len(distance_thresholds)} values (0.0-50.0)")
print(f"Euclidean threshold values: {len(euclidean_thresholds)} values (0.0-10.0)")

# Results storage for all metrics
threshold_results = {
    'Cosine': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Manhattan': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Euclidean': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'PearsonR': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'BrayCurtis': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Jaccard': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Hamming': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Canberra': {'thresholds': [], 'train_f1': [], 'test_f1': []}
}

print("Threshold values and results storage initialized successfully!")


Similarity metrics threshold values: 21 values (0.0-1.0)
Distance metrics threshold values: 21 values (0.0-50.0)
Euclidean threshold values: 21 values (0.0-10.0)
Threshold values and results storage initialized successfully!


In [99]:
# Display Comprehensive Results for All Thresholds and Metrics
print("\n" + "="*100)
print("COMPREHENSIVE THRESHOLD ANALYSIS RESULTS")
print("="*100)

# Create detailed results table
results_data = []

for metric_name in threshold_results.keys():
    thresholds = threshold_results[metric_name]['thresholds']
    train_f1s = threshold_results[metric_name]['train_f1']
    test_f1s = threshold_results[metric_name]['test_f1']
    
    for i, threshold in enumerate(thresholds):
        results_data.append({
            'Metric': metric_name,
            'Threshold': threshold,
            'Train_F1': train_f1s[i],
            'Test_F1': test_f1s[i]
        })

# Create DataFrame
df_threshold_results = pd.DataFrame(results_data)

# Display results for each metric
for metric_name in threshold_results.keys():
    print(f"\n{metric_name.upper()} METRIC RESULTS:")
    print("-" * 60)
    metric_data = df_threshold_results[df_threshold_results['Metric'] == metric_name]
    
    # Find best threshold for this metric
    best_idx = metric_data['Test_F1'].idxmax()
    best_threshold = metric_data.loc[best_idx, 'Threshold']
    best_test_f1 = metric_data.loc[best_idx, 'Test_F1']
    best_train_f1 = metric_data.loc[best_idx, 'Train_F1']
    
    print(f"Best Threshold: {best_threshold:.2f}")
    print(f"Best Test F1: {best_test_f1:.4f}")
    print(f"Best Train F1: {best_train_f1:.4f}")
    
    # Show all thresholds for this metric
    print(f"\nAll Thresholds for {metric_name}:")
    print(f"{'Threshold':<12} {'Test F1':<12}")
    print("-" * 25)
    for _, row in metric_data.iterrows():
        print(f"{row['Threshold']:<12.2f} {row['Test_F1']:<12.4f}")

print("\n" + "="*100)
print("OVERALL BEST PERFORMANCE BY METRIC")
print("="*100)

# Find best performance for each metric
best_performances = []
for metric_name in threshold_results.keys():
    metric_data = df_threshold_results[df_threshold_results['Metric'] == metric_name]
    best_idx = metric_data['Test_F1'].idxmax()
    best_performances.append({
        'Metric': metric_name,
        'Best_Threshold': metric_data.loc[best_idx, 'Threshold'],
        'Best_Test_F1': metric_data.loc[best_idx, 'Test_F1'],
        'Best_Train_F1': metric_data.loc[best_idx, 'Train_F1']
    })

# Sort by best test F1 score
best_performances_df = pd.DataFrame(best_performances)
best_performances_df = best_performances_df.sort_values('Best_Test_F1', ascending=False)

print(f"{'Rank':<6} {'Metric':<12} {'Best Threshold':<15} {'Test F1':<12}")
print("-" * 50)
for i, (_, row) in enumerate(best_performances_df.iterrows()):
    print(f"{i+1:<6} {row['Metric']:<12} {row['Best_Threshold']:<15.2f} {row['Best_Test_F1']:<12.4f}")

print("\n" + "="*100)
print("SUMMARY")
print("="*100)
print(f"Total threshold values tested: {len(threshold_values)}")
print(f"Threshold range: {threshold_values[0]:.2f} to {threshold_values[-1]:.2f}")
print(f"Step size: {threshold_values[1] - threshold_values[0]:.2f}")
print(f"Total metrics tested: {len(threshold_results)}")
print(f"Best overall metric: {best_performances_df.iloc[0]['Metric']}")
print(f"Best overall threshold: {best_performances_df.iloc[0]['Best_Threshold']:.2f}")
print(f"Best overall test F1: {best_performances_df.iloc[0]['Best_Test_F1']:.4f}")



COMPREHENSIVE THRESHOLD ANALYSIS RESULTS

COSINE METRIC RESULTS:
------------------------------------------------------------
Best Threshold: 0.00
Best Test F1: 0.6667
Best Train F1: 0.6667

All Thresholds for Cosine:
Threshold    Test F1     
-------------------------
0.00         0.6667      
1.00         0.0000      
2.00         0.0000      
3.00         0.0000      
4.00         0.0000      
5.00         0.0000      
6.00         0.0000      
7.00         0.0000      
8.00         0.0000      
9.00         0.0000      
10.00        0.0000      
11.00        0.0000      
12.00        0.0000      
13.00        0.0000      
14.00        0.0000      
15.00        0.0000      
16.00        0.0000      
17.00        0.0000      
18.00        0.0000      
19.00        0.0000      
20.00        0.0000      
21.00        0.0000      
22.00        0.0000      
23.00        0.0000      
24.00        0.0000      
25.00        0.0000      
26.00        0.0000      
27.00        0.0000      
2

In [84]:
# Fixed Siamese Network Training - Single Training with Fixed Seed
print("="*80)
print("FIXED SIAMESE NETWORK TRAINING")
print("="*80)

import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import backend as K

# Set fixed random seeds for reproducibility
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

print("Random seeds set for reproducibility:")
print("• NumPy seed: 42")
print("• Python random seed: 42") 
print("• TensorFlow seed: 42")

# Clear any existing model state
K.clear_session()

# Siamese Network Training with Fixed Parameters
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cvTrainscoresSiamese = []
cvTestscoresSiamese = []

print(f"\nTraining Siamese Network with fixed parameters:")
print(f"• Epochs: {epochs}")
print(f"• Batch size: 16")
print(f"• Cross-validation: 5-fold")
print(f"• Random state: 42")

cvi = 0
for train, test in kfold.split(X, Y):
    cvi += 1
    print(f"\nFold {cvi}/5:")
    
    # Prepare data
    x_train = X[train]
    x_test = X[test]
    y_train = Y[train]
    y_test = Y[test]
    
    x_train = x_train.reshape(x_train.shape[0], 439, 1)
    x_test = x_test.reshape(x_test.shape[0], 439, 1)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    y_train = y_train.astype('float32')
    y_test = y_test.astype('float32')
    
    # Create pairs
    digit_indices = [np.where(y_train == i)[0] for i in range(num_classes)]
    tr_pairs, tr_y = create_pairs_new3(x_train, digit_indices)
    
    digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
    te_pairs, te_y = create_pairs_new3(x_test, digit_indices)
    
    # Create fresh model for each fold
    base_network = create_base_net_new2((439, 1))
    input_a = Input(shape=(439, 1))
    input_b = Input(shape=(439, 1))
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    distance = Lambda(euclid_dis, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    model = Model([input_a, input_b], distance)
    
    # Compile model
    model.compile(loss=contrastive_loss, optimizer='adam', metrics=[get_f1])
    
    # Train model
    model.fit([tr_pairs[:, 0], tr_pairs[:, 1]], tr_y,
              batch_size=16,
              epochs=epochs,
              validation_data=([te_pairs[:, 0], te_pairs[:, 1]], te_y),
              verbose=0)
    
    # Evaluate model
    y_pred_train = model.predict([tr_pairs[:, 0], tr_pairs[:, 1]])
    y_pred_test = model.predict([te_pairs[:, 0], te_pairs[:, 1]])
    
    tr_acc = compute_f1(tr_y, y_pred_train)
    te_acc = compute_f1(te_y, y_pred_test)
    
    cvTrainscoresSiamese.append(100 * tr_acc)
    cvTestscoresSiamese.append(100 * te_acc)
    
    print(f"  Train F1: {tr_acc:.4f} ({100 * tr_acc:.2f}%)")
    print(f"  Test F1:  {te_acc:.4f} ({100 * te_acc:.2f}%)")
    
    # Clear model to free memory
    del model
    K.clear_session()

print("\n" + "="*80)
print("FIXED SIAMESE NETWORK RESULTS")
print("="*80)
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresSiamese), np.std(cvTrainscoresSiamese)))
print("Test:  %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresSiamese), np.std(cvTestscoresSiamese)))

print("\nComparison with Original Results:")
print("Original - Train: 95.90% (+/- 1.29%), Test: 82.84% (+/- 9.77%)")
print("Fixed    - Train: %.2f%% (+/- %.2f%%), Test: %.2f%% (+/- %.2f%%)" % 
      (np.mean(cvTrainscoresSiamese), np.std(cvTrainscoresSiamese),
       np.mean(cvTestscoresSiamese), np.std(cvTestscoresSiamese)))


FIXED SIAMESE NETWORK TRAINING
Random seeds set for reproducibility:
• NumPy seed: 42
• Python random seed: 42
• TensorFlow seed: 42

Training Siamese Network with fixed parameters:
• Epochs: 20
• Batch size: 16
• Cross-validation: 5-fold
• Random state: 42

Fold 1/5:
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 439, 1)]          0         
                                                                 
 reshape (Reshape)           (None, 439, 1)            0         
                                                                 
 conv1d (Conv1D)             (None, 437, 12)           48        
                                                                 
 average_pooling1d (Average  (None, 145, 12)           0         
 Pooling1D)                                                      
                                                        

In [None]:
# Optimized Threshold Analysis - Distance Metrics Only (No Siamese)
print("="*80)
print("OPTIMIZED THRESHOLD ANALYSIS - DISTANCE METRICS ONLY")
print("="*80)

# Clear any existing state
K.clear_session()

# Set fixed random seeds for reproducibility
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

# Use the same cross-validation setup as before
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Results storage for distance metrics only
distance_threshold_results = {
    'Cosine': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Manhattan': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Euclidean': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'PearsonR': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'BrayCurtis': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Jaccard': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Hamming': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Canberra': {'thresholds': [], 'train_f1': [], 'test_f1': []}
}

print("Testing distance metrics only (no Siamese training):")
print(f"• Threshold values: {len(threshold_values)} (0.00 to 1.00)")
print("• Cross-validation: 5-fold with fixed seed")
print("• Focus: Distance metrics threshold optimization")

# Test each threshold value
for threshold in threshold_values:
    print(f"\nTesting threshold: {threshold:.2f}")
    print("-" * 50)
    
    # Initialize lists for this threshold
    train_scores = {'Cosine': [], 'Manhattan': [], 'Euclidean': [], 'PearsonR': [], 
                   'BrayCurtis': [], 'Jaccard': [], 'Hamming': [], 'Canberra': []}
    test_scores = {'Cosine': [], 'Manhattan': [], 'Euclidean': [], 'PearsonR': [], 
                  'BrayCurtis': [], 'Jaccard': [], 'Hamming': [], 'Canberra': []}
    
    cvi = 0
    for train, test in kfold.split(X, Y):
        cvi += 1
        print(f"  Fold {cvi}/5", end=" ")
        
        # Prepare data (same as before)
        x_train = X[train]
        x_test = X[test]
        y_train = Y[train]
        y_test = Y[test]
        
        x_train = x_train.reshape(x_train.shape[0], 439, 1)
        x_test = x_test.reshape(x_test.shape[0], 439, 1)
        x_train = x_train.astype('float32')
        x_test = x_test.astype('float32')
        y_train = y_train.astype('float32')
        y_test = y_test.astype('float32')
        
        # Create pairs
        digit_indices = [np.where(y_train == i)[0] for i in range(num_classes)]
        tr_pairs, tr_y = create_pairs_new3(x_train, digit_indices)
        
        digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
        te_pairs, te_y = create_pairs_new3(x_test, digit_indices)
        
        # Test distance metrics with current threshold (NO SIAMESE TRAINING)
        metrics_functions = {
            'Cosine': compute_cosine_f1,
            'Manhattan': compute_manhattan_f1,
            'Euclidean': compute_euclidean_f1,
            'PearsonR': compute_pearsonr_f1,
            'BrayCurtis': compute_bray_curtis_f1,
            'Jaccard': compute_jaccard_f1,
            'Hamming': compute_hamming_f1,
            'Canberra': compute_canberra_f1
        }
        
        for metric_name, metric_func in metrics_functions.items():
            # Train scores
            train_f1 = metric_func(tr_pairs, tr_y, threshold)
            train_scores[metric_name].append(train_f1)
            
            # Test scores
            test_f1 = metric_func(te_pairs, te_y, threshold)
            test_scores[metric_name].append(test_f1)
    
    # Store results for this threshold
    for metric_name in train_scores.keys():
        distance_threshold_results[metric_name]['thresholds'].append(threshold)
        distance_threshold_results[metric_name]['train_f1'].append(np.mean(train_scores[metric_name]))
        distance_threshold_results[metric_name]['test_f1'].append(np.mean(test_scores[metric_name]))
    
    print("✓")

print("\n" + "="*80)
print("Distance metrics threshold analysis completed!")
print("="*80)


OPTIMIZED THRESHOLD ANALYSIS - DISTANCE METRICS ONLY
Testing distance metrics only (no Siamese training):
• Threshold values: 21 (0.0 to 40.0)
• Cross-validation: 5-fold with fixed seed
• Focus: Distance metrics threshold optimization

Testing threshold: 0.00
--------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold: 0.05
--------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold: 0.10
--------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold: 0.15
--------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold: 0.20
--------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold: 0.25
--------------------------------------------------
  Fold 1/5   Fold 2/5   

In [96]:
# Display Final Results - Siamese + Optimized Distance Metrics
print("\n" + "="*100)
print("FINAL COMPREHENSIVE RESULTS")
print("="*100)

print("\n1. SIAMESE NETWORK (Fixed Training):")
print("-" * 50)
print("Train: %.2f%% (+/- %.2f%%)" % (np.mean(cvTrainscoresSiamese), np.std(cvTrainscoresSiamese)))
print("Test:  %.2f%% (+/- %.2f%%)" % (np.mean(cvTestscoresSiamese), np.std(cvTestscoresSiamese)))

print("\n2. DISTANCE METRICS - OPTIMAL THRESHOLDS:")
print("-" * 50)

# Create detailed results table for distance metrics
distance_results_data = []

for metric_name in distance_threshold_results.keys():
    thresholds = distance_threshold_results[metric_name]['thresholds']
    train_f1s = distance_threshold_results[metric_name]['train_f1']
    test_f1s = distance_threshold_results[metric_name]['test_f1']
    
    for i, threshold in enumerate(thresholds):
        distance_results_data.append({
            'Metric': metric_name,
            'Threshold': threshold,
            'Train_F1': train_f1s[i],
            'Test_F1': test_f1s[i]
        })

# Create DataFrame
df_distance_results = pd.DataFrame(distance_results_data)

# Find best performance for each distance metric
best_distance_performances = []
for metric_name in distance_threshold_results.keys():
    metric_data = df_distance_results[df_distance_results['Metric'] == metric_name]
    best_idx = metric_data['Test_F1'].idxmax()
    best_distance_performances.append({
        'Metric': metric_name,
        'Best_Threshold': metric_data.loc[best_idx, 'Threshold'],
        'Best_Test_F1': metric_data.loc[best_idx, 'Test_F1'],
        'Best_Train_F1': metric_data.loc[best_idx, 'Train_F1']
    })

# Sort by best test F1 score
best_distance_performances_df = pd.DataFrame(best_distance_performances)
best_distance_performances_df = best_distance_performances_df.sort_values('Best_Test_F1', ascending=False)

print(f"{'Rank':<6} {'Metric':<12} {'Best Threshold':<15} {'Test F1':<12}")
print("-" * 50)
for i, (_, row) in enumerate(best_distance_performances_df.iterrows()):
    print(f"{i+1:<6} {row['Metric']:<12} {row['Best_Threshold']:<15.2f} {row['Best_Test_F1']:<12.4f}")

print("\n3. OVERALL RANKING (Including Siamese):")
print("-" * 50)

# Add Siamese to the ranking
overall_results = []
overall_results.append({
    'Method': 'Siamese',
    'Best_Threshold': 'N/A',
    'Best_Test_F1': np.mean(cvTestscoresSiamese) / 100,  # Convert to 0-1 scale
    'Best_Train_F1': np.mean(cvTrainscoresSiamese) / 100
})

# Add distance metrics
for _, row in best_distance_performances_df.iterrows():
    overall_results.append({
        'Method': row['Metric'],
        'Best_Threshold': row['Best_Threshold'],
        'Best_Test_F1': row['Best_Test_F1'],
        'Best_Train_F1': row['Best_Train_F1']
    })

# Sort by test F1 score
overall_results_df = pd.DataFrame(overall_results)
overall_results_df = overall_results_df.sort_values('Best_Test_F1', ascending=False)

print(f"{'Rank':<6} {'Method':<12} {'Best Threshold':<15} {'Test F1':<12}")
print("-" * 50)
for i, (_, row) in enumerate(overall_results_df.iterrows()):
    threshold_str = f"{row['Best_Threshold']:.2f}" if row['Best_Threshold'] != 'N/A' else 'N/A'
    print(f"{i+1:<6} {row['Method']:<12} {threshold_str:<15} {row['Best_Test_F1']:<12.4f}")

print("\n4. SUMMARY:")
print("-" * 50)
print(f"• Siamese Network: {np.mean(cvTestscoresSiamese):.2f}% test F1 (baseline)")
print(f"• Best Distance Metric: {best_distance_performances_df.iloc[0]['Metric']} with {best_distance_performances_df.iloc[0]['Best_Threshold']:.2f} threshold")
print(f"• Best Distance F1: {best_distance_performances_df.iloc[0]['Best_Test_F1']:.4f}")
print(f"• Performance Gap: {np.mean(cvTestscoresSiamese)/100 - best_distance_performances_df.iloc[0]['Best_Test_F1']:.4f}")

print("\n" + "="*100)
print("ANALYSIS COMPLETED SUCCESSFULLY!")
print("="*100)



FINAL COMPREHENSIVE RESULTS

1. SIAMESE NETWORK (Fixed Training):
--------------------------------------------------
Train: 95.60% (+/- 1.43%)
Test:  81.62% (+/- 6.99%)

2. DISTANCE METRICS - OPTIMAL THRESHOLDS:
--------------------------------------------------
Rank   Metric       Best Threshold  Test F1     
--------------------------------------------------
1      PearsonR     0.25            0.8549      
2      Cosine       0.40            0.8421      
3      Jaccard      0.20            0.8300      
4      BrayCurtis   0.75            0.8065      
5      Hamming      0.05            0.7235      
6      Manhattan    0.00            0.6667      
7      Euclidean    0.00            0.6667      
8      Canberra     0.00            0.6667      

3. OVERALL RANKING (Including Siamese):
--------------------------------------------------
Rank   Method       Best Threshold  Test F1     
--------------------------------------------------
1      PearsonR     0.25            0.8549      
2  

In [32]:
# Comprehensive Threshold Testing for All Metrics
print("="*80)
print("COMPREHENSIVE THRESHOLD TESTING - ALL METRICS")
print("="*80)

# Set random seeds for reproducibility
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

# Clear any existing state
K.clear_session()

# Cross-validation setup
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Results storage
threshold_results = {
    'Cosine': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Manhattan': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Euclidean': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'PearsonR': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'BrayCurtis': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Jaccard': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Hamming': {'thresholds': [], 'train_f1': [], 'test_f1': []},
    'Canberra': {'thresholds': [], 'train_f1': [], 'test_f1': []}
}

# Metric functions
metrics_functions = {
    'Cosine': compute_cosine_f1,
    'Manhattan': compute_manhattan_f1,
    'Euclidean': compute_euclidean_f1,
    'PearsonR': compute_pearsonr_f1,
    'BrayCurtis': compute_bray_curtis_f1,
    'Jaccard': compute_jaccard_f1,
    'Hamming': compute_hamming_f1,
    'Canberra': compute_canberra_f1
}

print(f"Testing {len(threshold_values)} threshold values: {threshold_values[0]:.2f} to {threshold_values[-1]:.2f}")
print(f"Testing {len(metrics_functions)} metrics: {list(metrics_functions.keys())}")
print(f"Cross-validation: 5-fold")

# Test each threshold value
for threshold_idx, threshold in enumerate(threshold_values):
    print(f"\nTesting threshold {threshold_idx+1}/{len(threshold_values)}: {threshold:.2f}")
    print("-" * 60)
    
    # Initialize lists for this threshold
    train_scores = {metric: [] for metric in metrics_functions.keys()}
    test_scores = {metric: [] for metric in metrics_functions.keys()}
    
    cvi = 0
    for train, test in kfold.split(X, Y):
        cvi += 1
        print(f"  Fold {cvi}/5", end=" ")
        
        # Prepare data
        x_train = X[train]
        x_test = X[test]
        y_train = Y[train]
        y_test = Y[test]
        
        x_train = x_train.reshape(x_train.shape[0], 439, 1)
        x_test = x_test.reshape(x_test.shape[0], 439, 1)
        x_train = x_train.astype('float32')
        x_test = x_test.astype('float32')
        y_train = y_train.astype('float32')
        y_test = y_test.astype('float32')
        
        # Create pairs
        digit_indices = [np.where(y_train == i)[0] for i in range(num_classes)]
        tr_pairs, tr_y = create_pairs_new3(x_train, digit_indices)
        
        digit_indices = [np.where(y_test == i)[0] for i in range(num_classes)]
        te_pairs, te_y = create_pairs_new3(x_test, digit_indices)
        
        # Test each metric with current threshold
        for metric_name, metric_func in metrics_functions.items():
            # Train scores
            train_f1 = metric_func(tr_pairs, tr_y, threshold)
            train_scores[metric_name].append(train_f1)
            
            # Test scores
            test_f1 = metric_func(te_pairs, te_y, threshold)
            test_scores[metric_name].append(test_f1)
    
    # Store results for this threshold
    for metric_name in metrics_functions.keys():
        threshold_results[metric_name]['thresholds'].append(threshold)
        threshold_results[metric_name]['train_f1'].append(np.mean(train_scores[metric_name]))
        threshold_results[metric_name]['test_f1'].append(np.mean(test_scores[metric_name]))
    
    print("✓")

print("\n" + "="*80)
print("THRESHOLD TESTING COMPLETED!")
print("="*80)


COMPREHENSIVE THRESHOLD TESTING - ALL METRICS
Testing 21 threshold values: 0.00 to 1.00
Testing 8 metrics: ['Cosine', 'Manhattan', 'Euclidean', 'PearsonR', 'BrayCurtis', 'Jaccard', 'Hamming', 'Canberra']
Cross-validation: 5-fold

Testing threshold 1/21: 0.00
------------------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold 2/21: 0.05
------------------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold 3/21: 0.10
------------------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold 4/21: 0.15
------------------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold 5/21: 0.20
------------------------------------------------------------
  Fold 1/5   Fold 2/5   Fold 3/5   Fold 4/5   Fold 5/5 ✓

Testing threshold 6/21: 0.25
-

In [33]:
# Display Comprehensive Results for All Thresholds and Metrics
print("\n" + "="*100)
print("COMPREHENSIVE THRESHOLD ANALYSIS RESULTS")
print("="*100)

# Create detailed results table
results_data = []

for metric_name in threshold_results.keys():
    thresholds = threshold_results[metric_name]['thresholds']
    train_f1s = threshold_results[metric_name]['train_f1']
    test_f1s = threshold_results[metric_name]['test_f1']
    
    for i, threshold in enumerate(thresholds):
        results_data.append({
            'Metric': metric_name,
            'Threshold': threshold,
            'Train_F1': train_f1s[i],
            'Test_F1': test_f1s[i]
        })

# Create DataFrame
df_threshold_results = pd.DataFrame(results_data)

# Display results for each metric
print("\nDETAILED RESULTS BY METRIC:")
print("="*100)

for metric_name in threshold_results.keys():
    print(f"\n{metric_name.upper()} METRIC RESULTS:")
    print("-" * 80)
    metric_data = df_threshold_results[df_threshold_results['Metric'] == metric_name]
    
    # Find best threshold for this metric
    best_idx = metric_data['Test_F1'].idxmax()
    best_threshold = metric_data.loc[best_idx, 'Threshold']
    best_test_f1 = metric_data.loc[best_idx, 'Test_F1']
    best_train_f1 = metric_data.loc[best_idx, 'Train_F1']
    
    print(f"Best Threshold: {best_threshold:.2f}")
    print(f"Best Test F1: {best_test_f1:.4f}")
    print(f"Best Train F1: {best_train_f1:.4f}")
    
    # Show all thresholds for this metric
    print(f"\nAll Thresholds for {metric_name}:")
    print(f"{'Threshold':<12} {'Train F1':<12} {'Test F1':<12}")
    print("-" * 40)
    for _, row in metric_data.iterrows():
        print(f"{row['Threshold']:<12.2f} {row['Train_F1']:<12.4f} {row['Test_F1']:<12.4f}")

print("\n" + "="*100)
print("OVERALL BEST PERFORMANCE BY METRIC")
print("="*100)

# Find best performance for each metric
best_performances = []
for metric_name in threshold_results.keys():
    metric_data = df_threshold_results[df_threshold_results['Metric'] == metric_name]
    best_idx = metric_data['Test_F1'].idxmax()
    best_performances.append({
        'Metric': metric_name,
        'Best_Threshold': metric_data.loc[best_idx, 'Threshold'],
        'Best_Test_F1': metric_data.loc[best_idx, 'Test_F1'],
        'Best_Train_F1': metric_data.loc[best_idx, 'Train_F1']
    })

# Sort by best test F1 score
best_performances_df = pd.DataFrame(best_performances)
best_performances_df = best_performances_df.sort_values('Best_Test_F1', ascending=False)

print(f"{'Rank':<6} {'Metric':<12} {'Best Threshold':<15} {'Test F1':<12} {'Train F1':<12}")
print("-" * 70)
for i, (_, row) in enumerate(best_performances_df.iterrows()):
    print(f"{i+1:<6} {row['Metric']:<12} {row['Best_Threshold']:<15.2f} {row['Best_Test_F1']:<12.4f} {row['Best_Train_F1']:<12.4f}")

print("\n" + "="*100)
print("SUMMARY")
print("="*100)
print(f"Total threshold values tested: {len(threshold_values)}")
print(f"Threshold range: {threshold_values[0]:.2f} to {threshold_values[-1]:.2f}")
print(f"Step size: {threshold_values[1] - threshold_values[0]:.2f}")
print(f"Total metrics tested: {len(threshold_results)}")
print(f"Best overall metric: {best_performances_df.iloc[0]['Metric']}")
print(f"Best overall threshold: {best_performances_df.iloc[0]['Best_Threshold']:.2f}")
print(f"Best overall test F1: {best_performances_df.iloc[0]['Best_Test_F1']:.4f}")

print("\n" + "="*100)
print("ANALYSIS COMPLETED SUCCESSFULLY!")
print("="*100)



COMPREHENSIVE THRESHOLD ANALYSIS RESULTS

DETAILED RESULTS BY METRIC:

COSINE METRIC RESULTS:
--------------------------------------------------------------------------------
Best Threshold: 0.40
Best Test F1: 0.8421
Best Train F1: 0.8200

All Thresholds for Cosine:
Threshold    Train F1     Test F1     
----------------------------------------
0.00         0.6667       0.6667      
0.05         0.6790       0.6773      
0.10         0.7440       0.7636      
0.15         0.7866       0.7720      
0.20         0.8172       0.8203      
0.25         0.8245       0.8379      
0.30         0.8282       0.8183      
0.35         0.8127       0.8260      
0.40         0.8200       0.8421      
0.45         0.7765       0.7776      
0.50         0.7486       0.6712      
0.55         0.6824       0.5888      
0.60         0.5475       0.4548      
0.65         0.3809       0.2194      
0.70         0.3304       0.2218      
0.75         0.2032       0.0800      
0.80         0.1248       0.