# Siamese Model

## Upload the Data

### Import supporting libraries

In [1]:
import LoadData
import pandas as pd
from sklearn.model_selection import train_test_split

### Connecting to the Database

In [2]:
# Location of the benchmark database file, given relative to the working directory.
DBLoc = "./data/benchmark.db"
# Open the connection through the project's LoadData helper; the handle is
# passed to the data-loading call in the next section.
conn = LoadData.connectToDB(DBLoc)

Connecting to ./data/benchmark.db .....
Connected to ./data/benchmark.db


### Open the Dataset and Dataloader

In [3]:
# Pull every encoding pair from the database, then turn both serialized
# encoding columns back into their deserialized form via the LoadData helper.
data = LoadData.LoadAllEncodings(conn)
for encoding_column in ('Encoding1', 'Encoding2'):
    data[encoding_column] = data[encoding_column].apply(LoadData.deserialize_encoding)
data.head(10)

Unnamed: 0,Encoding1,Encoding2,AlignmentScore
0,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[2.057203, -0.320799, -1.235001, 2.8408, -0.77...",0.0
1,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[0.219529, 1.371923, -0.711086, 2.675397, -0.4...",0.0
2,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[-0.13575, -0.002365, 0.059046, 0.083796, 0.06...",0.0
3,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[1.703211, 1.607478, -1.28063, 2.984896, -1.82...",0.0
4,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[1.622336, 1.383203, -1.722515, 2.852858, -1.1...",0.0
5,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[0.565803, 1.289719, -0.778099, 0.875891, -0.8...",0.0
6,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[8.166027, -3.384297, -4.185069, 17.54004, -6....",0.0
7,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[1.068295, -0.242165, -0.632848, 1.914043, -0....",0.0
8,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[0.989081, 0.353823, -0.876116, 1.762983, -0.8...",0.0
9,"[5.889621, 1.421312, -1.784085, 8.756901, -2.4...","[1.861915, -0.115179, -0.672746, 4.931495, -1....",0.0


In [4]:
# Partition the rows by alignment score: exactly 0, exactly 1, and every
# remaining value (neither 0 nor 1).
scores = data['AlignmentScore']
is_zero = scores == 0
is_one = scores == 1

df_zero = data[is_zero]
df_one = data[is_one]
df_non_zero = data[~is_zero & ~is_one]

# Report the size of each group and check that together they cover all rows.
n_zero, n_one, n_non_zero = len(df_zero), len(df_one), len(df_non_zero)
print(f"Number of zero scores: {n_zero}")
print(f"Number of one scores: {n_one}")
print(f"Number of non-zero scores: {n_non_zero}")
print(f"Total number of scores: {n_zero + n_one + n_non_zero}")
print(f"Actual number of scores: {len(data)}")

Number of zero scores: 203833
Number of one scores: 331
Number of non-zero scores: 13924
Total number of scores: 218088
Actual number of scores: 218088


### Split Data into Training and Testing Dataset
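Given that we want to predict which functions have the largest alignment scores (and are therefore worth merging), the data should ideally be split by function, so that we can try to predict the function each one is supposed to align with. **Note:** the cell below currently splits each alignment-score group (zeros, ones, and all other scores) separately rather than splitting by function.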

In [5]:
testing_percentage = 0.2

# Split each score group (zeros, ones, all other scores) on its own, using the
# same test fraction and seed, so that the training and testing sides both
# receive the same proportion of every group.
split_options = {"test_size": testing_percentage, "random_state": 42}
training_zero, testing_zero = train_test_split(df_zero, **split_options)
training_one, testing_one = train_test_split(df_one, **split_options)
training_non_zero, testing_non_zero = train_test_split(df_non_zero, **split_options)

In [6]:
# Recombine the per-group splits into one training frame and one testing frame.
#
# pd.concat stacks the groups back-to-back (all zeros, then all ones, then the
# remaining scores). Anything that slices the training data by position would
# then see one group at a time: Keras' `validation_split`, for example, takes
# the LAST fraction of the samples before shuffling, which here would put every
# non-zero score into validation and leave only zeros for training. The training
# rows are therefore shuffled with a fixed seed; the original index labels are
# kept so rows can still be traced back to `data`.
training_set = pd.concat([training_zero, training_one, training_non_zero]).sample(frac=1, random_state=42)
testing_set = pd.concat([testing_zero, testing_one, testing_non_zero])
print(training_set.shape)
print(testing_set.shape)

(174469, 3)
(43619, 3)


## Siamese Model

### PyTorch

In [7]:
import torch
from torch.utils.data import DataLoader

In [8]:
# Report whether PyTorch can see a CUDA device, and which one is current.
if not torch.cuda.is_available():
    print("GPU is not available, using CPU")
else:
    active_gpu = torch.cuda.current_device()
    print(f"Using GPU: {torch.cuda.get_device_name(active_gpu)}")

Using GPU: NVIDIA GeForce RTX 3050 Ti Laptop GPU


#### Load Training Data into DataLoader

In [10]:
# Wrap the training frame in the project's Dataset class and batch it.
# DataLoader defaults to batch_size=1 and shuffle=False, which would yield one
# sample per step in the order the rows are stored; use mini-batches of 32
# (the same size as the Keras run) and reshuffle the samples every epoch.
training_dataset = LoadData.EncodingDataset(training_set)
training_dataloader = DataLoader(training_dataset, batch_size=32, shuffle=True)

#### Train Model

In [None]:
# Placeholder training loop: iterates over every batch the DataLoader yields
# but does no work with it yet.
# TODO: add the forward pass, loss computation and optimiser step.
for batch in training_dataloader:
    pass

### Tensorflow

In [7]:
import tensorflow as tf
import numpy as np

2025-02-10 13:59:21.012516: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-10 13:59:21.020251: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1739195961.029006   27863 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1739195961.031590   27863 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-10 13:59:21.041386: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [8]:
# Ask TensorFlow once for the GPUs it can see, then report the result.
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    print("TensorFlow is using GPU")
    print(f"GPU: {gpu_devices}")
else:
    print("TensorFlow is not using GPU")

TensorFlow is using GPU
GPU: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


#### Load Training Data

In [9]:
# Take copies of the split frames for the TensorFlow section, so the cells below
# operate on their own DataFrame objects rather than on the originals.
# NOTE(review): presumably the per-row encoding arrays held in the object columns
# are still shared with the originals — confirm before mutating them in place.
training_set_copy = training_set.copy()
testing_set_copy = testing_set.copy()

In [18]:
# Turn the training frame into plain ndarrays: each encoding column is stacked
# row-wise into one 2-D array, and the labels become a 1-D float array.
Encoding1, Encoding2 = (
    np.vstack(training_set_copy[encoding_column].to_numpy())
    for encoding_column in ("Encoding1", "Encoding2")
)
AlignmentScore = training_set_copy["AlignmentScore"].to_numpy(dtype=float)

#### Train Model

In [None]:
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

# Shared feature extractor used by both branches of the Siamese model
def create_base_network(input_shape):
    """Build the dense feature-extraction network.

    The network is a stack of fully connected layers: two ReLU hidden layers
    (256 and 128 units) followed by a 64-unit sigmoid layer whose output is
    the feature vector.

    Args:
        input_shape: Shape tuple handed to ``Input(shape=...)`` for one sample.

    Returns:
        A Keras ``Model`` mapping the input tensor to the 64-value feature vector.
    """
    inputs = Input(shape=input_shape)
    hidden = inputs
    for units in (256, 128):
        hidden = Dense(units, activation='relu')(hidden)
    features = Dense(64, activation='sigmoid')(hidden)  # Feature vector
    return Model(inputs, features)

In [17]:
# Shape of a single encoding fed to each branch.
# NOTE(review): assumes every encoding is a flat vector of 300 values — confirm
# against the width of the Encoding1 / Encoding2 arrays.
input_shape = (300,)

# Build ONE base network; the same instance is applied to both inputs below,
# so the two branches share their weights.
base_network = create_base_network(input_shape)

# Siamese network inputs: one placeholder per side of the pair
input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)

# Generate embeddings by running both inputs through the shared base network
embedding_a = base_network(input_a)
embedding_b = base_network(input_b)

# Element-wise absolute difference of the two embeddings. The components are
# NOT summed, so the result has the same shape as one embedding.
def l1_distance(vectors):
    """Return the element-wise absolute difference |x - y| of a tensor pair.

    Args:
        vectors: Two-element sequence ``(x, y)`` of tensors.

    Returns:
        Tensor holding ``abs(x - y)`` for every element.
    """
    x, y = vectors
    return K.abs(x - y)

distance = Lambda(l1_distance)([embedding_a, embedding_b])

# Output layer for similarity score: a single sigmoid unit, so the prediction
# lies in the 0 to 1 range
output = Dense(1, activation='sigmoid')(distance)

# Define the Siamese model: two inputs in, one score out
siamese_model = Model(inputs=[input_a, input_b], outputs=output)

# Compile the model: mean-squared-error loss, Adam optimizer, and mean absolute
# error reported as an additional metric
siamese_model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])

# Model summary
siamese_model.summary()


In [22]:
# Train on the encoding pairs against their alignment scores.
# Keras carves the validation slice off the END of the arrays (before any
# shuffling), so the inputs need to be in random order for it to be representative.
history = siamese_model.fit(
    x=[Encoding1, Encoding2],
    y=AlignmentScore,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)


Epoch 1/10


E0000 00:00:1739198476.606652   28009 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
E0000 00:00:1739198476.622913   28009 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
2025-02-10 14:41:16.625743: W tensorflow/core/framework/op_kernel.cc:1841] OP_REQUIRES failed at xla_ops.cc:577 : FAILED_PRECONDITION: DNN library initialization failed. Look at the er

FailedPreconditionError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/chuongg3/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/chuongg3/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/chuongg3/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/chuongg3/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/chuongg3/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/chuongg3/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/chuongg3/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/chuongg3/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/chuongg3/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/chuongg3/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/chuongg3/.local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/chuongg3/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/home/chuongg3/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/home/chuongg3/.local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/chuongg3/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/home/chuongg3/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/home/chuongg3/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_27863/534955266.py", line 1, in <module>

  File "/home/chuongg3/.local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/chuongg3/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit

  File "/home/chuongg3/.local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

DNN library initialization failed. Look at the errors above for more details.
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_one_step_on_iterator_3685]