In [1]:
import warnings

import numpy as np
import pandas as pd

In [4]:
# Load the Atchley table from disk. The path is relative to the notebook's
# working directory, so the kernel must be started from the expected folder.
atchley_df = pd.read_csv("../files/atchley.csv")

# Map amino acids to their feature vectors:
# index the table by the 'amino.acid' column, transpose so each amino acid
# becomes a column, then dump every column as a plain list. The result is
# {amino acid: [values of the remaining columns for that row]}.
# NOTE(review): presumably one-letter residue codes mapped to the Atchley
# factors -- confirm against the CSV contents.
atchley_dict = atchley_df.set_index('amino.acid').T.to_dict('list')

In [8]:

# Function to calculate the Poincaré distance in hyperbolic space
def poincare_distance(u, v):
    """Return the Poincaré-ball distance between two points.

    Evaluates

        d(u, v) = arccosh(1 + 2 * ||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2)))

    The Poincaré ball model only contains points whose Euclidean norm is
    strictly below 1. For any other input the expression above is still
    evaluated (so existing callers keep getting the same numbers), but the
    result is not a hyperbolic distance: it is an arbitrary finite value when
    both points lie outside the ball, NaN when exactly one does, and inf/NaN
    when a point sits exactly on the boundary. A RuntimeWarning is emitted in
    those cases so the problem is no longer silent.

    Parameters
    ----------
    u, v : array-like of numbers
        Coordinates of the two points. They are combined element-wise, so
        they should have the same shape.

    Returns
    -------
    numpy.float64
        The value of the formula above.

    Warns
    -----
    RuntimeWarning
        If ``||u||^2 >= 1`` or ``||v||^2 >= 1``, i.e. a point is not inside
        the open unit ball. Rescale the embeddings into the ball to obtain
        meaningful distances.
    """
    # Convert vectors to numpy arrays (no copy when they already are arrays)
    u = np.asarray(u)
    v = np.asarray(v)

    # Squared Euclidean distance between u and v
    euclidean_dist_sq = np.sum((u - v) ** 2)

    # Squared norms of u and v
    norm_u_sq = np.sum(u ** 2)
    norm_v_sq = np.sum(v ** 2)

    # The formula is only a metric inside the open unit ball; flag anything
    # else instead of silently returning a meaningless number or NaN.
    if norm_u_sq >= 1 or norm_v_sq >= 1:
        warnings.warn(
            "poincare_distance: input lies on or outside the open unit ball "
            f"(||u||^2={float(norm_u_sq):.4g}, ||v||^2={float(norm_v_sq):.4g}); "
            "the result is not a valid hyperbolic distance. "
            "Rescale the vectors so that their norm is < 1.",
            RuntimeWarning,
            stacklevel=2,  # point at the caller's line, not at this helper
        )

    # Compute the Poincaré distance
    dist = np.arccosh(1 + 2 * euclidean_dist_sq / ((1 - norm_u_sq) * (1 - norm_v_sq)))

    return dist

# Position-wise sequence distance built on the Poincaré metric
def hyperbolic_sequence_distance(seq1, seq2, atchley_dict):
    """Sum the Poincaré distances of aligned residue pairs.

    The two sequences are walked in lockstep; iteration stops at the end of
    the shorter one, so trailing residues of the longer sequence are ignored.
    A position contributes only when both of its residues are keys of
    ``atchley_dict``; every other position is skipped.

    Parameters
    ----------
    seq1, seq2 : iterable of hashable
        The sequences to compare, e.g. strings of residue codes.
    atchley_dict : mapping
        Residue -> feature vector, as accepted by ``poincare_distance``.

    Returns
    -------
    number
        The accumulated distance, or the integer ``0`` when no position
        contributed.
    """
    return sum(
        poincare_distance(atchley_dict[left], atchley_dict[right])
        for left, right in zip(seq1, seq2)
        if left in atchley_dict and right in atchley_dict
    )

# Two toy sequences that differ only in their second residue
seq1, seq2 = "AA", "AC"

# Accumulate the per-position Poincaré distances and report the total
distance = hyperbolic_sequence_distance(seq1, seq2, atchley_dict)
print(f"Hyperbolic Distance between the sequences: {distance}")


Hyperbolic Distance between the sequences: 1.6841664653593127
