Jupyter notebook (.ipynb) used to conduct hierarchical cluster analysis of the heptamer sequences surrounding cN occurrences (positions -3 to +3). The original analysis was conducted in Deepnote, so reusing this code on other platforms (e.g. a local installation or Google Colab) may require adjusting the package imports.

In [None]:
# Filepath to the input .xlsx file, which should contain a list of heptamer cN
# sequences. The spreadsheet must include a column of the consensus sequences,
# the first row of which should be the column name.
INPUT_FILEPATH = ""
# Name of the column containing the list of heptamer cN sequences in the
# input spreadsheet.
COLUMN_NAME = ""

# Filepath to the output file; the cluster assignments are written there with
# DataFrame.to_excel at the end of the notebook.
OUTPUT_FILEPATH = ""

In [None]:
# Sources:
# https://www.bioinformatics.org/sms2/ident_sim.html
# https://github.com/etlioglu/ident_sim/blob/master/ident_sim.py

# Amino-acid similarity groups (one-letter codes). Two different residues
# count as "similar" when both belong to the same group.
aliphatic_aa = "GAVLI"
aromatic_aa = "FYW"
thiol_aa = "CM"
alcohol_aa = "ST"
positive_aa = "KRH"
negative_aa = "DENQ"
# Proline forms a group of its own, so it can only match itself (which is
# already covered by the identity check in find_alignment).
kinked_aa = "P"


def find_alignment(sequence1, sequence2, positions=(4, 5)):
    """Score identity and similarity of two sequences at selected positions.

    Args:
        sequence1: First sequence (indexable by position, e.g. a heptamer
            string spanning -3 to +3).
        sequence2: Second sequence, compared position by position with the
            first one.
        positions: Zero-based indices to compare. The default ``(4, 5)``
            corresponds to the N + 1 and N + 2 positions of a heptamer whose
            centre (index 3) is N.

    Returns:
        A tuple ``(fraction_identical, fraction_similar)``. Each value is the
        count of matching positions divided by the number of analysed
        positions, i.e. a fraction between 0 and 1 (not a percentage).
        Identical residues also count as similar.

    Raises:
        ValueError: If ``positions`` is empty.
        IndexError: If a sequence is shorter than a requested position.
    """
    positions = tuple(positions)
    if not positions:
        raise ValueError("positions must contain at least one index")

    # kinked_aa is deliberately left out: a single-member group can never
    # relate two *different* residues.
    similarity_groups = (
        aliphatic_aa,
        aromatic_aa,
        thiol_aa,
        alcohol_aa,
        positive_aa,
        negative_aa,
    )

    # Count of amino acids that are identical or similar in the analysed positions
    identical_count = 0
    similar_count = 0

    for i in positions:
        residue1 = sequence1[i]
        residue2 = sequence2[i]

        if residue1 == residue2:
            identical_count += 1
            similar_count += 1
        elif any(residue1 in group and residue2 in group
                 for group in similarity_groups):
            similar_count += 1

    # Normalise by the number of analysed positions (2 for the default).
    analysed = len(positions)
    fraction_identical = identical_count / analysed
    fraction_similar = similar_count / analysed

    return fraction_identical, fraction_similar

def find_percent_similarity(sequence1, sequence2):
    """Return only the similarity score that find_alignment computes."""
    _, similarity_score = find_alignment(sequence1, sequence2)
    return similarity_score

def find_percent_identical(sequence1, sequence2):
    """Return only the identity score that find_alignment computes."""
    identity_score, _ = find_alignment(sequence1, sequence2)
    return identity_score

In [None]:
import pandas as pd

# Load the input spreadsheet into a DataFrame.
df = pd.read_excel(INPUT_FILEPATH)

# Bare expression as the last line of the cell: in a notebook this renders the
# DataFrame (presumably as a quick visual check of the loaded data).
df

In [None]:
# Source: https://gist.github.com/codehacken/8b9316e025beeabb082dda4d0654a6fa

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
import numpy as np

# Pull the sequences out of the configured column as a plain list, dropping
# empty (NaN) cells. distance_similar_identical looks sequences up in this
# list by index.
sequences = df[COLUMN_NAME].dropna().to_list()

def distance_similar_identical(x, y):
    """Distance between two sequences referenced by their index in `sequences`.

    `x` and `y` are one-element rows whose first entry is the position of a
    sequence in the module-level `sequences` list. The distance is one minus
    the product of the similarity and identity scores of the two sequences.
    """
    first = sequences[int(x[0])]
    second = sequences[int(y[0])]
    identity_score, similarity_score = find_alignment(first, second)
    return 1 - similarity_score * identity_score

# Represent every sequence by its index in `sequences`, arranged as a
# single-column 2-D array (one row per sequence). The custom metric receives
# these rows and maps the index back to the actual sequence string.
X = np.arange(len(sequences)).reshape(-1, 1)

# Calculate the pairwise distance between all sequences with the custom metric.
distance_matrix_similar_identical = pairwise_distances(X, X, metric=distance_similar_identical)

In [None]:
# Perform agglomerative clustering.
# The distances are precomputed, so the estimator is told to treat its input
# as a distance matrix rather than as feature vectors.
# Use an 'average' linkage. Use any other apart from 'ward'.
#
# scikit-learn renamed the `affinity` argument to `metric` (`affinity` was
# deprecated in 1.2 and removed in 1.4). Try the current name first and fall
# back to the old one so the cell runs on both old and new releases.
try:
    agg = AgglomerativeClustering(n_clusters=3, metric='precomputed',
                                  linkage='average')
except TypeError:
    # Older scikit-learn: `metric` is not an accepted keyword yet.
    agg = AgglomerativeClustering(n_clusters=3, affinity='precomputed',
                                  linkage='average')

# Cluster label for every sequence, in the same order as `sequences`.
predictions_similar_identical = agg.fit_predict(distance_matrix_similar_identical)

assignment_dict = {
    'sequence': sequences,
    'assignment_similar_identical': predictions_similar_identical,
}

# Write one row per sequence with its cluster assignment to the output file.
clustering_assignments = pd.DataFrame(assignment_dict)
clustering_assignments.to_excel(OUTPUT_FILEPATH)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=08806ca9-3319-4bb4-9ddc-71a137575411' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>