# Functional Clustering

## Importing modules

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import itertools
from collections import Counter
import math

## Finding top binders from dataframes

In [25]:
#Function to find the top binders
def top_binders(peptides, ranks, threshold) :
    if len(peptides) != len(ranks) :
        raise ValueError("Length of peptide names and binding scores must be equal")
    
    significant_binders = []
    
    for i in range(len(peptides)) :
        if ranks[i] <= threshold :
            significant_binders.append((peptides[i], ranks[i]))
    
    return significant_binders

## Parsing dataframes

In [35]:
allele_names = pd.read_csv("alleles.csv")
viruses = pd.read_csv("viruses.csv")
significant_binders_dictionary = {}

threshold = 1  #Threshold for significance in %

for allele in allele_names :
    significant_binders_dictionary[allele] = {}
    dataframe = pd.read_csv('Dataframes/' + allele + '_df.csv').groupby("Virus")
    for virus in viruses :
        significant_binders_dictionary[allele][virus] = list(zip(*top_binders(list(dataframe.get_group(virus)["Peptide"]), list(dataframe.get_group(virus)["Computed Rank"]), threshold)))[0]

## Cosine similarities

In [36]:
#Compute the cosine similarity between two lists - function from https://stackoverflow.com/questions/14720324/compute-the-similarity-between-two-lists
def counter_cosine_similarity(c1, c2):
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    return dotprod / (magA * magB)

cosine_similarity_df = pd.DataFrame(columns=["Allele 1", "Allele 2", "Virus", "Cosine Similarity"])

#Compute the cosine similarity between each allele combination for each virus
for allele_pair in itertools.combinations(allele_names, 2) :
    for virus in viruses :
        cosine_similarity_df.loc[len(cosine_similarity_df)] = [allele_pair[0], allele_pair[1], virus, counter_cosine_similarity(Counter(significant_binders_dictionary[allele_pair[0]][virus]), Counter(significant_binders_dictionary[allele_pair[1]][virus]))]

## Functional Distance

In [34]:
grouped_df = cosine_similarity_df.groupby(["Allele 1", "Allele 2"])["Cosine Similarity"].agg("mean")
grouped_df = grouped_df.reset_index()
grouped_df["Functional Distance"] = grouped_df["Cosine Similarity"].apply(lambda x: 1 - x)
functional_distance_df = grouped_df.drop("Cosine Similarity", axis=1)

print(functional_distance_df)

#Create distance matrix (code from https://stackoverflow.com/questions/69148116/convert-long-form-dataframe-of-pairwise-distances-to-distance-matrix-in-python?noredirect=1&lq=1)
idx = sorted(set(functional_distance_df["Allele 1"]).union(functional_distance_df["Allele 2"]))
distance_matrix = (functional_distance_df.pivot(index='Allele 1', columns='Allele 2', values='Functional Distance')
   .reindex(index=idx, columns=idx)
   .fillna(0, downcast='infer')
   .pipe(lambda x: x+x.values.T)
 )

distance_matrix.to_csv("distance_matrix_05.csv")

            Allele 1            Allele 2  Functional Distance
0    Gogo_A_01_01_01        Gogo_A_02_01             0.842882
1    Gogo_A_01_01_01  Gogo_A_04_01_01_01             0.815535
2    Gogo_A_01_01_01     Gogo_A_07_01_01             0.582014
3    Gogo_A_01_01_01         HLA_A_01_01             0.974572
4    Gogo_A_01_01_01         HLA_A_02_01             0.995075
..               ...                 ...                  ...
430     Patr_A_08_01        Patr_A_01_01             0.998794
431     Patr_A_08_01        Patr_A_03_01             0.998569
432     Patr_A_08_01        Patr_A_04_01             0.976414
433     Patr_A_08_01        Patr_A_06_01             0.874827
434     Patr_A_08_01        Patr_A_07_01             0.394952

[435 rows x 3 columns]


## Functional Distances Per Virus

In [37]:
grouped_df = cosine_similarity_df.groupby(["Virus"])[["Allele 1", "Allele 2", "Cosine Similarity"]]
for virus in viruses :
    df = grouped_df.get_group(virus)
    df["Functional Distance"] = df["Cosine Similarity"].apply(lambda x: 1 - x)
    functional_distance_df = df.drop("Cosine Similarity", axis=1)
    print(functional_distance_df)
    
    #Create distance matrix (code from https://stackoverflow.com/questions/69148116/convert-long-form-dataframe-of-pairwise-distances-to-distance-matrix-in-python?noredirect=1&lq=1)
    idx = sorted(set(functional_distance_df["Allele 1"]).union(functional_distance_df["Allele 2"]))
    distance_matrix = (functional_distance_df.pivot(index='Allele 1', columns='Allele 2', values='Functional Distance')
       .reindex(index=idx, columns=idx)
       .fillna(0, downcast='infer')
       .pipe(lambda x: x+x.values.T)
     )
    distance_matrix.to_csv("distance_matrix_"+virus+".csv")

             Allele 1            Allele 2  Functional Distance
0     Gogo_A_01_01_01     Gogo_A_07_01_01             0.594632
17    Gogo_A_01_01_01        Gogo_A_02_01             0.808512
34    Gogo_A_01_01_01  Gogo_A_04_01_01_01             0.807238
51    Gogo_A_01_01_01         HLA_A_33_01             0.282215
68    Gogo_A_01_01_01         HLA_A_31_01             0.687874
...               ...                 ...                  ...
7310     Patr_A_01_01         HLA_B_07_02             1.000000
7327     Patr_A_01_01        Patr_A_06_01             0.911583
7344     Patr_A_03_01         HLA_B_07_02             1.000000
7361     Patr_A_03_01        Patr_A_06_01             0.852975
7378      HLA_B_07_02        Patr_A_06_01             0.996320

[435 rows x 3 columns]
             Allele 1            Allele 2  Functional Distance
1     Gogo_A_01_01_01     Gogo_A_07_01_01             0.552278
18    Gogo_A_01_01_01        Gogo_A_02_01             0.666609
35    Gogo_A_01_01_01  Gogo_A_0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Functional Distance"] = df["Cosine Similarity"].apply(lambda x: 1 - x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Functional Distance"] = df["Cosine Similarity"].apply(lambda x: 1 - x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Functional Distance"] = df["Cosine Similarity"].app