In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.gridspec as gridspec
from matplotlib.colors import LinearSegmentedColormap

#Define some functions
def sequence_diversity(group):
    """Return the mean pairwise diversity of the sequences in ``group['pred']``.

    For every ordered pair ``(i, j)`` of sequences that are not equal as
    strings, the fraction of matching positions (counted over the zipped
    overlap and divided by ``len(i)``) is computed and ``1 - fraction`` is
    recorded. The result is the mean of those values.

    Parameters
    ----------
    group : DataFrame-like
        Anything for which ``group['pred'].tolist()`` yields the sequences.

    Returns
    -------
    float
        Mean pairwise diversity, or ``0.0`` when no two sequences differ
        (all identical, a single sequence, or no sequences at all).
    """
    sequences = group['pred'].tolist()
    pairwise_diversity = [
        1 - sum(1 for x, z in zip(i, j) if x == z) / len(i)
        for i in sequences
        for j in sequences
        # Comparison is by value, so self-pairs AND duplicated sequences are skipped.
        if i != j
    ]
    if not pairwise_diversity:
        # No differing pair: return directly instead of falling through to
        # sum()/len(), which cannot operate on a scalar placeholder.
        return 0.0
    return sum(pairwise_diversity) / len(pairwise_diversity)
# Per-position diversity profile; its output is used as the colour values when plotting.
def pairwise_div(group):
    """Return the per-position diversity of the sequences in ``group['pred']``.

    For every ordered pair ``(i, j)`` of sequences that are not equal as
    strings, a 0/1 mismatch mask is built over the zipped positions. The
    masks are then averaged column-wise, giving for each position the
    fraction of compared pairs that disagree there.

    Parameters
    ----------
    group : DataFrame-like
        Anything for which ``group['pred'].tolist()`` yields the sequences.

    Returns
    -------
    list of float
        One value per position. When no two sequences differ, a list of
        zeros with the length of the first sequence is returned (an empty
        list if there are no sequences), so callers can always index the
        result per position.
    """
    sequences = group['pred'].tolist()
    mismatch_masks = [
        [0 if x == z else 1 for x, z in zip(i, j)]
        for i in sequences
        for j in sequences
        # Comparison is by value, so self-pairs AND duplicated sequences are skipped.
        if i != j
    ]
    if not mismatch_masks:
        # All sequences identical (or fewer than two): zero diversity at every
        # position. Returned as a list, not a scalar, because the plotting code
        # indexes this value per residue.
        return [0.0] * len(sequences[0]) if sequences else []
    # zip(*masks) transposes pair-major masks into position-major columns.
    return [sum(column) / len(column) for column in zip(*mismatch_masks)]

def plot_sequence(sequences:list, color_values:list, model: str, name:str):
    """Draw sequences as rows of coloured letters above a diversity step plot.

    Each sequence gets its own row; every letter is drawn in a box whose face
    colour is taken from a red-to-white colormap evaluated at
    ``color_values[i][j]``. Below the rows, a step plot shows the column-wise
    mean of ``color_values``.

    Parameters
    ----------
    sequences : list
        Sequences to draw, one row each; iterated character by character.
    color_values : list
        ``color_values[i][j]`` is the scalar passed to the colormap for
        letter ``j`` of sequence ``i``.
    model : str
        Currently unused; kept so existing callers keep working.
    name : str
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    letter_size = 30
    gap_between_letters = 0.033
    n_rows = len(sequences)

    # A bare figure is enough: every axes is placed explicitly from the
    # GridSpec below. Creating a plt.subplots() grid here would leave a second,
    # unused set of axes behind the real ones (and fails for zero rows).
    fig = plt.figure(figsize=(16, 5))
    gs = gridspec.GridSpec(n_rows + 1, 1, wspace=0, hspace=0,
                           height_ratios=[1] * n_rows + [2.5])
    colors = ["darkred", 'red', "#FFECEC", "#FFCCCC", "#FF6666", 'white']
    cmap = LinearSegmentedColormap.from_list('my_cmap', colors)

    for i, seq in enumerate(sequences):
        ax = fig.add_subplot(gs[i])
        for j, aa in enumerate(seq):
            ax.text(j * gap_between_letters, 0, aa, ha="center", va="center",
                    fontsize=letter_size,
                    bbox=dict(facecolor=cmap(color_values[i][j], alpha=0.4), edgecolor="white"))
        ax.axis('off')

    # Column-wise mean of the colour values across all rows.
    average_values = [sum(x) / len(x) for x in zip(*color_values)]
    ax2 = fig.add_subplot(gs[-1])
    ax2.step(range(len(average_values)), average_values, where='mid')
    ax2.set_xlim([-0.5, len(average_values)])
    # Hand-tuned placement so the step plot lines up with the letter rows.
    ax2.set_position([0.108, 0.0, 1.125, 0.3])
    ax2.set_xticks([])
    ax2.set_yticks([])
    ax2.set_ylabel("Diversity", fontsize=20)
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    plt.show()

In [4]:
# Load the per-model result tables. Paths are relative to the notebook's
# working directory, so the CSV files must sit next to the notebook.
piFold = pd.read_csv("PiFold_CATH4.2_results.csv")
RLDIF = pd.read_csv("RLDIF_Cath4.2_results.csv")
mpnn = pd.read_csv("protein_mpnn_CATH4.2_results.csv")
columns = ["name", "pred", "real", "tm", "test_type"]

# The rename has to happen BEFORE the column selection: selecting "tm" first
# either raises KeyError (column still called "ngcc_tm_scores") or makes the
# rename a no-op. Only rename when "tm" is absent so an existing "tm" column
# is never duplicated.
if "tm" not in mpnn.columns:
    mpnn = mpnn.rename(columns={"ngcc_tm_scores": "tm"})
mpnn = mpnn[columns]


def _test_split_with_len(df):
    """Keep rows whose ``test_type`` is "test" and add ``len`` = length of the ``real`` string."""
    # .copy() so the new column is written to an independent frame rather than
    # to a filtered view (avoids SettingWithCopyWarning).
    test_rows = df[df["test_type"] == "test"].copy()
    test_rows["len"] = test_rows["real"].str.len()
    return test_rows


def _group_stats(df):
    """Attach per-``name`` mean ``tm``, ``div`` and ``pairwise_div`` columns to ``df``.

    Returns the enriched frame plus the two per-name tables it was merged with.
    """
    stats = pd.merge(df, df.groupby('name')['tm'].agg(['mean']), on="name", how="left")
    div = stats.groupby('name').apply(sequence_diversity).reset_index(name="div")
    pairwise = stats.groupby('name').apply(pairwise_div).reset_index(name="pairwise_div")
    stats = pd.merge(stats, div, on="name", how="left")
    stats = pd.merge(stats, pairwise, on="name", how="left")
    return stats, div, pairwise


# All three models are restricted to the test split. RLDIF's filter result was
# previously computed but never assigned, so RLDIF silently stayed unfiltered.
RLDIF = _test_split_with_len(RLDIF)
piFold = _test_split_with_len(piFold)
mpnn = _test_split_with_len(mpnn)

rldif_stats, rldif_div, rldif_pairwise_div = _group_stats(RLDIF)
mpnn_stats, mpnn_div, mpnn_pairwise_div = _group_stats(mpnn)
pifold_stats, pifold_div, pifold_pairwise_div = _group_stats(piFold)

FileNotFoundError: [Errno 2] No such file or directory: 'PiFold_CATH4.2_results.csv'

In [6]:
# For each example structure, pull every model's predicted sequences together
# with their per-position diversity lists, then draw one figure per model.
names = ["2x7r.B", "5flm.E"]
for name in names:
    # Rows belonging to this structure, per model.
    rldif_ex = rldif_stats.loc[rldif_stats["name"] == name]
    mpnn_ex = mpnn_stats.loc[mpnn_stats["name"] == name]
    pifold_ex = pifold_stats.loc[pifold_stats["name"] == name]

    # Sequences and their matching colour values (one list per sequence).
    rldif_ex_all_seqs = rldif_ex["pred"].tolist()
    rldif_ex_all_divs = rldif_ex["pairwise_div"].tolist()
    mpnn_ex_all_seqs = mpnn_ex["pred"].tolist()
    mpnn_ex_all_divs = mpnn_ex["pairwise_div"].tolist()
    pifold_ex_all_seqs = pifold_ex["pred"].tolist()
    pifold_ex_all_divs = pifold_ex["pairwise_div"].tolist()

    # Figures are produced in the order RLDIF, ProteinMPNN, PiFold.
    plot_sequence(rldif_ex_all_seqs, rldif_ex_all_divs, model="rldif", name=name)
    plot_sequence(mpnn_ex_all_seqs, mpnn_ex_all_divs, model="mpnn", name=name)
    plot_sequence(pifold_ex_all_seqs, pifold_ex_all_divs, model="pifold", name=name)

NameError: name 'rldif_stats' is not defined