In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import seaborn as sns
import os
import json
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

# ---------------------------------------------------------------------------------------------------
# Load the dataset

path = "/Volumes/dax-hd/project-data/search-files/merged-data.csv"
base_save_folder = "/Volumes/dax-hd/project-data/images/tsne_topology_2/"
cath_dict_path = "/Volumes/dax-hd/project-data/search-files/cath-archetype-dict.txt"

df = pd.read_csv(path)
original_columns = set(df.columns)

with open(cath_dict_path, 'r') as file:
    cath_dict = json.load(file)
if not os.path.exists(base_save_folder):
    os.makedirs(base_save_folder)

# ---------------------------------------------------------------------------------------------------

selected_architectures = ["beta_barrel (2,40)"]

destress_columns = [
    "hydrophobic_fitness",
    "isoelectric_point",
    "charge",
    "mass",
    "num_residues",
    "packing_density",
    "budeff_total",
    "budeff_steric",
    "budeff_desolvation",
    "budeff_charge",
    "evoef2_total",
    "evoef2_ref_total",
    "evoef2_intraR_total",
    "evoef2_interS_total",
    "evoef2_interD_total",
    "dfire2_total",
    "rosetta_total",
    "rosetta_fa_atr",
    "rosetta_fa_rep",
    "rosetta_fa_intra_rep",
    "rosetta_fa_elec",
    "rosetta_fa_sol",
    "rosetta_lk_ball_wtd",
    "rosetta_fa_intra_sol_xover4",
    "rosetta_hbond_lr_bb",
    "rosetta_hbond_sr_bb",
    "rosetta_hbond_bb_sc",
    "rosetta_hbond_sc",
    "rosetta_dslf_fa13",
    "rosetta_rama_prepro",
    "rosetta_p_aa_pp",
    "rosetta_fa_dun",
    "rosetta_omega",
    "rosetta_pro_close",
    "rosetta_yhh_planarity",
    "aggrescan3d_total_value",
    "aggrescan3d_avg_value",
    "aggrescan3d_min_value",
    "aggrescan3d_max_value"
    ]

# ---------------------------------------------------------------------------------------------------
# Add the architecture name to df

def add_topology_description(df, cath_dict):
    def get_topology_description(row):
        class_num = str(row['Class number'])
        arch_num = str(row['Architecture number'])
        top_num = str(row['Topology number'])
        try:
            description = cath_dict[class_num][arch_num][top_num]['description']
            return description
        except KeyError:
            return "Unknown"
    
    df['topology_description'] = df.apply(get_topology_description, axis=1)
    return df

df = add_topology_description(df, cath_dict)

# ---------------------------------------------------------------------------------------------------
# Removing correlating features

def remove_highly_correlated_features(df, tolerance, columns):
    if columns is None:
        columns = df.columns

    valid_columns = [col for col in columns if col in df.columns and np.issubdtype(df[col].dtype, np.number)]
    df_selected = df[valid_columns].copy()

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df_selected)
    df_scaled = pd.DataFrame(scaled_features, columns=valid_columns)

    corr_matrix = df_scaled.corr(method='spearman').abs()
    dropped_features = []

    while True:
        upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > tolerance)]
        
        if not to_drop:
            break
        
        feature_to_remove = to_drop[0]
        df_selected.drop(columns=feature_to_remove, inplace=True)
        df_scaled.drop(columns=feature_to_remove, inplace=True)
        dropped_features.append(feature_to_remove)
        corr_matrix = df_scaled.corr(method='spearman').abs()

    return df.drop(columns=dropped_features), dropped_features

# ---------------------------------------------------------------------------------------------------
# Remove missing data

threshold = 0.2
missing_percentage = df.isnull().sum() / len(df)
columns_to_drop = missing_percentage[missing_percentage > threshold].index
df = df.drop(columns=columns_to_drop, axis=1)
df = df.dropna()

cleaned_columns = set(df.columns)
dropped_columns = list(original_columns - cleaned_columns)
print("Dropped columns (Missing Values):", dropped_columns)

# ---------------------------------------------------------------------------------------------------
# Normalise Data

normalise_columns = [
    "num_residues", "hydrophobic_fitness", "budeff_total", "budeff_steric", "budeff_desolvation", "budeff_charge",
    "evoef2_total", "evoef2_ref_total", "evoef2_intraR_total", "evoef2_interS_total", "evoef2_interD_total",
    "dfire2_total", "rosetta_total", "rosetta_fa_atr", "rosetta_fa_rep", "rosetta_fa_intra_rep", "rosetta_fa_elec",
    "rosetta_fa_sol", "rosetta_lk_ball_wtd", "rosetta_fa_intra_sol_xover4", "rosetta_hbond_lr_bb",
    "rosetta_hbond_sr_bb", "rosetta_hbond_bb_sc", "rosetta_hbond_sc", "rosetta_dslf_fa13", "rosetta_rama_prepro",
    "rosetta_p_aa_pp", "rosetta_fa_dun", "rosetta_omega", "rosetta_pro_close", "rosetta_yhh_planarity"
]

if 'num_residues' in df.columns:
    for field in normalise_columns:
        if field in df.columns:
            df[field] = df[field] / df['num_residues']

# ---------------------------------------------------------------------------------------------------
# Drop mass and residue number, removing highly correlated features, and scaling
            
df = df[df['architecture_name'].isin(selected_architectures)]

df = df.drop(['mass', 'num_residues'], axis=1)

df, dropped_features = remove_highly_correlated_features(df, tolerance=0.6, columns=destress_columns)

corr_columns = set(df.columns)
dropped_columns_corr = list(cleaned_columns - corr_columns)
print("Dropped columns (Correlation):", dropped_columns_corr)

nunique = df.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
df = df.drop(cols_to_drop, axis=1)

nuq_columns = set(df.columns)
dropped_columns_nuq = list(corr_columns - nuq_columns)
print("Dropped columns (Little/no Variance):", dropped_columns_nuq)

tsne_columns = [col for col in destress_columns if col in df.columns]
df_tsne_ready = df[tsne_columns].dropna()

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_tsne_ready)

# ---------------------------------------------------------------------------------------------------
# Plotting

for architecture_name in selected_architectures:
    save_folder = os.path.join(base_save_folder, architecture_name.replace('/', '_'))
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    tsne = TSNE(n_components=2, perplexity=20, learning_rate=800, n_iter=3000, random_state=42)
    tsne_results = tsne.fit_transform(df_scaled)
    tsne_df = pd.DataFrame(data=tsne_results, columns=['Dimension 1', 'Dimension 2'])
    
    tsne_df['topology_description'] = df['topology_description'].values[:len(tsne_df)]
    print(f"Total number of datapoints for {architecture_name}: {len(tsne_df)}")

    # ---------------------------------------------------------------------------------------------------
    
    tsne_df['topology_description'] = df['topology_description'].values
    unique_topologies = tsne_df['topology_description'].unique()
    topology_to_id = {topology: i % 3 for i, topology in enumerate(unique_topologies)}

    tsne_df['marker_style'] = tsne_df['topology_description'].map(topology_to_id).map({0: 'o', 1: '^', 2: 's'})

    markers = ['o', '^', 's']
    palette = sns.color_palette('Spectral', n_colors=len(unique_topologies))
    
    plt.figure(figsize=(20, 12))
    ax = plt.subplot(111, aspect='equal')

    sns.scatterplot(
        x='Dimension 1', y='Dimension 2',
        hue='topology_description',
        style='marker_style',
        palette=palette,
        markers=markers,
        data=tsne_df, s=100,
        ax=ax
    )
    
    plt.title(f't-SNE for {architecture_name}')
    plt.xlabel('Dimension 1')
    plt.ylabel('Dimension 2')

    ax.legend_.remove()
    legend_items = []
    for i, (topology, marker) in enumerate(zip(unique_topologies, markers * (len(unique_topologies) // len(markers) + 1))):
        legend_items.append(mlines.Line2D([], [], color=palette[i % len(palette)], marker=marker, linestyle='None', markersize=10, label=topology))
    ax.legend(handles=legend_items, title='Topology Description', bbox_to_anchor=(1, 1), loc='upper left')

    plt.figtext(0.5, 0.03, f"Perplexity: {tsne.perplexity}, Learning Rate: {tsne.learning_rate}, Iterations: {tsne.n_iter}", ha="center", fontsize=10)
    plt.figtext(0.5, 0.01, f"Features used: {tsne_columns}", ha="center", fontsize=10)
        
    plt.savefig(os.path.join(save_folder, f"{architecture_name}-tsne.png"), bbox_inches='tight')
    plt.close()

print("TSNE analysis completed.")

# ---------------------------------------------------------------------------------------------------


Dropped columns (Missing Values): ['dfire2_total']
Dropped columns (Correlation): ['num_residues', 'rosetta_rama_prepro', 'budeff_charge', 'evoef2_interD_total', 'rosetta_lk_ball_wtd', 'aggrescan3d_total_value', 'rosetta_fa_intra_sol_xover4', 'rosetta_fa_sol', 'budeff_steric', 'mass', 'evoef2_intraR_total', 'rosetta_fa_rep', 'evoef2_ref_total', 'rosetta_pro_close', 'rosetta_fa_atr', 'rosetta_p_aa_pp', 'rosetta_hbond_sr_bb', 'rosetta_hbond_lr_bb', 'rosetta_hbond_bb_sc', 'charge', 'evoef2_total', 'rosetta_fa_intra_rep', 'evoef2_interS_total', 'budeff_desolvation', 'rosetta_fa_elec', 'rosetta_fa_dun', 'rosetta_hbond_sc']
Dropped columns (Little/no Variance): ['Class number', 'S100 sequence cluster number', 'S100 sequence count number', 'Architecture number', 'composition_UNK', 'S60 sequence cluster number', 'architecture_name', 'S95 sequence cluster number', 'rosetta_yhh_planarity']
Total number of datapoints for beta_barrel (2,40): 1100
TSNE analysis completed.


In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import seaborn as sns
import os
import json
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

# ---------------------------------------------------------------------------------------------------
# Load the dataset

path = "/Volumes/dax-hd/project-data/corr_features/destress_columns_reduced.csv "
base_save_folder = "/Volumes/dax-hd/project-data/images/tsne_topology_"
cath_dict_path = "/Volumes/dax-hd/project-data/search-files/cath-archetype-dict.txt"

df = pd.read_csv(path)

with open(cath_dict_path, 'r') as file:
    cath_dict = json.load(file)

# ---------------------------------------------------------------------------------------------------
# Add the architecture name to df

def add_topology_description(df, cath_dict):
    def get_topology_description(row):
        class_num = str(row['Class number'])
        arch_num = str(row['Architecture number'])
        top_num = str(row['Topology number'])
        try:
            description = cath_dict[class_num][arch_num][top_num]['description']
            return description
        except KeyError:
            return "Unknown"
    
    df['topology_description'] = df.apply(get_topology_description, axis=1)
    return df

df = add_topology_description(df, cath_dict)

# ---------------------------------------------------------------------------------------------------

def filter_for_archetypes(df, cath_dict):
    archetype_ids = []
    for _, row in df.iterrows():
        class_num = str(row['Class number'])
        arch_num = str(row['Architecture number'])
        top_num = str(row['Topology number'])
        try:
            protein_id = cath_dict[class_num][arch_num][top_num]['protein_id']
            if protein_id[:4] in row['design_name']:
                archetype_ids.append(row['design_name'])
        except KeyError:
            continue
    return df[df['design_name'].isin(archetype_ids)]

# ---------------------------------------------------------------------------------------------------

selected_architectures = ["sandwich (2,60)"]

selected_columns = [
    "hydrophobic_fitness",
    "isoelectric_point",
    "charge",
    "mass",
    "num_residues",
    "packing_density",
    "budeff_total",
    "budeff_steric",
    "budeff_desolvation",
    "budeff_charge",
    "evoef2_total",
    "evoef2_ref_total",
    "evoef2_intraR_total",
    "evoef2_interS_total",
    "evoef2_interD_total",
    "dfire2_total",
    "rosetta_total",
    "rosetta_fa_atr",
    "rosetta_fa_rep",
    "rosetta_fa_intra_rep",
    "rosetta_fa_elec",
    "rosetta_fa_sol",
    "rosetta_lk_ball_wtd",
    "rosetta_fa_intra_sol_xover4",
    "rosetta_hbond_lr_bb",
    "rosetta_hbond_sr_bb",
    "rosetta_hbond_bb_sc",
    "rosetta_hbond_sc",
    "rosetta_dslf_fa13",
    "rosetta_rama_prepro",
    "rosetta_p_aa_pp",
    "rosetta_fa_dun",
    "rosetta_omega",
    "rosetta_pro_close",
    "rosetta_yhh_planarity",
    "aggrescan3d_total_value",
    "aggrescan3d_avg_value",
    "aggrescan3d_min_value",
    "aggrescan3d_max_value"
]

# ---------------------------------------------------------------------------------------------------

df_filtered = df #filter_for_archetypes(df, cath_dict)
df_selected_cleaned = df_filtered[selected_columns].dropna()
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_selected_cleaned)

# ---------------------------------------------------------------------------------------------------

for architecture_name in df['architecture_name'].unique():
    if architecture_name not in selected_architectures:
        continue

    df_architecture = df[df['architecture_name'] == architecture_name]
    df_selected_cleaned = df_architecture[selected_columns].dropna()

    if df_selected_cleaned.empty:
        print(f"No data available for {architecture_name}. Skipping...")
        continue

    topology_descriptions = df_architecture['topology_description'][df_selected_cleaned.index].reset_index(drop=True)

    # Scaling
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(df_selected_cleaned)

    tsne = TSNE(n_components=3, random_state=42)
    tsne_results = tsne.fit_transform(scaled_features)
    
    tsne_df = pd.DataFrame(tsne_results, columns=['Dimension 1', 'Dimension 2', 'Dimension 3'])
    tsne_df['topology_description'] = topology_descriptions

    # 3D Plotting
    fig = plt.figure(figsize=(30, 21))
    ax = fig.add_subplot(111, projection='3d')

    codes, uniques = pd.factorize(tsne_df['topology_description'])

    num_colors = len(uniques)

    repeated_cmap = plt.cm.tab20c(np.linspace(0, 1, num_colors) % 1)

    scatter = ax.scatter(
        tsne_df['Dimension 1'], tsne_df['Dimension 2'], tsne_df['Dimension 3'],
        c=[repeated_cmap[i] for i in codes],
        s=100
    )

    legend_handles = [mpatches.Patch(color=repeated_cmap[i], label=uniques[i]) for i in range(num_colors)]

    ax.legend(handles=legend_handles, bbox_to_anchor=(1.05, 1), loc='upper left', title="Topologies")

    ax.set_title(f't-SNE 3D Visualization')
    ax.set_xlabel('Dimension 1')
    ax.set_ylabel('Dimension 2')
    ax.set_zlabel('Dimension 3')

    figure_path = os.path.join(base_save_folder, f"tsne_3d_{architecture_name}.png")
    plt.savefig(figure_path)
    plt.close(fig)



ValueError: could not convert string to float: '1oai'