# In [11]:
# DESTRESS data visualisation: bar charts of average amino-acid composition per CATH architecture

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import numpy as np
import seaborn as sns
import pandas as pd
import csv
import os

# ---------------------------------------------------------------------------------------------------
# data import

# Input CSV files: the DESTRESS table and the CATH S35 domain list.
destress_data_path = "/volumes/dax-hd/project-data/search-files/destress-data.csv"
cath_data_path = "/volumes/dax-hd/project-data/search-files/cath-domain-list-S35.csv"

# Directory that receives one PNG per architecture.
image_dir = "/Users/daxtraill/Honours Project/figures/aa_abundance_architectures"

# ---------------------------------------------------------------------------------------------------
# Data-frame manipulation

df_main = pd.read_csv(destress_data_path)
df_cath = pd.read_csv(cath_data_path)

# Keep only the first four characters of each CATH domain name so it can be
# joined against `design_name` in the DESTRESS table.
# NOTE(review): presumably those four characters are the PDB code, which means
# several domains can share one `design_name` -- confirm this is intended.
df_cath['CATH domain name'] = df_cath['CATH domain name'].str[:4]
df_cath = df_cath.rename(columns={'CATH domain name': 'design_name'})

# Left join: every CATH row is kept; rows with no DESTRESS match carry NaN in
# the DESTRESS columns.
df_merged = pd.merge(df_cath, df_main, on='design_name', how='left')

# Fail early, with a readable message, if the classification columns needed
# downstream are absent (e.g. renamed by merge suffixes on a name clash).
missing_columns = [
    column
    for column in ('Class number', 'Architecture number')
    if column not in df_merged.columns
]
if missing_columns:
    raise KeyError(
        f"Merged table is missing required column(s): {missing_columns}"
    )


# ---------------------------------------------------------------------------------------------------
# variable data

# Nested lookup: class number -> architecture number -> architecture label.
cath_ids = {
    1: {
        10: "orthogonal_bundle",
        20: "up_down_bundle",
        25: "alpha_horseshoe",
        40: "alpha_solenoid",
        50: "alpha_alpha_barrel",
    },
    2: {
        10: "ribbon",
        20: "single_sheet",
        30: "beta_roll",
        40: "beta_barrel",
        50: "clam",
        60: "sandwich",
        70: "distorted_sandwich",
        80: "trefoil",
        90: "orthogonal_prism",
        100: "aligned_prism",
        102: "three_layer_sandwich",
        105: "three_propeller",
        110: "four_propeller",
        115: "five_propeller",
        120: "six_propeller",
        130: "seven_propeller",
        140: "eight_propeller",
        150: "two_solenoid",
        160: "three_solenoid",
        170: "beta_complex",
        180: "shell",
    },
    3: {
        10: "alpha_beta_roll",
        15: "super_roll",
        20: "alpha_beta_barrel",
        30: "two_layer_sandwich",
        40: "three_layer_aba_sandwich",
        50: "three_layer_bab_sandwich",
        55: "three_layer_bba_sandwich",
        60: "four_layer_sandwich",
        65: "alpha_beta_prism",
        70: "box",
        75: "five_stranded_propeller",
        80: "alpha_beta_horseshoe",
        90: "alpha_beta_complex",
        100: "ribosomal_protein_l15_chain_k_domain_two",
    },
    4: {
        10: "irregular",
    },
    6: {
        10: "helix_non_globular",
        20: "other_non_globular",
    },
}

# Composition column names, one per three-letter residue code (UNK included).
# The order below is the order in which bars are drawn.
destress_columns = [
    f"composition_{residue}"
    for residue in (
        "ALA", "CYS", "ASP", "GLU", "PHE", "GLY", "HIS",
        "ILE", "LYS", "LEU", "MET", "ASN", "PRO", "GLN",
        "ARG", "SER", "THR", "VAL", "TRP", "UNK", "TYR",
    )
]

# ---------------------------------------------------------------------------------------------------
# Add the architecture name to df

def add_architecture_name(df, arch_lookup=None):
    """Add an ``architecture_name`` column built from the class/architecture ids.

    Each row is labelled ``"<name> (<class>,<architecture>)"``, where ``<name>``
    is looked up as ``arch_lookup[class][architecture]``. Rows whose ids are
    not present in the lookup are labelled ``"Unknown"``.

    The ids are read column-wise rather than through a row-wise ``apply``.
    A row-wise ``apply`` upcasts integer ids to float whenever every column is
    numeric, which produced labels such as ``"(1.0,10.0)"``; it also
    misbehaves on an empty frame.

    Args:
        df: DataFrame with ``'Class number'`` and ``'Architecture number'``
            columns. It is modified in place.
        arch_lookup: Optional nested mapping
            ``{class number: {architecture number: name}}``. Defaults to the
            module-level ``cath_ids`` table.

    Returns:
        The same ``df`` object, with the ``architecture_name`` column set.

    Raises:
        KeyError: If ``df`` lacks either of the two id columns.
    """
    if arch_lookup is None:
        arch_lookup = cath_ids

    def get_arch_name(class_num, arch_num):
        try:
            return f"{arch_lookup[class_num][arch_num]} ({class_num},{arch_num})"
        except KeyError:
            # Covers unknown ids as well as NaN ids.
            return "Unknown"

    df['architecture_name'] = [
        get_arch_name(class_num, arch_num)
        for class_num, arch_num in zip(df['Class number'], df['Architecture number'])
    ]

    return df

# Label every row with its architecture, then move that label so it becomes
# the second column of the table.
df_merged = add_architecture_name(df_merged)

cols = [column for column in df_merged.columns if column != 'architecture_name']
cols.insert(1, 'architecture_name')
df_merged = df_merged[cols]

# ---------------------------------------------------------------------------------------------------

# Column means over the whole merged table; drawn on every figure as the
# reference level for each residue.
global_averages = df_merged[destress_columns].mean()
unique_architectures = df_merged['architecture_name'].unique()

# One viridis colour per composition column, shared by all figures.
colors = cm.viridis(np.linspace(0, 1, len(destress_columns)))
os.makedirs(image_dir, exist_ok=True)

# Plot inputs that do not depend on the architecture are computed once.
bar_positions = np.arange(len(destress_columns))
shortened_labels = [label[-3:] for label in destress_columns]  # e.g. "ALA"
global_levels = global_averages.to_numpy()

for architecture in unique_architectures:
    df_architecture = df_merged[df_merged['architecture_name'] == architecture]
    architecture_averages = df_architecture[destress_columns].mean()

    plt.figure(figsize=(10, 8))
    try:
        plt.bar(destress_columns, architecture_averages, color=colors, label=architecture)

        # Global mean per residue: a red dot plus a red tick spanning the bar
        # width, drawn for all residues in a single call each.
        plt.plot(bar_positions, global_levels, 'ro')
        plt.hlines(global_levels, bar_positions - 0.4, bar_positions + 0.4,
                   colors='r', linewidth=1.5)

        plt.xticks(range(len(destress_columns)), shortened_labels, rotation=45, ha="right")

        plt.title(f'Average Amino Acid Composition in {architecture}')
        plt.xlabel('Amino Acids')
        plt.ylabel('Average Composition')

        plt.tight_layout()
        # Spaces and slashes are replaced so the label is usable as a file name.
        image_path = os.path.join(image_dir, f"{architecture.replace(' ', '_').replace('/', '_')}.png")
        plt.savefig(image_path)
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close()