In [5]:
# Generate CSV files with labels and paths to the .npy files of the adjacency matrices.
from prody import *
from biopandas.pdb import PandasPdb
import numpy as np
from scipy.spatial import distance_matrix
import matplotlib.pyplot as plt
import os
import pandas as pd
import csv 
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split

In [6]:
def distance_matrix_creator(PDB_filename):
    """Build a residue-level pairwise distance matrix from a PDB file.

    Every residue is approximated by the mean x/y/z position of its ATOM
    records, and the Euclidean distance between each pair of those
    per-residue mean positions is computed.

    Parameters
    ----------
    PDB_filename : str
        Path of the PDB file to read.

    Returns
    -------
    numpy.ndarray
        Symmetric M x M matrix, where M is the number of distinct
        ``residue_number`` values among the ATOM records. Rows/columns follow
        ascending ``residue_number`` (pandas ``groupby`` sorts its keys by
        default). Because it holds only pairwise distances, the matrix does
        not change when the structure is rotated or translated.
    """
    ppdb = PandasPdb()
    ppdb.read_pdb(PDB_filename)
    atom_data = ppdb.df['ATOM']

    # NOTE(review): atoms are grouped by residue_number alone, so residues
    # that share a number (other chains, insertion codes) would be averaged
    # together -- confirm the inputs are single-chain structures.
    residue_centroids = (
        atom_data[["residue_number", "x_coord", "y_coord", "z_coord"]]
        .groupby("residue_number")
        .mean()
    )

    # p=2 selects the Euclidean norm; all pairs of residue centroids.
    return distance_matrix(residue_centroids, residue_centroids, p=2)

In [7]:
# CSV holding the final per-variant scores.
variant_scores_path = "/mnt/ncshare/ozkilim/charge_pca_deepmut/SARS-CoV-2-RBD_DMS_Omicron/results/final_variant_scores/final_variant_scores.csv"

# Raw `target` label -> variant name used throughout the notebook.
variants_names = dict([
    ("Wuhan-Hu-1_v2", "Wuhan-Hu-1_v2"),
    ("N501Y", "Alpha"),
    ("Beta", "Beta"),
    ("Delta", "Delta"),
    ("E484K", "Eta"),
    ("Omicron_BA1", "Omicron_BA1"),
])

# Expression and binding scores per variant background.
scores_df = pd.read_csv(variant_scores_path)
# Relabel the variant backgrounds in the `target` column.
scores_df = scores_df.replace({"target": variants_names})
# Give the score columns their human-readable names.
scores_df = scores_df.rename(
    columns={
        "position": "site",
        "delta_expr": "RBD expression",
        "delta_bind": "ACE2 binding",
    }
)

# One sub-frame per variant background, selected on the relabelled `target`.
wuhan = scores_df[scores_df["target"].eq("Wuhan-Hu-1_v2")]
alpha = scores_df[scores_df["target"].eq("Alpha")]
beta = scores_df[scores_df["target"].eq("Beta")]
delta = scores_df[scores_df["target"].eq("Delta")]
eta = scores_df[scores_df["target"].eq("Eta")]
omicronBA1 = scores_df[scores_df["target"].eq("Omicron_BA1")]
# "Omicron_BA2" has no entry in variants_names, so it is matched on its raw label.
omicronBA2 = scores_df[scores_df["target"].eq("Omicron_BA2")]

In [8]:
# Generic function that is reused for every variant.

def create_frame_for_CNN(variant_name, variant_lab_data, directory,
                         output_root="/mnt/ncshare/ozkilim/charge_pca_deepmut/CNN_training/"):
    """Export distance matrices and a CSV annotation file for one variant.

    For every file in ``directory`` the mutation name is taken from the file
    path, its distance matrix is saved as
    ``<output_root>/adj_mats/<variant_name>/<mutation>.npy`` and a row
    ``[npy_path, ACE2 binding, RBD expression]`` is appended to
    ``<output_root>/<variant_name>_adj_data.csv``.

    Parameters
    ----------
    variant_name : str
        Label used for the output sub-directory and the CSV file name.
    variant_lab_data : pandas.DataFrame
        Lab scores with the columns ``"mutation"``, ``"RBD expression"`` and
        ``"ACE2 binding"``.
    directory : str
        Folder containing the structure files to process.
    output_root : str, optional
        Folder that receives the ``adj_mats`` tree and the CSV file. Defaults
        to the location that was previously hard-coded.

    Returns
    -------
    None

    Notes
    -----
    * The CSV is opened in append mode, so calling this twice for the same
      variant duplicates its rows.
    * Files whose mutation has no row in ``variant_lab_data`` are skipped
      before any matrix is written; files that fail for any other reason are
      skipped as well. A one-line summary of skipped files is printed at the
      end instead of silently discarding the errors.
    """
    matrix_dir = os.path.join(output_root, "adj_mats", variant_name)
    annotations_csv = os.path.join(output_root, variant_name + "_adj_data.csv")
    # np.save does not create missing folders, so make sure the target exists.
    os.makedirs(matrix_dir, exist_ok=True)

    skipped = []  # (filename, reason) for every file that produced no row

    # newline="" is what the csv module expects for files it writes to.
    with open(annotations_csv, "a", newline="") as f:
        writer = csv.writer(f)
        # Sorted so the row order does not depend on os.listdir's ordering.
        for filename in sorted(os.listdir(directory)):
            try:
                file_path = os.path.join(directory, filename)
                # Assumes names end in a 5-character mutation followed by a
                # 4-character extension, e.g. "N501Y.pdb" -- TODO confirm.
                mut_name = file_path[-9:-4]

                labels = variant_lab_data.loc[variant_lab_data["mutation"] == mut_name]
                if labels.empty:
                    skipped.append((filename, "no lab data for " + repr(mut_name)))
                    continue

                # Create and store the distance matrix.
                d = distance_matrix_creator(file_path)
                numpy_file = os.path.join(matrix_dir, mut_name + ".npy")
                np.save(numpy_file, d)

                # First matching row supplies the phenotype values.
                binding = labels["ACE2 binding"].values[0]
                expression = labels["RBD expression"].values[0]
                # Column order is path, binding, expression (the original
                # author notes this is the reverse of the XGBoost data).
                writer.writerow([numpy_file, binding, expression])
            except Exception as err:
                # Best effort per file: remember the failure and carry on.
                skipped.append((filename, repr(err)))

    if skipped:
        print("[{}] skipped {} file(s); first few: {}".format(
            variant_name, len(skipped), skipped[:5]))


# (variant label, lab-score frame, folder with that variant's structures),
# processed in the order listed.
_variant_jobs = [
    ("wuhan", wuhan,
     '/mnt/ncshare/ozkilim/charge_pca_deepmut/Wuhan_RBDs/structures/Wuhan_RBDs/'),
    ("alpha", alpha,
     '/mnt/ncshare/ozkilim/charge_pca_deepmut/Wuhan_RBDs/structures/Alpha_aligned/'),
    ("beta", beta,
     '/mnt/ncshare/ozkilim/charge_pca_deepmut/Wuhan_RBDs/structures/Beta_aligned/'),
    ("delta", delta,
     '/mnt/ncshare/ozkilim/charge_pca_deepmut/Wuhan_RBDs/structures/Delta_aligned/'),
    ("eta", eta,
     '/mnt/ncshare/ozkilim/charge_pca_deepmut/Wuhan_RBDs/structures/Eta_Wuhan_RBD_DMS_PDB/'),
    ("omicronBA1", omicronBA1,
     '/mnt/ncshare/ozkilim/charge_pca_deepmut/Wuhan_RBDs/structures/Omicron_Wuhan_RBD_DMS_PDB/'),
]

# Export the matrices and the annotation CSV for each variant in turn.
for variant_name, _lab_scores, directory in _variant_jobs:
    create_frame_for_CNN(variant_name, _lab_scores, directory)