## Find distances of each RBP residue to either Ephrin-B2 or -B3, and find contact sites

In [None]:
# this cell is tagged as parameters for `papermill` parameterization
# Every value defaults to None; a later cell fills in default paths when
# `ephrin_b2_close_residues` is still None.

# Input PDB files: used below for the 2VSM (Ephrin-B2), 3D12 (Ephrin-B3) and
# 7txz ("dimerization") distance calculations, respectively.
e2_pdb = None
e3_pdb = None
rbp_pdb = None

# Output CSV paths for the per-site lists of receptor residues within the
# contact cutoff.
ephrin_b2_close_residues = None
ephrin_b3_close_residues = None

# Output CSV paths for the per-site minimum-distance tables.
ephrin_b2_distance = None
ephrin_b3_distance = None
dimerization_distance = None

In [None]:
import math
import os

import altair as alt

import numpy as np

import pandas as pd

import scipy.stats

import Bio.SeqIO

from Bio import AlignIO
from Bio import PDB
from Bio.Align import PairwiseAligner
from Bio.PDB.DSSP import DSSP
from Bio.PDB import PDBParser

#pd.set_option('display.max_rows', None)

In [None]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# All data/ and results/ paths used below are relative, so run from the
# project root.
project_dir = "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"

# os.getcwd() normally returns the path without a trailing separator, so the
# two sides are normalized before comparing; a raw comparison against the
# literal above (which ends in "/") could never match.
if os.path.normpath(os.getcwd()) == os.path.normpath(project_dir):
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")

In [None]:
# Fall back to default input/output paths when the notebook is run without
# papermill parameters. Only `ephrin_b2_close_residues` is tested, so the
# defaults are applied all-or-nothing: if it is None, every path below is
# (re)assigned, including any that were supplied.
if ephrin_b2_close_residues is None:
    # input crystal structures
    e2_pdb = 'data/custom_analyses_data/crystal_structures/2vsm.pdb'
    e3_pdb = 'data/custom_analyses_data/crystal_structures/3d12.pdb'
    rbp_pdb = 'data/custom_analyses_data/crystal_structures/7txz.pdb'
    
    # output: receptor residues within the contact cutoff of each site
    ephrin_b2_close_residues = 'results/distances/2vsm_close_residues.csv'
    ephrin_b3_close_residues = 'results/distances/3d12_close_residues.csv'
    
    # output: minimum distance from each site to the partner chain
    ephrin_b2_distance = 'results/distances/2vsm_distances.csv'
    ephrin_b3_distance = 'results/distances/3d12_distances.csv'
    dimerization_distance = 'results/distances/7txz_distances.csv'

In [None]:
# Make sure an output directory exists, reporting what was done
def create_directory(directory_path):
    """Create `directory_path` (including missing parents) unless it exists.

    Prints a one-line message saying whether the directory was created or was
    already present. Returns None.
    """
    # Guard clause: nothing to create when the path is already there
    if os.path.exists(directory_path):
        print(f"Directory '{directory_path}' already exists.")
        return

    os.makedirs(directory_path)
    print(f"Directory '{directory_path}' created.")

# Create the directory that the default output CSV paths point into
# (not just an example: the CSV writes below need it to exist)
directory_path = 'results/distances/'
create_directory(directory_path)

## Calculate amino acid distances of RBP to Ephrins

### First calculate how many receptor residues are within cutoff distance

In [None]:
# Atom-atom distance below which two residues are treated as in contact by
# calculate_nearby_residues (presumably angstroms, the unit of PDB
# coordinates -- confirm)
cutoff_distance = 4

In [None]:
def calculate_nearby_residues(pdb_path, source_chain_id, target_chain_ids, name, cutoff_distance):
    """List, for every source-chain residue, the target residues in contact with it.

    Two residues are in contact when at least one pair of their atoms is
    closer than `cutoff_distance` (strict `<`). Only the first model of the
    structure (`structure[0]`) is used.

    Parameters:
        pdb_path: path of the PDB file to parse.
        source_chain_id: ID of the chain whose residues become the rows.
        target_chain_ids: iterable of chain IDs searched for contacts. A
            one-character string such as 'B' also works, because iterating
            it yields that single ID.
        name: label stored in the `custom_source` column of every row.
        cutoff_distance: contact threshold, compared against the atom-atom
            distances produced by Bio.PDB atom subtraction.

    Returns:
        pandas.DataFrame with one row per retained source residue and the
        columns `wildtype` (residue name), `site` (residue number),
        `nearby_residues` (list of dicts with `chain`, `residue_id`,
        `residue_name`, `distance`) and `custom_source`.
    """
    # Initialize the PDB parser and load the structure
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('structure_id', pdb_path)

    model = structure[0]
    source_chain = model[source_chain_id]
    target_chains = [model[chain_id] for chain_id in target_chain_ids]

    # Residue names that are skipped. NAG is skipped only on the source
    # chain, exactly as in the original filters.
    source_excluded = {"HOH", "WAT", "IPA", "NAG", "SO4"}
    target_excluded = {"HOH", "WAT", "IPA", "SO4"}

    data = []

    for residueA in source_chain:
        if residueA.resname in source_excluded:
            continue

        nearby_residues = []

        for target_chain in target_chains:
            for residueB in target_chain:
                if residueB.resname in target_excluded:
                    continue

                # Use the true minimum atom-atom distance of the residue
                # pair. Stopping at the first atom pair under the cutoff
                # selects the same residues, but records a 'distance' that
                # depends on atom order rather than the closest approach.
                min_distance = min(
                    (atomA - atomB for atomA in residueA for atomB in residueB),
                    default=float('inf'),
                )

                if min_distance < cutoff_distance:
                    nearby_residues.append({
                        'chain': target_chain.get_id(),
                        'residue_id': residueB.id[1],
                        'residue_name': residueB.resname,
                        'distance': min_distance
                    })

        data.append({
            'wildtype': residueA.resname,
            'site': residueA.id[1],
            'nearby_residues': nearby_residues,
            'custom_source': name
        })

    # Convert data to pandas DataFrame
    return pd.DataFrame(data)

# Helper: turn a list of contact dicts into "id1, id2, ..." (None when empty)
def _format_residue_ids(nearby):
    return ', '.join(str(item['residue_id']) for item in nearby) if nearby else None


### 2VSM: source chain A is compared against target chain B
pdb_path_2VSM = e2_pdb
source_chain_2VSM = 'A'
target_chains_2VSM = ['B']


### 3D12 is crystal structure of RBP head bound to EFNB3
pdb_path_3D12 = e3_pdb
source_chain_3D12 = 'A'
# A list, like 2VSM above, so a chain ID is never split into characters
target_chains_3D12 = ['B']

# Column holding the number of contacting residues, named after the cutoff
# actually used
contact_count_column = f'number_of_contact_residues_within_{cutoff_distance}'

df_2VSM_close = calculate_nearby_residues(pdb_path_2VSM, source_chain_2VSM, target_chains_2VSM, '2VSM_source', cutoff_distance)
contact_counts_2VSM = df_2VSM_close['nearby_residues'].apply(len)
df_2VSM_close[contact_count_column] = contact_counts_2VSM
# Legacy column: this name was hard-coded as "within_5" even though the counts
# use `cutoff_distance`. It is kept so existing readers of the CSV keep working.
# NOTE(review): drop once nothing reads it.
df_2VSM_close['number_of_contact_residues_within_5'] = contact_counts_2VSM
df_2VSM_close['close_residues'] = df_2VSM_close['nearby_residues'].apply(_format_residue_ids)


df_3D12_close = calculate_nearby_residues(pdb_path_3D12, source_chain_3D12, target_chains_3D12, '3D12_source', cutoff_distance)
df_3D12_close[contact_count_column] = df_3D12_close['nearby_residues'].apply(len)
df_3D12_close['close_residues'] = df_3D12_close['nearby_residues'].apply(_format_residue_ids)

## Adjust numbering for 2VSM
def adjust_residues(residue_str):
    """Renumber a ', '-separated string of residue IDs.

    Every ID below 68 is reduced by 3; IDs of 68 and above are unchanged.
    None is passed through as None. Returns the renumbered IDs joined with
    ', ' in their original order.
    """
    if residue_str is None:
        return None

    shifted = []
    for token in residue_str.split(', '):
        residue_id = int(token)
        if residue_id < 68:
            residue_id -= 3
        shifted.append(str(residue_id))

    return ', '.join(shifted)

# Renumber only the 2VSM contact list (the `residue_id` values inside the
# `nearby_residues` column keep the numbering found in the PDB file), then
# write both tables to the configured CSV paths.
df_2VSM_close['close_residues'] = df_2VSM_close['close_residues'].apply(adjust_residues)
df_2VSM_close.to_csv(ephrin_b2_close_residues,index=False)
df_3D12_close.to_csv(ephrin_b3_close_residues,index=False)

In [None]:
def calculate_min_distances(pdb_path, source_chain_id, target_chain_ids, name):
    """Find, for every source-chain residue, the closest target residue.

    Distances are atom-atom distances from Bio.PDB atom subtraction; the
    closest target residue is the one owning the nearest atom. Only the first
    model of the structure (`structure[0]`) is used.

    Parameters:
        pdb_path: path of the PDB file to parse.
        source_chain_id: ID of the chain whose residues become the rows.
        target_chain_ids: iterable of chain IDs to measure against. A
            one-character string such as 'B' also works, because iterating
            it yields that single ID.
        name: label stored in the `custom_source` column of every row.

    Returns:
        pandas.DataFrame with one row per retained source residue and the
        columns `wildtype`, `site`, `chain`, `residue`, `residue_name`,
        `distance` (minimum atom-atom distance), `residues_within_4` (number
        of target residues with any atom closer than 4) and `custom_source`.

    Raises:
        ValueError: if a source residue has no target atom to be measured
            against (for example, every target residue was filtered out).
    """
    # Initialize the PDB parser and load the structure
    parser = PDB.PDBParser(QUIET=True)
    structure = parser.get_structure('structure_id', pdb_path)

    model = structure[0]
    source_chain = model[source_chain_id]
    target_chains = [model[chain_id] for chain_id in target_chain_ids]

    # Residue names that are skipped. NAG is skipped only on the source
    # chain, exactly as in the original filters.
    source_excluded = {"HOH", "WAT", "IPA", "NAG", "SO4"}
    target_excluded = {"HOH", "WAT", "IPA", "SO4"}

    # The retained target residues do not depend on the source residue, so
    # collect them once instead of re-filtering for every source residue.
    target_residues = [
        (target_chain.get_id(), residueB)
        for target_chain in target_chains
        for residueB in target_chain
        if residueB.resname not in target_excluded
    ]

    data = []

    for residueA in source_chain:
        if residueA.resname in source_excluded:
            continue

        min_distance = float('inf')
        closest_residueB = None
        closest_chain_id = None
        residues_within_4 = 0

        for chain_id, residueB in target_residues:
            # Closest approach between this pair of residues
            pair_distance = min(
                (atomA - atomB for atomA in residueA for atomB in residueB),
                default=float('inf'),
            )

            # Strict '<' keeps the first residue that reaches the minimum
            if pair_distance < min_distance:
                min_distance = pair_distance
                closest_residueB = residueB
                closest_chain_id = chain_id

            # Count residues within 4 angstroms
            if pair_distance < 4:
                residues_within_4 += 1

        if closest_residueB is None:
            # Fail with a clear message instead of an AttributeError on None
            raise ValueError(
                f"No target atoms found in chain(s) {target_chain_ids!r} of "
                f"{pdb_path!r} to measure against residue {residueA.id[1]} "
                f"of chain {source_chain_id!r}"
            )

        data.append({
            'wildtype': residueA.resname,
            'site': residueA.id[1],
            'chain': closest_chain_id,
            'residue': closest_residueB.id[1],
            'residue_name': closest_residueB.resname,
            'distance': min_distance,
            'residues_within_4': residues_within_4,
            'custom_source': name
        })

    # Convert data to pandas DataFrame
    return pd.DataFrame(data)

# Compute the minimum-distance tables for the three structures
### 2VSM is crystal structure for RBP head bound to EFNB2
pdb_path_2VSM = e2_pdb
source_chain_2VSM = 'A'
# Chain IDs are given as lists so an ID is never split into characters
target_chains_2VSM = ['B']


### 3D12 is crystal structure of RBP head bound to EFNB3
pdb_path_3D12 = e3_pdb
source_chain_3D12 = 'A'
target_chains_3D12 = ['B']

### 7txz: distances from chain B to chain A, labelled 'dimerization'
pdb_path_7txz = rbp_pdb
source_chain_7txz = 'B'
target_chains_7txz = ['A']

# Three-letter to one-letter codes for the 20 standard amino acids; any
# other residue name is left unchanged by `.replace` below
three_to_one_letter = {
    'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',
    'CYS': 'C', 'GLU': 'E', 'GLN': 'Q', 'GLY': 'G',
    'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',
    'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',
    'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
}


df_2VSM = calculate_min_distances(pdb_path_2VSM, source_chain_2VSM, target_chains_2VSM, 'E2')
df_2VSM['E2_PDB_residue'] = df_2VSM['residue_name'].replace(three_to_one_letter)
#Have to adjust the numbering because 2VSM numbering of Ephrin-B2 is off compared to Reference
df_2VSM['residue'] = np.where(df_2VSM['residue'] < 68, df_2VSM['residue'] - 3, df_2VSM['residue'])


df_3D12 = calculate_min_distances(pdb_path_3D12, source_chain_3D12, target_chains_3D12, 'E3')
df_3D12['E3_PDB_residue'] = df_3D12['residue_name'].replace(three_to_one_letter)

df_7txz = calculate_min_distances(pdb_path_7txz, source_chain_7txz, target_chains_7txz, 'dimerization')
df_7txz['dimerization_PDB_residue'] = df_7txz['residue_name'].replace(three_to_one_letter)

df_2VSM.to_csv(ephrin_b2_distance,index=False)
df_3D12.to_csv(ephrin_b3_distance,index=False)
df_7txz.to_csv(dimerization_distance,index=False)

# Report completion only after every output file has been written, so a
# failed write is not preceded by a success message
print("All done!")

### Which residues are close to receptor?

In [None]:
def find_close_residues(df,distance_cutoff):
    """Return the unique `site` values of rows with `distance` <= `distance_cutoff`.

    The comparison is inclusive. The result is what `Series.unique()` gives:
    an array of sites in order of first appearance.
    """
    within_cutoff = df['distance'] <= distance_cutoff
    return df.loc[within_cutoff, 'site'].unique()

# Print the sites within 4, then within 5, of Ephrin-B2 (2VSM table).
# After the loop `ephrin_b2_close` holds the result for the last cutoff (5).
for distance_cutoff in (4, 5):
    ephrin_b2_close = find_close_residues(df_2VSM, distance_cutoff)
    print(list(ephrin_b2_close))

In [None]:
# Print the sites within 4, then within 5, of Ephrin-B3 (3D12 table).
# After the loop `ephrin_b3_close` holds the result for the last cutoff (5).
for distance_cutoff in (4, 5):
    ephrin_b3_close = find_close_residues(df_3D12, distance_cutoff)
    print(list(ephrin_b3_close))

In [None]:
# Print the sites within 4, then within 5, of the partner chain (7txz table).
# After the loop `dimerization` holds the result for the last cutoff (5).
for distance_cutoff in (4, 5):
    dimerization = find_close_residues(df_7txz, distance_cutoff)
    print(list(dimerization))

### Find overlapping sites for EFNB2 and EFNB3 that are within 5 angstroms

In [None]:
# Pool the Ephrin-B2 and Ephrin-B3 close sites (any iterables work here).
# NOTE(review): this builds the UNION of the two site lists, although the
# heading above speaks of "overlapping" sites -- confirm that a union, not an
# intersection, is intended.
test = [*ephrin_b2_close, *ephrin_b3_close]
unique_elements = {*test}

# De-duplicated sites in ascending order
sorted_list = list(unique_elements)
sorted_list.sort()
print(sorted_list)