In [62]:
import pandas as pd
from biopandas.pdb import PandasPdb
from prody import parsePDBHeader
from typing import Optional

import numpy as np
from PNDS_PNS_alt import pns_loop
from PNDS_geometry_alt import RESHify_1D, unRESHify_1D
from PNDS_io import export_csv
from plot_functions import scatter_plots
from matplotlib import pyplot as plt

In [4]:
def read_pdb_to_dataframe(
    pdb_path: Optional[str] = None,
    model_index: int = 1,
    parse_header: bool = True,
    ) -> pd.DataFrame:
    """
    Read a PDB file, and return a Pandas DataFrame containing the atomic coordinates and metadata.

    Args:
        pdb_path (str, optional): Path to a local PDB file to read. Defaults to None.
        model_index (int, optional): Index of the model to extract from the PDB file, in case
            it contains multiple models. Defaults to 1.
        parse_header (bool, optional): Whether to parse the PDB header and extract metadata.
            Defaults to True.

    Returns:
        pd.DataFrame: A DataFrame containing the atomic coordinates and metadata, with one row
            per atom
    """
    atomic_df = PandasPdb().read_pdb(pdb_path)
    if parse_header:
        header = parsePDBHeader(pdb_path)
    else:
        header = None
    atomic_df = atomic_df.get_model(model_index)
    if len(atomic_df.df["ATOM"]) == 0:
        raise ValueError(f"No model found for index: {model_index}")

    return pd.concat([atomic_df.df["ATOM"], atomic_df.df["HETATM"]]), header

In [6]:
df, df_header = read_pdb_to_dataframe('/Users/kaisardauletbek/Documents/rnaprecis/clean_mintage_code/rna2020_pruned_pdbs/2jLv_C_FH_pruned.pdb')

In [187]:
df.loc[df['atom_name']=='P', ['x_coord', 'y_coord', 'z_coord']]

Unnamed: 0,x_coord,y_coord,z_coord
0,17.417,10.088,25.249
34,17.739,4.364,25.408
67,15.946,-1.16,23.544


In [17]:
import plotly.express as px

fig = px.scatter_3d(df, x='x_coord', y='y_coord', z='z_coord', color='element_symbol')
fig.update_traces(marker_size = 4)

fig.show()

In [185]:
from graphein.protein.graphs import label_node_id

def process_dataframe(df: pd.DataFrame, granularity='CA') -> pd.DataFrame:
    """
    Process a DataFrame of protein structure data to reduce ambiguity and simplify analysis.

    This function performs the following steps:
    1. Handles alternate locations for an atom, defaulting to keep the first one if multiple exist.
    2. Assigns a unique node_id to each residue in the DataFrame, using a helper function label_node_id.
    3. Filters the DataFrame based on specified granularity (defaults to 'CA' for alpha carbon).

    Parameters
    ----------
    df : pd.DataFrame
        The DataFrame containing protein structure data to process. It is expected to contain columns 'alt_loc' and 'atom_name'.
        
    granularity : str, optional
        The level of detail or perspective at which the DataFrame should be analyzed. Defaults to 'CA' (alpha carbon).
    """
    # handle the case of alternative locations,
    # if so default to the 1st one = A
    if 'alt_loc' in df.columns:
      df['alt_loc'] = df['alt_loc'].replace('', 'A')
      df = df.loc[(df['alt_loc']=='A')]
    df = label_node_id(df, granularity)
    df = df.loc[(df['atom_name']==granularity)]
    return df

In [191]:
label_node_id(df, granularity='CA')

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx,model_id,node_id,residue_id
0,ATOM,32,,P,A,G,,C,2,,...,1.0,23.09,,,P,,4,1,C:G:2:A,C:G:2:A
1,ATOM,33,,OP1,A,G,,C,2,,...,1.0,23.37,,,O,,5,1,C:G:2:A,C:G:2:A
2,ATOM,34,,OP2,A,G,,C,2,,...,1.0,21.92,,,O,,6,1,C:G:2:A,C:G:2:A
3,ATOM,35,,O5',A,G,,C,2,,...,1.0,22.52,,,O,,7,1,C:G:2:A,C:G:2:A
4,ATOM,36,,C5',A,G,,C,2,,...,1.0,21.48,,,C,,8,1,C:G:2:A,C:G:2:A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,ATOM,125,,H1',A,C,,C,4,,...,1.0,21.33,,,H,,97,1,C:C:4:A,C:C:4:A
94,ATOM,126,,H41,A,C,,C,4,,...,1.0,23.31,,,H,,98,1,C:C:4:A,C:C:4:A
95,ATOM,127,,H42,A,C,,C,4,,...,1.0,23.31,,,H,,99,1,C:C:4:A,C:C:4:A
96,ATOM,128,,H5,A,C,,C,4,,...,1.0,21.63,,,H,,100,1,C:C:4:A,C:C:4:A


In [192]:
df['atom_name'].unique()

array(['P', 'OP1', 'OP2', "O5'", "C5'", "C4'", "O4'", "C3'", "O3'", "C2'",
       "O2'", "C1'", 'N9', 'C8', 'N7', 'C5', 'C6', 'O6', 'N1', 'C2', 'N2',
       'N3', 'C4', "H5'", "H5''", "H4'", "H3'", "H2'", "HO2'", "H1'",
       'H8', 'H1', 'H21', 'H22', 'N6', 'H61', 'H62', 'H2', 'O2', 'N4',
       'H41', 'H42', 'H5', 'H6'], dtype=object)

In [186]:
process_df = process_dataframe(df)


In [190]:
df['alt_loc'].unique()

array(['A'], dtype=object)

In [181]:
process_df

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx,model_id,node_id,residue_id


In [48]:
def scatter_plots(input_data, filename, axis_min=None, axis_max=None, set_title=None, number_of_elements=None):
    if input_data.ndim == 1:
        # If input is 1D, reshape it to 2D
        input_data = input_data.reshape(1, -1)
    
    n = input_data.shape[1]
    if n < 2:
        print(f"Warning: Cannot create scatter plot for data with {n} dimensions.")
        return

    if number_of_elements is None:
        number_of_elements = input_data.shape[0]

    fig = plt.figure(figsize=(20, 20))
    fig.suptitle(set_title, fontsize=16)

In [168]:
np.random.seed(42)
data = np.random.rand(100, 3) * 360  # 100 points, 7 dimensions

# Prepare data for PNS
sphere_points, means, half = RESHify_1D(data, False)

# Run PNS
spheres, projected_points, distances = pns_loop(sphere_points, 10, 10, degenerate=False, verbose=True, half=half) #, mode='torus'


Dimension: 4 Mode: None
Starting fit... 
done! Exit code: 1 
 [ 0.12007205  0.12141639 -0.94029535 -0.29442386]
Starting fit... 
done! Exit code: 1 
 [ 0.12007203  0.12141628 -0.94029485 -0.29442553]
Starting fit... 
done! Exit code: 1 
 [ 0.12007538  0.12140699 -0.94022458 -0.29465232]
Starting fit... 
done! Exit code: 1 
 [-0.12007178 -0.12141702  0.94030024  0.29440811]
Starting fit... 
done! Exit code: 1 
 [ 0.12007163  0.12141731 -0.94030287 -0.29439966]
Starting fit... 
done! Exit code: 1 
 [ 0.12007584  0.12140573 -0.94021498 -0.29468329]
Dimension: 3 Mode: None
Starting fit... 
done! Exit code: 1 
 [ 0.08462291 -0.89745404  0.43291478]
Starting fit... 
done! Exit code: 1 
 [ 0.08461962 -0.89745452  0.43291443]
Starting fit... 
done! Exit code: 1 
 [ 0.08460746 -0.8974563   0.43291312]
Starting fit... 
done! Exit code: 1 
 [-0.08437735  0.89749    -0.43288817]
Starting fit... 
done! Exit code: 1 
 [ 0.08439689 -0.89748714  0.43289028]


In [172]:
means

array([[  1.        , 320.06989004,  15.33592451],
       [  2.        , 341.87808848,  16.23856915],
       [  0.        ,  90.74519845,  16.60523592]])

In [169]:
# Print some results
print("Original data shape:", data.shape)
print("Projected points shapes:")
for i, points in enumerate(projected_points):
    print(f"  Level {i}: {points.shape}")


Original data shape: (100, 3)
Projected points shapes:
  Level 0: (100, 3)
  Level 1: (100, 2)
  Level 2: (2,)


In [175]:
from math import sqrt, radians, degrees
PI = np.pi
RAD = radians(1)
DEG = degrees(1)
EPS = 1e-10

def angle_shift (angles):
    shift = np.zeros(len(angles))
    for line in angles:
        shift[int(line[0])] = line[1]
    #print((shift+ 180e6).astype(int))
    return shift

def unRESHify_1D (data, angles, half):
    tmp = data.copy()
    angle_tmp = np.zeros(data.shape)
    n = data.shape[1]-1
    for i in range(n):
        for j in range(i):
            tmp[:,i] /= np.sin(angle_tmp[:,j]).clip(EPS, 1)
        angle_tmp[:,i] = np.arccos(tmp[:,i].clip(-1,1))
    for j in range(n-1):
        tmp[:,-1] /= np.sin(angle_tmp[:,j]).clip(EPS, 1)
    angle_tmp[:,-2] = (2 * PI + np.arctan2(tmp[:,-1], tmp[:,-2])) % (2 * PI)
    angle_tmp = angle_tmp[:,:-1] * DEG + 270
    angle_tmp[:,:-1] *= (2 if half else 1)
    angle_data = np.zeros(angle_tmp.shape)
    for i in range(n):
        angle_data[:,int(angles[2][0])] = angle_tmp[:,i]
    angle_data = (angle_data + angle_shift(angles)) % 360
    return angle_data

In [176]:
unfolded_data = unRESHify_1D(projected_points[-2], means, half)
print("Unfolded data shape:", unfolded_data.shape)

Unfolded data shape: (100, 3)


In [178]:
# Plot the results
scatter_plots(data, filename='original_data', axis_min=0, axis_max=360, 
            set_title='Original Data', number_of_elements=None)
scatter_plots(unfolded_data, filename='unfolded_data', axis_min=0, axis_max=360, 
            set_title='Unfolded Data', number_of_elements=None)

In [166]:
# Plot the final projection
final_projection = projected_points[-1]
if final_projection.ndim == 1:
    final_projection = final_projection.reshape(1, -1)
scatter_plots(final_projection, filename='final_projection', 
            set_title='Final Projection', number_of_elements=None)

In [167]:
final_projection

array([[-0.10136114,  0.9948497 ]])