In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def ATOM_pdb_line(atomid, x, y, z, b):
    """
    Build one fixed-width PDB ATOM record: 80 columns followed by a newline.

    Only the record name, the atom serial, the three coordinates and the
    B-factor are written; every other column is left blank.

    atomid should already be a string-formatted integer (at most 5 characters,
           so that it stays inside columns 7-11)
    x, y, z should already be in the format of :.3f
    b should already be in the format of :.2f

    Each field is right-justified so that its last character lands on the
    1-based column given by the matching end_* constant below.
    """
    end_atomid = 11
    end_x = 38
    end_y = 46
    end_z = 54
    end_b = 66

    out = [" "] * 80
    out.append("\n")
    # The slice has to be exactly as long as the text assigned to it: a
    # shorter slice makes the list grow and the record overflow 80 columns.
    out[0:4] = "ATOM"
    placements = (
        (end_atomid, atomid),
        (end_x, x),
        (end_y, y),
        (end_z, z),
        (end_b, b),
    )
    for end, field in placements:
        out[end - len(field): end] = field

    return ''.join(out)

def CONECT_pdb_line(starting_atomid, connected_atomids):
    """
    Build the fixed-width PDB CONECT record(s) for one atom.

    starting_atomid is the id (as a string) of the atom from which connections start
    connected_atomids is a list of ids (as strings) of the atoms connected to the starting atom

    A single CONECT record holds at most 4 connected atoms. It is allowed to
    have multiple CONECT records for the same atom, so when more than 4 ids
    are given the extra ones are written on additional records that repeat
    the starting atom. With 4 ids or fewer (including none) exactly one
    record is returned.

    Returns a string made of one or more 80-column records, each terminated
    by a newline.
    """
    end_starting_atomid = 11
    end_connected_atomids = [16, 21, 26, 31]
    per_record = len(end_connected_atomids)

    connected_atomids = list(connected_atomids)
    # Split the partners into groups that fit on one record; an empty input
    # still yields one record that carries only the starting atom.
    groups = [
        connected_atomids[k:k + per_record]
        for k in range(0, len(connected_atomids), per_record)
    ] or [[]]

    records = []
    for group in groups:
        out = [" "] * 80
        out.append("\n")
        # The slice has to be exactly as long as the text assigned to it: a
        # shorter slice makes the list grow and the record overflow 80 columns.
        out[0:6] = "CONECT"
        out[end_starting_atomid - len(starting_atomid): end_starting_atomid] = starting_atomid
        for end, idi in zip(end_connected_atomids, group):
            out[end - len(idi): end] = idi
        records.append(''.join(out))

    return ''.join(records)

def CA_coordinates(pdb_path):
    """
    Read the CA atom of every residue from the ATOM records of a PDB file.

    Returns a list with one [residue_number, x, y, z] entry (all floats) per
    residue, in file order.

    Note that in the case that i have more than one CA in the same residue, then the first CA is considered
    (consecutive CA records with the same residue number are collapsed).
    """

    # Fixed-column layout (1-based columns) matched by the expression:
    #   1-4   "ATOM"
    #   5-13  blanks / atom serial
    #   14-15 "CA"
    #   16-22 rest of the atom name, altLoc, residue name, chain id
    #   23-26 residue number            -> group 1
    #   27-30 insertion code + blanks
    #   31-38 x, 39-46 y, 47-54 z       -> groups 2, 3, 4
    # Exactly 7 characters separate "CA" from the residue number; skipping 8
    # would shift every captured field one column to the right.
    rexp = re.compile('^ATOM[ 0-9]{9}CA[ 0-9a-zA-Z]{7}([ 0-9]{4})[ 0-9a-zA-Z]{4}([-. 0-9]{8})([-. 0-9]{8})([-. 0-9]{8})')

    CA_coords = []

    with open(pdb_path, "r") as file:
        current_res = None
        # Iterate the file lazily instead of loading every line up front.
        for l in file:
            m = rexp.match(l)
            if not m:
                continue
            resid = float(m.group(1))
            if resid == current_res:
                # A CA of this residue has already been stored.
                continue
            CA_coords.append([resid, float(m.group(2)), float(m.group(3)), float(m.group(4))])
            current_res = resid

    return CA_coords

def get_connections(matrix_path, treshold = None):
    """
    Turn a space-delimited numeric matrix file into an adjacency list.

    matrix_path is the path of a text file readable by np.loadtxt with " " as delimiter
    treshold is the minimum absolute value an entry must reach to count as a
             connection; when None, 0.5 is used

    Returns a list with one entry per matrix row; entry i is the list of
    column indices j (0-based Python ints, ascending) such that
    abs(matrix[i, j]) >= treshold. Diagonal entries are not treated specially.
    """
    if treshold is None:
        treshold = 0.5

    # ndmin=2 keeps a one-row (or single-value) file two-dimensional, so the
    # row/column handling below is valid for every input size.
    matrix = np.loadtxt(matrix_path, delimiter = " ", ndmin = 2)
    nresids = np.shape(matrix)[0]

    # The number of rows defines the number of residues, so only the leading
    # nresids columns are scanned.
    strong = np.abs(matrix[:, :nresids]) >= treshold

    return [np.flatnonzero(row).tolist() for row in strong]

In [3]:
# Root of the data tree. Every path below is derived from it, so pointing the
# notebook at another checkout only requires changing this one line.
DATA_DIR = "/home/giacomo/comp_bio/cbp_final/data"
GRAPHS_DIR = f"{DATA_DIR}/03_analyzed/graphs"

# Inputs. Only matrix_w_path and path_pdb are read by the cells shown below.
path = f"{GRAPHS_DIR}/ccs.pdb"
matrix_path = f"{GRAPHS_DIR}/macro_IIN_unweighted.dat"
matrix_w_path = f"{GRAPHS_DIR}/macro_IIN_weighted.dat"
path_pdb = f"{DATA_DIR}/00_external/pdb_3EIG_prot.pdb"

# Output file written by the last cell.
# NOTE(review): the name ends with a bare "." -- presumably an extension such
# as "pdb" is missing; confirm before relying on the file name.
path_graph_pdb = f"{GRAPHS_DIR}/manual_graph_giacomo."

In [4]:
# Get the CA coordinates of each residue: one [residue_number, x, y, z] entry per residue
CA_coords = CA_coordinates(path_pdb)

# Using the weighted matrix

In [5]:
# Adjacency list from the weighted matrix: keep only entries whose absolute
# value reaches 0.8 (instead of the function's fallback of 0.5).
conn_w = get_connections(matrix_w_path, treshold=0.8)

In [6]:
# Inspect the adjacency list: entry i lists the 0-based column indices j
# whose absolute matrix value reached the threshold.
print(conn_w)

[[], [], [], [96, 111, 113], [], [132], [33, 37], [8, 132], [7, 137], [13], [], [], [], [9], [], [], [], [], [], [], [], [], [], [141], [], [], [137, 176], [], [], [], [], [], [164], [6, 114], [], [], [133], [6, 112], [], [], [], [], [], [], [], [], [], [], [70, 72], [66], [74, 92, 111], [56], [], [78], [], [], [51, 64], [], [], [60], [59], [], [], [], [56], [71], [49], [], [], [], [48], [65], [48], [], [50, 119], [78], [], [], [53, 75, 88], [], [], [], [], [], [], [], [], [104], [78], [], [], [], [50, 113], [], [], [], [3], [], [], [108], [], [], [], [], [87], [], [], [], [99], [], [], [3, 50], [37], [3, 92, 120], [33], [], [], [], [], [74], [113], [], [123], [122], [132], [], [], [], [], [], [], [161], [5, 7, 124, 134], [36], [132], [137], [181], [8, 26, 135, 141, 174], [177], [], [], [23, 137, 174], [], [], [], [], [], [], [], [], [], [], [], [], [], [183], [], [181], [161], [], [], [131, 158], [], [], [32, 180], [169], [], [], [], [165], [], [], [], [], [137, 141], [], [26], [138],

In [7]:
with open(path_graph_pdb, "w") as file:
    # First one ATOM record per residue, placed at its CA position, with the
    # B-factor column holding the residue's number of connections.
    # zip() pairs coordinates and connections by position and stops at the
    # shorter of the two lists.
    for coords, con in zip(CA_coords, conn_w):
        line = ATOM_pdb_line(
            f"{int(coords[0]):d}",
            f"{coords[1]:.3f}",
            f"{coords[2]:.3f}",
            f"{coords[3]:.3f}",
            f"{len(con):.2f}",
        )
        file.write(line)

    # Then the connection records. Matrix indices are 0-based, atom ids are
    # 1-based, so BOTH ends of every connection are shifted by one.
    # NOTE(review): this assumes the residue numbers written in the ATOM
    # records run 1..N in matrix-row order -- confirm against the input PDB.
    for i, con in enumerate(conn_w):
        tocon = [f"{j + 1}" for j in con]
        # A CONECT record holds at most 4 partners; longer lists are spread
        # over several records for the same atom. A residue with no partners
        # still yields one record carrying only its own id.
        groups = [tocon[k:k + 4] for k in range(0, len(tocon), 4)] or [[]]
        for group in groups:
            file.write(CONECT_pdb_line(f"{i + 1}", group))

    # In the end the END record, newline-terminated like every other record.
    file.write("END\n")