# Imports

In [1]:
from processing.processor import *
from analysis.analysis import *
import numpy as np
from scipy.spatial.distance import cdist, pdist
import seaborn as sns

# Load Data

In [2]:
p = CifProcessor()
p.read_pkl_metainfo()
p.read_pkl(mode='rg', folder='data/processed/')

  2%|██▍                                                                                                               | 12/557 [00:00<00:04, 114.80it/s]

Reading files with generic numbers on receptors and gproteins.


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 557/557 [00:06<00:00, 91.98it/s]


# Filter Data (create different filters)

In [3]:
f_gio = p.make_filter(State='Active', Cl='Rhodopsin', Family='Gi/o', gprotein=True)
fuf = f_gio[f_gio['PDB']=='6FUF']
f_gio = f_gio[f_gio['Method']!='X-ray']
# Combine with 6FUF
f_gio = f_gio.append(fuf).reset_index(drop=True)
print("Number of samples in complex with G_i/o:", len(f_gio))

Number of samples in complex with G_i/o: 35


In [4]:
f_gs = p.make_filter(State='Active', Cl='Rhodopsin', Family='Gs', gprotein=True)
print("Number of samples in complex with G_s:", len(f_gs))

Number of samples in complex with G_s: 33


In [5]:
f_q11 = p.make_filter(State='Active', Cl='Rhodopsin', Family='Gq/11', gprotein=True)
print("Number of samples in complex with G_qq/11:", len(f_q11))

Number of samples in complex with G_qq/11: 4


In [6]:
f_tot = pd.concat([f_gio, f_gs, f_q11]).reset_index(drop=True)

In [7]:
f_tot

Unnamed: 0,uniprot(gene),receptor family,Cl.,Species,Method,PDB,Resolution,Preferred Chain,State,Degree active %,Family,Subtype,Function
0,5HT1B,5-Hydroxytryptamine,A(Rhodopsin),Human,cryo-EM,6G79,3.8,S,Active,100,Gi/o,αo,Agonist
1,DRD2,Dopamine,A(Rhodopsin),Human,cryo-EM,7JVR,2.8,R,Active,100,Gi/o,αi1,Agonist
2,ADA2B,Adrenoceptors,A(Rhodopsin),Human,cryo-EM,6K42,4.1,R,Active,100,Gi/o,αi1,Agonist
3,ADA2B,Adrenoceptors,A(Rhodopsin),Human,cryo-EM,6K41,2.9,R,Active,100,Gi/o,αo,Agonist
4,NTR1,Neurotensin,A(Rhodopsin),Human,cryo-EM,6OS9,3.0,R,Active,100,Gi/o,αi1,Agonist
...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,CCKAR,Cholecystokinin,A(Rhodopsin),Human,cryo-EM,7MBX,2.0,R,Active,100,Gs,αs,Agonist
68,ACM1,Acetylcholine(muscarinic),A(Rhodopsin),Human,cryo-EM,6OIJ,3.3,R,Active,100,Gq/11,α11,Agonist
69,5HT2A,5-Hydroxytryptamine,A(Rhodopsin),Human,cryo-EM,6WHA,3.4,A,Active,100,Gq/11,αq,Agonist
70,HRH1,Histamine,A(Rhodopsin),Human,cryo-EM,7DFL,3.3,R,Active,100,Gq/11,αq,Agonist


In [8]:
p.apply_filter(f_tot)

In [9]:
filtered_indices_q11 = [x for x in p.get_dfl_indices(list(f_q11['PDB'])) if x != None]
filtered_indices_gs = [x for x in p.get_dfl_indices(list(f_gs['PDB'])) if x != None]
filtered_indices_gio = [x for x in p.get_dfl_indices(list(f_gio['PDB'])) if x != None]

In [10]:
print("Total number of valid samples:", len(filtered_indices_gio)+len(filtered_indices_gs)+len(filtered_indices_q11))

Total number of valid samples: 72


In [11]:
l = [filtered_indices_gio, filtered_indices_gs, filtered_indices_q11]

# StdAnalysis (class)

In [12]:
class StdAnalysis:
    def __init__(self,
                 P: CifProcessor
                 ):
        self.P = P
        self.queries = pd.DataFrame(columns=['mode', 'query_tag', 'pdb_id', 'poi', 'dist_poi'])
        self.dist_df_dict = {}
        self.angles_df_dict = {}
        self.helical_angles_mean = None
        self.helical_angles_std = None
        self.gs_count_df = None
        
    
    def run_dist_analysis(self, l, query_tag='', poi=('G.H5.25', 7.51), start=None, end=None, eps=0.05):
        if query_tag in list(self.queries['query_tag'].unique()):
            query_tag += str()
        list_poi_list, list_dists_df_list = get_interaction_tables(p=self.P, l=l, section='H5', poi=poi, start=start, 
                                                                   end=end, eps=eps)
        starting_point = 0
        for i, poi_list in enumerate(list_poi_list):
            data = ['dist', query_tag, poi_list[i][0], poi, poi_list[0][2]]
            df = pd.DataFrame(data).T
            df = df.set_axis(['mode', 'query_tag', 'pdb_id', 'poi', 'dist_poi'], axis=1)
            self.queries = self.queries.append(df, ignore_index=True)
        
        flat_lddl = []
        for l in list_dists_df_list:
            for ll in l:
                flat_lddl.append(ll)
                
        for j, ldd in enumerate(flat_lddl):
            pdb_id = ldd[0]
            table = ldd[2]
            self.dist_df_dict.update({pdb_id: table})
        
        
    def run_helical_analysis(self):
        section_helices = calculate_section_helices(self.P.dfl)
        rois = list(section_helices.columns)
        list_helical_angles = []
        for idx in range(len(self.P.dfl)):
            pdb_id = self.P.dfl[idx].iloc[0]['PDB']
            helical_angles = calc_angles_between_helices(section_helices, idx)            
            self.angles_df_dict.update({pdb_id: helical_angles})
            list_helical_angles.append(helical_angles)
        std_df = pd.DataFrame(columns=rois)
        mean_df = pd.DataFrame(columns=rois)
        for r1, roi1 in enumerate(rois):
            for r2, roi2 in enumerate(rois):
                l_r1r2 = []
                for df in list_helical_angles:
                    cell = df.loc[roi1, roi2]
                    if cell > 0:
                        l_r1r2.append(cell)
                if len(l_r1r2) > 0:
                    mean_r1r2 = np.mean(l_r1r2)
                    std_r1r2 = np.std(l_r1r2)
                    std_df.loc[roi1, roi2] = std_r1r2
                    mean_df.loc[roi1, roi2] = mean_r1r2
        self.helical_angles_mean = mean_df
        self.helical_angles_std = std_df
        self.gs_count_df = count_g_positions(p.dfl)
        
        
    def query(self, mode, query_tag, poi, pdb_id=None, helices=None):
        if mode == 'dist':
            target = self.queries[(self.queries['mode'] == 'dist') &
                                  (self.queries['poi'] == poi) &
                                  (self.queries['query_tag'] == query_tag)]
            return target
        elif mode == 'helices':
            if helices == None:
                return self.helical_angles_mean, self.helical_angles_std, self.gs_count_df
            elif pdb_id!=None:
                return self.angles_df_dict[pdb_id].loc[helices[0], helices[1]]
        else:
            print("Valid query modes are 'dist' and 'helices'!")

In [13]:
A = StdAnalysis(P=p)

# Run Distance Analysis

In [14]:
A.run_dist_analysis(l=[filtered_indices_gs], query_tag='gs', poi=('G.H5.19', 7.58), eps=0.05)

# Run Helical Analysis

In [15]:
A.run_helical_analysis()

In [22]:
A.dist_df_dict.keys()

dict_keys(['7JVP', '7JV5', '7CX4', '7CX2', '7CX3', '6LI3', '7CFM', '7CFN', '7KH0', '3SN6', '7BW0', '7CKX', '7CKZ', '7CKW', '7CRH', '7CKY', '7JJO', '7BZ2', '7DHI', '7DHR', '7D7M', '7MBX'])

In [23]:
A.P.dfl_list.index('7MBY')

71

In [24]:
df = A.P.dfl[71]

In [25]:
df

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,...,identifier,label_comp_sid,label_2_uni,gen_pos,gen_pos1,gen_pos2,uniprot_comp_sid,gprot_pos,uniprot_comp_id,fam_comp_id
0,7MBY,ATOM,B,A,2,2,SER,1,N,N,...,CCKAR_HUMAN,S,0,,0.0,0.0,,,,
1,7MBY,ATOM,B,A,2,2,SER,2,CA,C,...,CCKAR_HUMAN,S,0,,0.0,0.0,,,,
2,7MBY,ATOM,B,A,2,2,SER,3,C,C,...,CCKAR_HUMAN,S,0,,0.0,0.0,,,,
3,7MBY,ATOM,B,A,2,2,SER,4,O,O,...,CCKAR_HUMAN,S,0,,0.0,0.0,,,,
4,7MBY,ATOM,B,A,2,2,SER,5,CB,C,...,CCKAR_HUMAN,S,0,,0.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6824,7MBY,ATOM,A,E,253,394,VAL,6842,O,O,...,CCKAR_HUMAN,V,0,,0.0,0.0,,,,
6825,7MBY,ATOM,A,E,253,394,VAL,6843,CB,C,...,CCKAR_HUMAN,V,0,,0.0,0.0,,,,
6826,7MBY,ATOM,A,E,253,394,VAL,6844,CG1,C,...,CCKAR_HUMAN,V,0,,0.0,0.0,,,,
6827,7MBY,ATOM,A,E,253,394,VAL,6845,CG2,C,...,CCKAR_HUMAN,V,0,,0.0,0.0,,,,


In [26]:
df = df[(df['auth_seq_id'].astype(int)>385) &
        (df['auth_seq_id'].astype(int)<395) &
        (df['label_atom_id'] == 'CA')]

In [27]:
df[['PDB', 'label_seq_id', 'auth_seq_id', 'label_comp_id', 'gprot_pos', 'label_comp_sid', 'uniprot_comp_id']]

Unnamed: 0,PDB,label_seq_id,auth_seq_id,label_comp_id,gprot_pos,label_comp_sid,uniprot_comp_id
5213,7MBY,385,386,PRO,,P,
6750,7MBY,245,386,MET,G.H5.18,M,M
6758,7MBY,246,387,ASN,,N,
6766,7MBY,247,388,LEU,G.H5.20,L,L
6774,7MBY,248,389,ARG,,R,
6785,7MBY,249,390,GLU,G.H5.22,E,Q
6794,7MBY,250,391,TYR,G.H5.23,Y,Y
6806,7MBY,251,392,ASN,G.H5.24,N,E
6814,7MBY,252,393,LEU,G.H5.25,L,L
6822,7MBY,253,394,VAL,G.H5.26,V,L


# Query reports

In [28]:
A.query(mode='dist', query_tag='', poi=('G.H5.19', 7.5), helices=None)

Unnamed: 0,mode,query_tag,pdb_id,poi,dist_poi


In [29]:
A.query(mode='helices', query_tag='', poi=(), pdb_id='3SN6', helices=['TM3', 'TM7'])

28.967516178066475