In [1]:
import pandas as pd
import lxml

In [2]:
# data = pd.read_html('data/couplings/families_coupling.xls')[0].drop('Unnamed: 0_level_0', axis=1)

In [3]:
# data

In [4]:
# data.columns

In [5]:
class AffinityProcessor:
    """Load a G-protein coupling table and narrow it to one group / label type.

    The table is read with ``pd.read_html`` from ``path + setting + '_coupling.xls'``
    and has multi-level column headers. ``set_group`` keeps the rows of one
    source group, ``set_label_type`` keeps the columns of one label type; both
    replace ``self.data`` with the narrowed table.

    The unfiltered table is kept privately, and ``self.data`` is rebuilt from it
    on every selection. The setters can therefore be called repeatedly and in
    any order (e.g. when a notebook cell is re-run with another label type)
    without having to re-read the file.

    Attributes:
        source: path of the file that was read.
        setting: ``'subtypes'`` or ``'families'``; decides the header layout.
        groups: valid values for ``set_group``.
        label_types: valid values for ``set_label_type``.
        data: the currently selected view of the coupling table.
    """

    def __init__(self,
                 path='data/couplings/',
                 setting='subtypes'):
        """Read the coupling table for ``setting`` from the folder ``path``.

        Args:
            path: folder prefix; it is concatenated as-is, so it must end with
                a path separator.
            setting: table variant, used both for the file name and for the
                header layout.
        """
        self.source = path + setting + '_coupling.xls'
        self.setting = setting
        self.groups = ['GPCRdb', 'Inoue', 'Bouvier']
        self.label_types = ['Guide to Pharmacology', 'Log(Emax/EC50)', 'pEC50', 'Emax']
        print("Reading data from {}!".format(self.source))
        self.read_data()
        print("Initialized Affinity Processor!")
        print("Please set a group --------------  {}.".format(self.groups))
        print("please set label type -----------  {}.".format(self.label_types))

    def read_data(self):
        """(Re)load the table from ``self.source`` and clear any selection."""
        table = pd.read_html(self.source)[0]
        # Dropping by level works for both the 2-level ('families') and the
        # 3-level ('subtypes') header and avoids pandas' PerformanceWarning
        # about dropping on a MultiIndex without a level.
        self._raw_data = table.drop('Unnamed: 0_level_0', axis=1, level=0)
        self._group = None
        self._label_type = None
        self.data = self._raw_data

    def set_group(self, group='GPCRdb'):
        """Keep only the rows whose source group equals ``group``.

        Raises:
            AssertionError: if ``group`` is not one of ``self.groups``.
        """
        if group not in self.groups:
            # AssertionError is kept for backward compatibility, but it now
            # carries the message (``assert ..., print(...)`` attached None).
            raise AssertionError(
                "'{}' is not a valid group name, valid are {}.".format(group, self.groups))
        print("\nSelected data of group '{}'.\n".format(group))
        self._group = group
        self._refresh()

    def set_label_type(self, label_type='Guide to Pharmacology'):
        """Keep only the label columns of ``label_type`` (other columns stay).

        Raises:
            AssertionError: if ``label_type`` is not one of ``self.label_types``.
        """
        if label_type not in self.label_types:
            raise AssertionError(
                "'{}' is not a valid label type, valid are {}.".format(label_type, self.label_types))
        print("\nSelected label type '{}'.\n".format(label_type))
        self._label_type = label_type
        self._refresh()

    def make_label_dict(self):
        """Placeholder: currently always returns an empty dict."""
        label_dict = {}
        return label_dict

    def _group_column(self):
        """Column key of the source-group column for the current header layout."""
        if self.setting == 'subtypes':
            return ('Source', 'Group', 'Unnamed: 1_level_2')
        return ('Source', 'Group')

    def _refresh(self):
        """Rebuild ``self.data`` from the unfiltered table and the stored selection."""
        view = self._raw_data
        if self._group is not None:
            view = view[view[self._group_column()] == self._group]
        if self._label_type is not None:
            unwanted = [x for x in self.label_types if x != self._label_type]
            view = view.drop(unwanted, axis=1, level=0)
        self.data = view

In [6]:
# Load the family-level coupling table (file name is built from setting='families').
A = AffinityProcessor(setting='families')

Reading data from data/couplings/families_coupling.xls!
Initialized Affinity Processor!
Please set a group --------------  ['GPCRdb', 'Inoue', 'Bouvier'].
please set label type -----------  ['Guide to Pharmacology', 'Log(Emax/EC50)', 'pEC50', 'Emax'].


  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [7]:
# Keep only the 'Log(Emax/EC50)' label columns, then only the rows of the
# default group ('GPCRdb'). Both calls replace A.data with the narrowed table.
label_type = 'Log(Emax/EC50)'
A.set_label_type(label_type)
A.set_group()


Selected label type 'Log(Emax/EC50)'.


Selected data of group 'GPCRdb'.



In [8]:
# Inspect the coupling table after the label-type and group selection.
A.data

Unnamed: 0_level_0,Source,Source,Receptor,Receptor,Receptor,Receptor,Log(Emax/EC50),Log(Emax/EC50),Log(Emax/EC50),Log(Emax/EC50)
Unnamed: 0_level_1,Group,Biosensor,Cl,Receptor fam.,Uniprot,IUPHAR,Gs,Gi/o,Gq/11,G12/13
0,GPCRdb,Max,A,5-Hydroxytryptamine,5HT1A,5-HT1A,-,8.4,6.7,-
3,GPCRdb,Max,A,5-Hydroxytryptamine,5HT1B,5-HT1B,-,8.9,6.7,-
6,GPCRdb,Max,A,5-Hydroxytryptamine,5HT1D,5-HT1D,-,8.7,-,-
9,GPCRdb,Max,A,5-Hydroxytryptamine,5HT1E,5-ht1E,-,7.0,-,-
11,GPCRdb,Max,A,5-Hydroxytryptamine,5HT1F,5-HT1F,-,6.8,-,-
...,...,...,...,...,...,...,...,...,...,...
407,GPCRdb,Max,C,Metabotropic glutamate,GRM2,mGlu2,-,5.3,-,-
409,GPCRdb,Max,C,Metabotropic glutamate,GRM4,mGlu4,-,4.4,-,-
411,GPCRdb,Max,C,Metabotropic glutamate,GRM5,mGlu5,2',5.4,5.8,-
413,GPCRdb,Max,C,Metabotropic glutamate,GRM6,mGlu6,-,4.7,-,-


In [9]:
# Read the subtype-level table directly for inspection. Dropping the index
# column by level avoids pandas' PerformanceWarning for MultiIndex columns.
data1 = pd.read_html('data/couplings/subtypes_coupling.xls')[0].drop('Unnamed: 0_level_0', axis=1, level=0)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [10]:
# Rows of the subtype table whose source group is 'GPCRdb'; the group column
# sits under a three-level header in this layout.
data1[data1['Source', 'Group', 'Unnamed: 1_level_2']=='GPCRdb']

Unnamed: 0_level_0,Source,Source,Receptor,Receptor,Receptor,Receptor,Guide to Pharmacology,Guide to Pharmacology,Guide to Pharmacology,Guide to Pharmacology,...,Emax,Emax,Emax,Emax,Emax,Emax,Emax,Emax,Emax,Emax
Unnamed: 0_level_1,Group,Biosensor,Cl,Receptor fam.,Uniprot,IUPHAR,Gs,Gi/o,Gq/11,G12/13,...,Gi/o,Gi/o,Gi/o,Gi/o,Gq/11,Gq/11,Gq/11,Gq/11,G12/13,G12/13
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,...,GNAI2,GNAI3,GNAZ,GNAO,GNAQ,GNA11,GNA14,GNA15,GNA12,GNA13
0,GPCRdb,Mean,A,5-Hydroxytryptamine,5HT1A,5-HT1A,-,1',-,-,...,50,39,44,52,-,-,27,52,-,-
3,GPCRdb,Mean,A,5-Hydroxytryptamine,5HT1B,5-HT1B,-,1',-,-,...,23,26,30,20,-,-,-,-,-,-
6,GPCRdb,Mean,A,5-Hydroxytryptamine,5HT1D,5-HT1D,-,1',-,-,...,20,17,33,26,-,-,-,-,-,-
9,GPCRdb,Mean,A,5-Hydroxytryptamine,5HT1E,5-ht1E,-,1',-,-,...,16,25,15,13,-,-,-,-,-,-
11,GPCRdb,Mean,A,5-Hydroxytryptamine,5HT1F,5-HT1F,-,1',-,-,...,16,28,1',2,-,-,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,GPCRdb,Mean,C,Metabotropic glutamate,GRM2,mGlu2,-,1',-,-,...,21,1',9,21,-,-,-,-,-,-
409,GPCRdb,Mean,C,Metabotropic glutamate,GRM4,mGlu4,-,1',-,-,...,1',1',1',14,-,-,-,-,-,-
411,GPCRdb,Mean,C,Metabotropic glutamate,GRM5,mGlu5,2',2',1',-,...,33,2',28,40,19,29,25,45,-,-
413,GPCRdb,Mean,C,Metabotropic glutamate,GRM6,mGlu6,-,1',-,-,...,31,1',25,24,-,-,-,-,-,-


In [None]:
from bs4 import BeautifulSoup

# Disabled scratch snippet kept as a bare string literal, so it never runs.
# NOTE(review): `path` is not defined in the visible cells; define it before re-enabling.
"""
with open(path) as f:
    text = f.read()
BeautifulSoup(text, 'lxml')"""

In [None]:

def get_families(path='data/families.txt'):
    """Flatten an indented family hierarchy file into one row per leaf entry.

    Nesting depth is detected by runs of spaces found anywhere in a line
    (4, 8, 12 spaces). Lines are handled as follows:

    * no 4-space run: top level; the name is the text before the first ``|``
      with its last character removed;
    * 4-space run only: second level; the name is the line without its first
      four characters;
    * 8-space run only: third level; the name is the line with every 8-space
      run removed;
    * 12-space run: leaf; the value is the 9th comma-separated field, or the
      whole list of fields when the line has fewer than 9.

    Each level has a running counter that restarts whenever a shallower level
    advances.

    Args:
        path: text file to parse.

    Returns:
        DataFrame with columns ``f1, v1, f2, v2, f3, v3, f4, v4`` (``f*`` =
        name/value of a level, ``v*`` = that level's counter), one row per
        leaf line. The frame is empty when the file has no leaf lines. Names
        of levels not seen before the first leaf are ``None``.
    """
    cols = ['f1', 'v1', 'f2', 'v2', 'f3', 'v3', 'f4', 'v4']
    # Collect plain records and build the frame once: DataFrame.append in a
    # loop is quadratic and no longer exists in pandas 2.x.
    records = []
    # Initialised so a file starting with an indented line cannot hit an unbound local.
    v0 = v1 = v2 = None
    v0_ = v1_ = v2_ = v3_ = 0
    with open(path) as f:
        for row in f:
            if '    ' not in row:
                v0 = row.split('|')[0][:-1]
                v0_ += 1
                v1_ = 0
                v2_ = 0
                v3_ = 0
            elif '        ' not in row:
                v1 = row.replace("\n", "")[4:]
                v1_ += 1
                v2_ = 0
                v3_ = 0
            elif '            ' not in row:
                v2 = row.replace("        ", "").replace("\n", "")
                v2_ += 1
                v3_ = 0
            else:
                fields = row.split(',')
                # Short leaf lines keep the full field list (this was the
                # fallback of the former bare ``except``).
                v3 = fields[8] if len(fields) > 8 else fields
                v3_ += 1
                records.append(dict(zip(cols, [v0, v0_, v1, v1_, v2, v2_, v3, v3_])))
    return pd.DataFrame(records, columns=cols)
            

In [None]:
# Build the family table in this cell so it also works on a fresh kernel
# (family_df was not assigned in any visible cell), then cache it on disk.
family_df = get_families()
family_df.to_pickle('data/families.pkl')

In [None]:
# Display the family table (requires family_df to have been built, e.g. with get_families()).
family_df

In [11]:
from processor import *
from analysis import *
import numpy as np
from scipy.spatial.distance import cdist, pdist
import seaborn as sns
from matplotlib import rcParams
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [12]:
# Create the structure processor and load its saved state from 'data/processed/'.
# CifProcessor is not defined in this notebook — presumably it comes from the
# star import of `processor`; the two read_pkl* calls presumably load pickled
# metadata and per-structure tables (verify in that module).
p = CifProcessor()
p.read_pkl_metainfo()
p.read_pkl(mode='r', folder='data/processed/')

Reading files with generic numbers on receptors.


In [13]:
# Split the loaded structures by state: every PDB id in p.dfl_list that is not
# in the 'Active' selection is treated as inactive.
f_act = p.filter_dfl_via_table(State='Active')
# Build the lookup once; previously list(f_act['PDB']) was rebuilt for every element.
active_pdbs = set(f_act['PDB'])
l_inact = [x for x in p.dfl_list if x not in active_pdbs]
f_inact = p.filter_dfl_via_table(pdb_ids = l_inact)
print("active: ",len(f_act))
print("inactive: ",len(f_inact))

active:  210
inactive:  335


In [14]:
# Restrict the processor to the inactive selection. apply_filter is project
# code — presumably it updates p.dfl / p.dfl_list in place; verify.
p.apply_filter(f_inact)

In [15]:
# First per-structure table of the current selection (df is reassigned further down).
df = p.dfl[0]

In [16]:
def get_invalid_pdbs(p, A):
    """List the PDB ids of structures that cannot be given a coupling label.

    A structure is invalid when its identifier (``<gene>_<species>``, read from
    the first row of its table) is not a ``HUMAN`` entry, or when its gene name
    has no row in the ``('Receptor', 'Uniprot')`` column of ``A.data``.

    Args:
        p: object whose ``dfl`` is a sequence of per-structure DataFrames with
            ``'identifier'`` and ``'PDB'`` columns.
        A: object whose ``data`` DataFrame has a ``('Receptor', 'Uniprot')`` column.

    Returns:
        PDB ids of the invalid structures, in the order of ``p.dfl``.
    """
    # Build the lookup once; the original rebuilt a list per gene inside the loop.
    known_genes = set(A.data['Receptor', 'Uniprot'])
    invalid_pdb = []
    for structure_df in p.dfl:
        identifier = structure_df['identifier'].iloc[0]
        # Split on the last underscore so unusual identifiers do not break the
        # two-way unpacking; without an underscore the species is the whole
        # string, which is never 'HUMAN' unless it literally is.
        gene, _, species = identifier.rpartition('_')
        if species != 'HUMAN' or gene not in known_genes:
            invalid_pdb.append(structure_df['PDB'].iloc[0])
    return invalid_pdb

In [18]:
# PDB ids whose receptor is non-human or has no row in the coupling table A.
invalid_pdb = get_invalid_pdbs(p, A)

In [22]:
# Report how many of the currently loaded structures will be discarded.
print("number of invalid pdbs: {} / {}".format(len(invalid_pdb), len(p.dfl_list)))

number of invalid pdbs: 87 / 334


In [23]:
# PDB ids that remain after removing the invalid ones (order of p.dfl_list is kept).
filtered_dfl_list = [x for x in p.dfl_list if x not in invalid_pdb]

In [26]:
# Selection for the structures that can be labelled. filter_dfl_via_table is
# project code; it is assumed to return the table rows for the given PDB ids.
f_labelled = p.filter_dfl_via_table(pdb_ids = filtered_dfl_list)

In [27]:
# Restrict the processor to the labelled selection (presumably in place; verify apply_filter).
p.apply_filter(f_labelled)

In [28]:
# First per-structure table of the labelled selection, used for the inspection below.
df = p.dfl[0]

In [29]:
# Rows whose 'gen_pos1' value lies in the half-open window (7.45, 7.58].
# NOTE(review): the bounds are hard-coded; confirm the intended residue window.
df[(df['gen_pos1'] > 7.45) & 
   (df['gen_pos1'] <= 7.58)]

Unnamed: 0,PDB,group_PDB,auth_asym_id,label_asym_id,label_seq_id,auth_seq_id,label_comp_id,id,label_atom_id,type_symbol,...,phi,omega,psi,identifier,label_comp_sid,label_2_uni,gen_pos,gen_pos1,gen_pos2,uniprot_comp_sid
1389,2R4R,ATOM,A,A,319,319,SER,1390,CA,C,...,-97.403571,179.446068,-20.458291,ADRB2_HUMAN,S,319,7.46x46,7.46,46.0,S
1394,2R4R,ATOM,A,A,320,320,GLY,1395,CA,C,...,-127.168961,-179.52316,27.703692,ADRB2_HUMAN,G,320,7.47x47,7.47,47.0,G
1398,2R4R,ATOM,A,A,321,321,PHE,1399,CA,C,...,-111.37549,179.819154,-25.462327,ADRB2_HUMAN,F,321,7.48x48,7.48,48.0,F
1403,2R4R,ATOM,A,A,322,322,ASN,1404,CA,C,...,-76.42137,179.638413,-53.427978,ADRB2_HUMAN,N,322,7.49x49,7.49,49.0,N
1411,2R4R,ATOM,A,A,323,323,PRO,1412,CA,C,...,-72.211379,179.652921,8.619921,ADRB2_HUMAN,P,323,7.50x50,7.5,50.0,P
1418,2R4R,ATOM,A,A,324,324,LEU,1419,CA,C,...,-104.586977,179.83308,-35.297204,ADRB2_HUMAN,L,324,7.51x51,7.51,51.0,L
1426,2R4R,ATOM,A,A,325,325,ILE,1427,CA,C,...,-85.831023,-179.749003,-13.37391,ADRB2_HUMAN,I,325,7.52x52,7.52,52.0,I
1431,2R4R,ATOM,A,A,326,326,TYR,1432,CA,C,...,-82.256331,179.152258,-8.385474,ADRB2_HUMAN,Y,326,7.53x53,7.53,53.0,Y
1436,2R4R,ATOM,A,A,327,327,CYS,1437,CA,C,...,-80.74023,-179.884902,-64.121456,ADRB2_HUMAN,C,327,7.54x54,7.54,54.0,C
1442,2R4R,ATOM,A,A,328,328,ARG,1443,CA,C,...,-91.043323,178.556435,43.495893,ADRB2_HUMAN,R,328,7.55x55,7.55,55.0,R


In [106]:
def _parse_coupling_value(raw):
    """Convert one coupling-table cell to a number; non-values become 0.

    Cells containing an apostrophe (rank marks such as ``1'``) and placeholder
    cells such as ``'-'`` map to 0. Anything that parses as a float is returned
    as that float, including negative and exponent-notation numbers.
    """
    if not isinstance(raw, str):
        # Cell was already parsed as a number by pandas; NaN means "no value".
        return 0 if pd.isna(raw) else float(raw)
    text = raw.strip()
    if "'" in text:
        return 0
    try:
        return float(text)
    except ValueError:
        # Only dash-like placeholders are treated as missing; anything else
        # that is not a number is still reported as an error.
        if '-' in text:
            return 0
        raise


def get_label(df, A, label_type):
    """Return the coupling labels of the receptor a structure table belongs to.

    The gene name is the part of ``df['identifier'].iloc[0]`` before its last
    underscore. The first row of ``A.data`` whose ``('Receptor', 'Uniprot')``
    equals that gene is used, and the cells under ``label_type`` are converted
    with ``_parse_coupling_value``.

    NOTE(review): values are paired with the four family keys by position, so
    this assumes the ``label_type`` columns are ordered Gs, Gi/o, Gq/11, G12/13
    (the 'families' layout). Rank-style labels such as ``1'`` all become 0 —
    confirm this is intended for 'Guide to Pharmacology'.

    Args:
        df: per-structure DataFrame with an ``'identifier'`` column.
        A: object whose ``data`` DataFrame has multi-level columns including
            ``('Receptor', 'Uniprot')`` and ``label_type`` at the top level.
        label_type: top-level column name of the labels to extract.

    Returns:
        dict mapping ``'Gs'``, ``'Gi/o'``, ``'Gq/11'``, ``'G12/13'`` to numbers.

    Raises:
        IndexError: if ``A.data`` has no row for the receptor.
        ValueError: if a cell is neither a number nor a recognised placeholder.
    """
    keys = ['Gs', 'Gi/o', 'Gq/11', 'G12/13']
    idf = df['identifier'].iloc[0]
    gene = idf.rpartition('_')[0]
    row = A.data[A.data['Receptor', 'Uniprot'] == gene]
    if row.empty:
        # Same exception type as the former ``.iloc[0]`` failure, but with context.
        raise IndexError(
            "no coupling data for receptor '{}' (identifier '{}')".format(gene, idf))
    values = [_parse_coupling_value(x) for x in row[label_type].iloc[0]]
    return dict(zip(keys, values))

In [107]:
def make_label_df(p, A, label_type):
    """Collect the coupling labels of every loaded structure into one table.

    As a side effect, ``p.labels`` is reset and filled with one label dict per
    entry of ``p.dfl`` (as returned by ``get_label``).

    Returns:
        DataFrame built from those dicts, with an extra ``'PDB'`` column taken
        from ``p.dfl_list``.
    """
    p.labels = []
    # extend() consumes the generator item by item, so p.labels grows per structure.
    p.labels.extend(get_label(structure_df, A, label_type) for structure_df in p.dfl)
    table = pd.DataFrame(p.labels)
    table['PDB'] = p.dfl_list
    return table

In [108]:
# One label row per loaded structure, using the label type chosen for A earlier.
label_df = make_label_df(p, A, label_type)

In [109]:
# Display the per-structure label table.
label_df

Unnamed: 0,Gs,Gi/o,Gq/11,G12/13,PDB
0,8.2,0.0,7.0,0.0,2R4R
1,8.2,0.0,7.0,0.0,2R4S
2,8.2,0.0,7.0,0.0,2RH1
3,8.2,0.0,7.0,0.0,3D4S
4,5.7,0.0,5.6,0.0,3EML
...,...,...,...,...,...
242,5.7,0.0,5.6,0.0,7ARO
243,0.0,8.9,9.4,0.0,7BR3
244,0.0,8.9,6.7,0.0,7C61
245,0.0,8.4,0.0,0.0,7DFP
