In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Notebook-wide plotting defaults, applied in one call.
plt.rcParams.update({
    'figure.figsize': (5, 5),   # square figures
    'axes.linewidth': 2.0,      # heavier axes frame
    'savefig.dpi': 500,         # resolution used by savefig
    'pdf.fonttype': 42,         # font embedding type for PDF output
    'ps.fonttype': 42,          # font embedding type for PostScript output
})
# Use NumPy's legacy 1.25 print mode for printed values.
np.set_printoptions(legacy='1.25')

### Import Data

In [3]:
# Switch the working directory to the data folder; every relative read/write
# in later cells resolves against it.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing this line.
os.chdir("/home/cadeniran/ipn/data")
# Protein table (all columns) and the two superfamily columns used for the join below.
proteins=pd.read_csv("proteins-2024-08-02.csv", header=0, sep=',', engine='python')
spfam = pd.read_csv("superfamilies-2024-08-02.csv", usecols=['ordering', 'name'])

### Data Exploration

In [4]:
# Rename the superfamily columns so they can be joined onto the protein table.
# NOTE(review): the join key is taken from the superfamilies file's 'ordering'
# column. The rows displayed further down pair families with implausible
# superfamily names (e.g. "G-protein coupled receptors, family A" with
# "Electron transport chain complex IV ..."), which suggests 'ordering' is not
# the key that proteins.superfamily_id refers to — confirm against the
# superfamilies table before trusting superfamily_name.
spfam = spfam.rename(columns={'ordering': 'superfamily_id', 'name': 'superfamily_name'})
# Left join: every protein row is kept; unmatched ids get a missing superfamily_name.
df = proteins.merge(spfam[['superfamily_id', 'superfamily_name']], how='left', on='superfamily_id')
df.columns

Index(['id', 'ordering', 'family_name_cache', 'species_name_cache',
       'membrane_name_cache', 'name', 'description', 'comments', 'pdbid',
       'resolution', 'topology_subunit', 'topology_show_in', 'thickness',
       'thicknesserror', 'subunit_segments', 'tilt', 'tilterror', 'gibbs',
       'tau', 'verification', 'membrane_id', 'species_id', 'family_id',
       'superfamily_id', 'classtype_id', 'type_id',
       'secondary_representations_count', 'structure_subunits_count',
       'citations_count', 'created_at', 'updated_at', 'uniprotcode',
       'interpro', 'superfamily_name'],
      dtype='object')

In [5]:
# Column dtypes straight after loading; 'resolution' is still an object column here.
print(df.dtypes)

id                                   int64
ordering                           float64
family_name_cache                   object
species_name_cache                  object
membrane_name_cache                 object
name                                object
description                         object
comments                            object
pdbid                               object
resolution                          object
topology_subunit                    object
topology_show_in                      bool
thickness                          float64
thicknesserror                     float64
subunit_segments                     int64
tilt                                 int64
tilterror                          float64
gibbs                              float64
tau                                float64
verification                        object
membrane_id                          int64
species_id                           int64
family_id                            int64
superfamily

In [6]:
# Largest value of each categorical id column, printed as "<label>: <max>".
id_columns = [
    ("Type", 'type_id'),
    ("Class", 'classtype_id'),
    ("Membrane", 'membrane_id'),
    ("Superfamily", 'superfamily_id'),
    ("Species", 'species_id'),
    ("Family", 'family_id'),
]
for label, column in id_columns:
    print(f"{label}:", df[column].max())

Type: 3
Class: 11
Membrane: 24
Superfamily: 607
Species: 1150
Family: 1240


In [7]:
# Number of entries per classtype_id, most frequent first.
print(df['classtype_id'].value_counts())

classtype_id
1     5450
3      634
11     568
7      520
2      459
5      372
4      366
6      294
9      150
10      85
8       17
Name: count, dtype: int64


In [8]:
# Number of entries per type_id, most frequent first.
print(df['type_id'].value_counts())

type_id
1    6477
2    1666
3     772
Name: count, dtype: int64


In [9]:
# Strip the '=' and '"' characters from the PDB ids, one character at a time.
for unwanted in ('=', '"'):
    df['pdbid'] = df['pdbid'].str.replace(unwanted, '')

In [10]:
# Count of missing values in each column.
df.isnull().sum()

id                                    0
ordering                              0
family_name_cache                     0
species_name_cache                    0
membrane_name_cache                   0
name                                  0
description                        8914
comments                           7714
pdbid                                 0
resolution                           36
topology_subunit                   2262
topology_show_in                      0
thickness                             0
thicknesserror                       35
subunit_segments                      0
tilt                                  0
tilterror                            29
gibbs                                 0
tau                                8765
verification                       8693
membrane_id                           0
species_id                            0
family_id                             0
superfamily_id                        0
classtype_id                          0


### Sequential Removal

In [None]:
# Drop entries whose resolution is the space-padded label "NMR    ".
is_padded_nmr = df['resolution'] == "NMR    "
df = df.drop(index=df.index[is_padded_nmr])
len(df.index)

In [None]:
# Drop entries whose resolution is exactly "NMR".
is_nmr = df['resolution'] == "NMR"
df = df.drop(index=df.index[is_nmr])
len(df.index)

In [None]:
# Drop entries whose resolution is only a text label ("EC", then "FD").
for label in ("EC", "FD"):
    df = df.drop(index=df.index[df['resolution'] == label])
len(df.index)

In [None]:
# Strip text suffixes from resolution strings.
# Order matters: ' EM' and ' EC' must be removed before the shorter ' E'.
for suffix in (' EM', ' EC', ' E', ' FD', ' ND'):
    df['resolution'] = df['resolution'].str.replace(suffix, '')
len(df.index)

In [None]:
# Drop entries whose resolution is an empty string.
is_empty = df['resolution'] == ""
df = df.drop(index=df.index[is_empty])
len(df.index)

In [None]:
# Drop entries whose resolution is exactly four spaces.
is_blank = df['resolution'] == '    '
df = df.drop(index=df.index[is_blank])
len(df.index)

In [None]:
# Remove rows with empty (missing) resolutions.
# dropna returns a new frame, so the result has to be bound back to df;
# the bare call previously here discarded it and removed nothing.
df = df.dropna(subset=['resolution'])
len(df.index)

In [None]:
# Convert the resolution column from string to float.
# Raises ValueError if any non-numeric string is still present in the column.
df['resolution'] = df['resolution'].astype(float)

In [None]:
# Re-check the dtypes after the float conversion of 'resolution'.
print(df.dtypes)

In [None]:
# Keep only entries whose resolution value is below 4; rows with a value
# of 4 or more (or a missing value) are removed.
below_cutoff = df['resolution'].lt(4)
df = df[below_cutoff]
len(df.index)

In [None]:
# Remove peptides (rows with type_id == 3).
is_peptide = df['type_id'] == 3
df = df.drop(index=df.index[is_peptide])
len(df.index)

In [None]:
# Entries per type_id after the peptide rows (type_id 3) were removed.
print(df['type_id'].value_counts())

In [None]:
# Smallest and largest remaining resolution value.
df['resolution'].agg(['min', 'max'])

In [None]:
# Save the sequentially filtered table (written into the current working directory).
df.to_csv('passive1.csv', sep=',')

### Strict Removal

In [11]:
# Remove ALL non-numeric entries in resolution: values that cannot be parsed
# as a number are coerced to NaN and the corresponding rows are discarded.
resolution_is_numeric = pd.to_numeric(df['resolution'], errors='coerce').notnull()
# Take an explicit copy: the next cell assigns into df['resolution'], which on
# a filtered slice can raise SettingWithCopyWarning.
df = df[resolution_is_numeric].copy()
len(df.index)

3679

In [12]:
# Convert the resolution column from string to float.
# Only values that pd.to_numeric could parse remain after the previous cell.
df['resolution'] = df['resolution'].astype(float)

In [13]:
# Confirm that 'resolution' is now float64.
print(df.dtypes)

id                                   int64
ordering                           float64
family_name_cache                   object
species_name_cache                  object
membrane_name_cache                 object
name                                object
description                         object
comments                            object
pdbid                               object
resolution                         float64
topology_subunit                    object
topology_show_in                      bool
thickness                          float64
thicknesserror                     float64
subunit_segments                     int64
tilt                                 int64
tilterror                          float64
gibbs                              float64
tau                                float64
verification                        object
membrane_id                          int64
species_id                           int64
family_id                            int64
superfamily

In [14]:
# Remove peptides (rows with type_id == 3).
is_peptide = df['type_id'] == 3
df = df.drop(index=df.index[is_peptide])
len(df.index)

3580

In [15]:
# Keep only entries whose resolution value is below 4; rows with a value
# of 4 or more are removed.
below_cutoff = df['resolution'].lt(4)
df = df[below_cutoff]
len(df.index)

3502

In [16]:
# Smallest and largest remaining resolution value (sanity check on the < 4 filter).
df['resolution'].agg(['min', 'max'])

min    0.54
max    3.99
Name: resolution, dtype: float64

In [17]:
# Entries per type_id after filtering; type_id 3 should no longer appear.
print(df['type_id'].value_counts())

type_id
1    2279
2    1223
Name: count, dtype: int64


In [18]:
# Save the strictly filtered table (written into the current working directory).
df.to_csv('aggressive1.csv', sep=',')

### Analyze duplicates

In [19]:
# For each protein name keep the single row with the lowest resolution value.
# idxmin holds, per name, the index label of that row.
idxmin = df.groupby(['name'])['resolution'].idxmin()
# Select those rows; df2 has one row per distinct name.
df2 = df.loc[idxmin]
len(df2.index)

3223

### Select Superfamilies

#### Filter based on number of entries

In [20]:
# Count entries per superfamily_id in the de-duplicated table, largest first.
vc = df2['superfamily_id'].value_counts()
print(vc)

superfamily_id
6      344
8      171
127    123
39     111
15      91
      ... 
571      1
416      1
291      1
63       1
315      1
Name: count, Length: 373, dtype: int64


In [21]:
# Rows whose superfamily_id already occurred in an earlier row
# (the first occurrence of each id is not flagged as a duplicate).
df2[df2.duplicated(['superfamily_id'])]

Unnamed: 0,id,ordering,family_name_cache,species_name_cache,membrane_name_cache,name,description,comments,pdbid,resolution,...,classtype_id,type_id,secondary_representations_count,structure_subunits_count,citations_count,created_at,updated_at,uniprotcode,interpro,superfamily_name
1955,2015,6862.0,Retinal pigment epithelial membrane protein,Bos taurus,Eykaryo. plasma,"Retinoid isomerohydrolase, monomer, structure 2",,,4f2z,3.000,...,3,2,0,0,0,2018-08-13 03:53:24 UTC,2020-05-10 22:04:39 UTC,RPE65_BOVIN,,Designed polytopic proteins
1956,2016,6863.0,Retinal pigment epithelial membrane protein,Bos taurus,Eykaryo. plasma,"Retinoid isomerohydrolase, monomer, structure 3",,,4f30,3.150,...,3,2,0,0,0,2018-08-13 03:53:24 UTC,2020-05-10 22:04:39 UTC,RPE65_BOVIN,,Designed polytopic proteins
1807,1859,7487.0,Glucanosyltransferase,Saccharomyces cerevisiae,Eykaryo. plasma,"1,3-beta-glucanosyltransferase, structure 2",,,2w62,1.850,...,5,2,1,0,0,2018-08-13 03:53:04 UTC,2020-05-10 22:04:13 UTC,GAS2_YEAST,,Piezo family
2938,3012,7488.0,Glucanosyltransferase,Saccharomyces cerevisiae,Eykaryo. plasma,"1,3-beta-glucanosyltransferase, structure 3",,,5fih,1.800,...,5,2,0,0,0,2018-08-13 03:54:34 UTC,2020-05-11 00:17:07 UTC,GAS2_YEAST,,Piezo family
3751,3828,7489.0,Glucanosyltransferase,Saccharomyces cerevisiae,Eykaryo. plasma,"1,3-beta-glucanosyltransferase, structure 4",,,5o9o,1.900,...,5,2,7,0,0,2018-08-13 03:55:15 UTC,2020-05-10 22:10:03 UTC,GAS2_YEAST,,Piezo family
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2208,2270,7465.0,Mannose-binding lectin,Homo sapiens,Secreted,Zymogen granule membrane protein 16,,,3vy7,2.140,...,3,2,5,0,0,2018-08-13 03:53:48 UTC,2020-05-11 00:16:38 UTC,ZG16_HUMAN,,Phosphotyrosine protein phosphatases II
675,681,7355.0,Snake venom toxins,Naja kaouthia,Secreted,alpha-Cobratoxin,,,1ctx,2.800,...,3,2,4,0,0,2018-08-13 03:50:41 UTC,2020-05-13 20:44:01 UTC,3L21_NAJKA,,"ClpP/Crotonase, transmembrane"
513,517,7937.0,Plant thionin,Triticum aestivum,Secreted,beta-Purothionin,,,1bhp,1.700,...,6,2,0,0,0,2018-08-13 03:50:22 UTC,2020-05-10 21:59:50 UTC,THNB_WHEAT,,KX blood-group antigen family
7090,8508,248.0,"G-protein coupled receptors, family A",Homo sapiens,Eykaryo. plasma,cannabinoid receptor type 1(CB1),,,7v3z,3.290,...,1,1,0,1,0,2023-02-18 04:51:41 UTC,2023-02-22 22:07:21 UTC,CNR1_HUMAN FLAV_DESVH,,Electron transport chain complex IV (cytochrom...


In [22]:
# Count entries per family name; note this rebinds vc, replacing the
# superfamily counts computed earlier.
vc = df2['family_name_cache'].value_counts()
#print(vc)

In [23]:
# Write the per-family entry counts to CSV (index = family name, value = count).
# The bare `vc.reset_index()` that used to precede this line returned a new
# object that was thrown away, so it had no effect and has been removed.
vc.to_csv('famnamecount.csv', sep=',')

In [24]:
# Total number of entries that belong to a superfamily *name* with at least 7 members.
# value_counts skips missing names, so rows without a superfamily_name are not counted.
df2['superfamily_name'].value_counts().loc[lambda x : x >= 7].sum()

2594

In [25]:
# Member counts of the superfamily names that have at least 7 entries.
selectd = df2['superfamily_name'].value_counts().loc[lambda x : x >= 7]

In [26]:
# Same total as above, but grouped by superfamily_id instead of superfamily_name.
df2['superfamily_id'].value_counts().loc[lambda x : x >= 7].sum()

2628

In [27]:
# Keep only superfamilies with at least 7 entries.
superfamily_sizes = df2['superfamily_id'].value_counts()
# Superfamily ids that have fewer than 7 members.
dump = superfamily_sizes.loc[superfamily_sizes < 7].reset_index()
drop_list = dump['superfamily_id'].tolist()
# Everything whose superfamily is not on the drop list survives.
in_small_superfamily = df2['superfamily_id'].isin(drop_list)
df3 = df2[~in_small_superfamily]

In [28]:
# Number of rows left after removing the small superfamilies.
len(df3.index)

2628

In [29]:
# Save the size-filtered selection; a later cell writes to this same file name.
df3.to_csv('selectd.csv', sep=',')

In [30]:
# Write the PDB ids only, one per line, without header or index column.
df3['pdbid'].to_csv('pdbid', sep=' ', index=False, header=False)

#### Filter most drugged superfamily | Remove problematic structures

In [31]:
# Keep entries whose family name contains any of the keywords below.
# Note: this selects from df2 and rebinds df3, so the >= 7-member superfamily
# filter applied to the previous df3 does not carry over.
family_keywords = [
    'G-protein', 'GPCR', 'channel', 'gated',
    'kinase', 'transporter', 'ATP', 'lipase',
    'cyclase', 'Annexins', 'Apolipoprotein',
    'transfer', 'domain',
]
# Alternation pattern: a family name matches if any one keyword occurs in it.
family_pattern = '|'.join(family_keywords)
matches_keyword = df2['family_name_cache'].str.contains(family_pattern)
df3 = df2[matches_keyword]

In [32]:
# Families with at least 20 entries in the de-duplicated table.
df2['family_name_cache'].value_counts().loc[lambda x : x >= 20]

family_name_cache
G-protein coupled receptors, family A               240
Cytochrome P450                                     111
P-ATPase                                             70
Microbial and algal rhodopsins                       67
Vertebrate secretory phospholipase A2                56
Outer Membrane Receptor (OMR)                        54
C2 domain                                            48
General Bacterial Porin (GBP)                        42
Cytochrome bc1 and b6f complexes                     38
Two pore Na+ channels                                34
Lipid exporter family                                34
Fatty acid binding protein-like (n=10)               34
Pleckstrin-homology domain                           33
Voltage-gated calcium and sodium channels            30
Neurotransmitter: sodium symporter                   28
Prokaryotic pentameric ligand-gated ion channels     28
Major intrinsic protein (MIP) family                 26
KcsA voltage-gated K+ channels

In [33]:
# Save the keyword-filtered selection; this overwrites the 'selectd.csv'
# written earlier from the size-filtered selection.
df3.to_csv('selectd.csv', sep=',')

In [34]:
# Write the PDB ids of the keyword-filtered selection (overwrites the earlier 'pdbid' file).
df3['pdbid'].to_csv('pdbid', sep=' ', index=False, header=False)

In [35]:
# Sanity check on the keyword-filtered selection: should get 1422 entries.
len(df3.index)

1422

### Separate TM and Peripheral

In [36]:
# Remove problematic structures: the inner join keeps only the entries that
# have a row in rescount.csv, and attaches their residue count.
df_rescount = pd.read_csv("rescount.csv", sep=',', engine='python', names = ["pdbid", "res_count"])
df4 = df3.merge(df_rescount, how='inner', on='pdbid')
# Separate TM & peripheral: df_per drops the type_id 1 rows, df_tm drops the type_id 2 rows.
is_type_1 = df4['type_id'] == 1
is_type_2 = df4['type_id'] == 2
df_per = df4.drop(index=df4.index[is_type_1])
df_tm = df4.drop(index=df4.index[is_type_2])
# Save the PDB id lists: all entries, all transmembrane, all peripheral.
for frame, outfile in ((df4, 'pdbid'), (df_tm, 'pdbid_tm'), (df_per, 'pdbid_per')):
    frame['pdbid'].to_csv(outfile, sep=' ', index=False, header=False)

#### Filter proteins with < 400 res 

In [38]:
# Keep proteins with at most 400 residues (rows with res_count > 400 are dropped).
df5 = df4.drop(df4[df4['res_count'] > 400].index)
# Separate TM & peripheral (rebinds df_per / df_tm from the previous cell).
df_per = df5.drop(df5[df5['type_id'] == 1].index)
df_tm = df5.drop(df5[df5['type_id'] == 2].index)
# Randomly sample 6 entries from each group. A fixed seed makes the selection
# reproducible across Restart & Run All; without it every run wrote a
# different set of ids to the files below.
SAMPLE_SEED = 42
# The '32' in these names is historical; each frame holds 6 sampled rows.
df_tm32 = df_tm.sample(n=6, random_state=SAMPLE_SEED)
df_per32 = df_per.sample(n=6, random_state=SAMPLE_SEED)
# Save the sampled PDB ids, one per line.
df_tm32['pdbid'].to_csv('pdbid_tm6', sep=' ', index=False, header=False)
df_per32['pdbid'].to_csv('pdbid_per6', sep=' ', index=False, header=False)

## Testing

In [None]:
# Scratch: superfamily names with fewer than 34 entries, with their counts.
removd = df2['superfamily_name'].value_counts().loc[lambda x : x < 34]

In [None]:
# Show the superfamilies that fall under the 34-entry threshold.
print(removd)

In [None]:
# Save the under-threshold superfamily counts to the current working directory.
removd.to_csv('removed.csv', sep=',')

In [None]:
# Scratch: superfamily names with at least 34 entries (rebinds selectd, which
# an earlier cell computed with a threshold of 7).
selectd = df2['superfamily_name'].value_counts().loc[lambda x : x >= 34]

In [None]:
# Scratch: families with at least 13 entries.
df2['family_name_cache'].value_counts().loc[lambda x : x >= 13]

In [None]:
# Scratch: families with fewer than 13 entries.
test = df2['family_name_cache'].value_counts().loc[lambda x : x < 13]

In [None]:
# NOTE(review): this absolute path belongs to a different machine/user than the
# data directory set at the top of the notebook, and os.chdir changes where
# every later relative read ("problems", "rescount.csv") and write resolves —
# confirm this is intended before running the cells below.
os.chdir("/Users/chuck/Downloads")
# Save the small-family counts into the new working directory.
test.to_csv('test1.csv', sep=',')

In [None]:
# Entries per superfamily as a one-column frame ('counts') indexed by superfamily_id.
# Count the superfamily_id column specifically: DataFrame.value_counts() counts
# distinct whole rows and yields a MultiIndex over every column, which cannot
# be given the single axis name used here.
ff = df2['superfamily_id'].value_counts().rename_axis('superfamily_id').to_frame('counts')

In [None]:
# Strip '=' and '"' from the PDB ids.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that is chained assignment, which is not guaranteed to
# update df.
df['pdbid'] = df['pdbid'].replace('=', '', regex=True)
df['pdbid'] = df['pdbid'].str.replace('"','')

In [None]:
# Remove double-quote characters from every string cell of df3.
# Rebind rather than mutate in place: df3 was derived by filtering another
# frame, and in-place edits on such a frame can raise SettingWithCopyWarning.
df3 = df3.replace('\"', '', regex=True)

In [None]:
# Read the headerless file "problems" from the current working directory into
# a single column named 'pdbid'.
df_probs=pd.read_csv("problems", sep=',', engine='python', names = ["pdbid"])

In [None]:
# Read the per-structure residue counts; the two columns are named explicitly.
# (Same read as in the "Separate TM and Peripheral" cell.)
df_rescount=pd.read_csv("rescount.csv", sep=',', engine='python', names = ["pdbid", "res_count"])

In [None]:
# Outer-join the selection with the problem list on their shared column(s);
# indicator=True adds a '_merge' column telling which side each row came from.
tdf = pd.merge(df3, df_probs, how='outer', indicator=True)

In [None]:
# Row count of the outer join (the full frame is not displayed).
#tdf
len(tdf.index)

In [None]:
# PDB ids that are present in df3 but absent from the problem list.
print(tdf.loc[tdf._merge == 'left_only', ['pdbid']])

In [None]:
# Keep only the df3 entries that have a residue count, attaching res_count.
df6 = pd.merge(df3, df_rescount, how='inner', on='pdbid')

In [None]:
# Display the merged frame (pandas truncates long output; consider df6.head()).
df6

In [None]:
# Number of entries that have a residue count.
len(df6.index)

In [None]:
# Display the current selection (pandas truncates long output; consider df3.head()).
df3