## Obtaining the FASTA or SMILES sequences for each CIDm chemical and corresponding proteins
After filtering the original dataset to include only CIDm chemicals and their corresponding binding proteins (see Notebook #3), each of those IDs must be mapped to its sequence representation: a SMILES string for each chemical and a FASTA sequence for each protein. These sequences will be used later to calculate molecular feature descriptors for both types of molecules.

In [2]:
import pandas as pd

In [5]:
# Load the filtered chemical-protein binding pairs and discard the
# 'Unnamed: 0' column (presumably an index saved by an earlier to_csv call).
pc_binding = pd.read_table('pc_binding.tsv').drop(columns=['Unnamed: 0'])

In [9]:
# Build the list of unique chemicals: for every chemical ID keep only the
# first binding pair in which it appears.
CIDm_binding = pc_binding.drop_duplicates(subset=['chemical'], keep='first')

In [14]:
# Remove the protein and experimental columns so only the chemical IDs remain,
# then renumber the rows from 0 (row order is left as-is).
CIDm_binding = (
    CIDm_binding
    .drop(columns=['protein', 'experimental'])
    .reset_index(drop=True)
)

In [16]:
# Master chemical table (ID, name, molecular weight, SMILES string).
# Tab-separated and gzip-compressed; pandas infers the compression from the
# '.gz' extension.
chemSMILES = pd.read_csv('chemicals.v5.0.tsv.gz', sep='\t')

In [58]:
# Number of rows in the master chemical table.
chemSMILES.shape[0]

51533835

In [17]:
# Preview the first five rows (head() defaults to n=5).
chemSMILES.head()

Unnamed: 0,chemical,name,molecular_weight,SMILES_string
0,CIDs00000001,acetylcarnitine,203.23558,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,CIDs00000003,"2,3-dihydro-2,3-dihydroxybenzoate",156.13602,C1=CC(C(C(=C1)C(=O)O)O)O
2,CIDs00000004,1-aminopropan-2-ol,75.10966,CC(CN)O
3,CIDs00000005,3-amino-2-oxopropyl phosphate,169.073082,C(C(=O)COP(=O)(O)O)N
4,CIDs00000006,DNCB,202.55202,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl


In [19]:
# The name and molecular weight are not needed for the ID -> SMILES mapping,
# so drop them and keep the remaining chemical and SMILES columns.
chemSMILES = chemSMILES.drop(columns=['name', 'molecular_weight'])

In [20]:
# Preview the first five rows after trimming the columns.
chemSMILES.head()

Unnamed: 0,chemical,SMILES_string
0,CIDs00000001,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C
1,CIDs00000003,C1=CC(C(C(=C1)C(=O)O)O)O
2,CIDs00000004,CC(CN)O
3,CIDs00000005,C(C(=O)COP(=O)(O)O)N
4,CIDs00000006,C1=CC(=C(C=C1[N+](=O)[O-])[N+](=O)[O-])Cl


In [21]:
# Keep only rows whose chemical ID does not contain "CIDs". The binding set
# uses CIDm IDs only, and a smaller table makes the later merge cheaper.
chemSMILES = chemSMILES.loc[~chemSMILES['chemical'].str.contains("CIDs")]

In [23]:
# Renumber the rows from 0 after filtering; row order is unchanged.
chemSMILES = chemSMILES.reset_index(
    drop=True,  # discard the old labels instead of keeping them as a column
)

In [27]:
# Attach the SMILES string to each unique chemical. This is an inner join
# (the pandas default), so a chemical with no entry in chemSMILES is dropped.
CIDm_binding = CIDm_binding.merge(chemSMILES, how='inner', on='chemical')

In [28]:
# Row count after the merge. Original author's note: 2 CIDm chemicals had no
# SMILES in the master list (reason not determined).
CIDm_binding.shape[0]

259908

In [31]:
# Rename the SMILES column by name rather than assigning a positional list to
# .columns: positional assignment silently mislabels the data if the column
# order or count ever changes. The 'chemical' column keeps its name.
# rename() leaves the frame unchanged if 'SMILES_string' is absent, so
# re-running this cell is harmless.
CIDm_binding = CIDm_binding.rename(columns={'SMILES_string': 'SMILES'})

In [33]:
# Preview the first five chemical / SMILES pairs.
CIDm_binding.head(5)

Unnamed: 0,chemical,SMILES
0,CIDm00000007,CCN1C=NC2=C1N=CN=C2N
1,CIDm00000009,C1(C(C(C(C(C1O)O)OP(=O)(O)O)O)O)O
2,CIDm00000011,C(CCl)Cl
3,CIDm00000015,CC12CCC(=O)CC1CCC3C2CCC4(C3CCC4O)C
4,CIDm00000016,C1CCC(=O)NCCCCCC(=O)NCC1


In [56]:
# Save the chemical -> SMILES table as tab-separated text. The row index is
# written as an unlabeled first column (pandas default, spelled out here), so
# readers of this file will see it as 'Unnamed: 0'.
CIDm_binding.to_csv('CIDm_binding.tsv', sep='\t', index=True)

In [34]:
# Build the list of unique proteins: for every protein ID keep only the first
# binding pair in which it appears.
protm_binding = pc_binding.drop_duplicates(subset=['protein'], keep='first')

In [35]:
# Number of unique proteins in the binding set.
protm_binding.shape[0]

9915

In [37]:
# Remove the chemical and experimental columns so only the protein IDs remain.
protm_binding = protm_binding.drop(columns=['chemical', 'experimental'])

In [43]:
# Order the proteins by ID in ascending order.
protm_binding = protm_binding.sort_values(by='protein', ascending=True)

In [44]:
# Renumber the rows from 0 so the index follows the sorted order.
protm_binding = protm_binding.reset_index(
    drop=True,  # discard the old labels instead of keeping them as a column
)

In [46]:
from Bio import SeqIO

In [47]:
# Read every record of the protein FASTA file into three parallel lists:
# record IDs, amino-acid sequences (as plain strings) and sequence lengths.
protIDs, sequences, lengths = [], [], []
with open('9606.protein.sequences.v10.5.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        residues = str(record.seq)
        protIDs.append(str(record.id))
        sequences.append(residues)
        lengths.append(len(residues))

In [48]:
# Wrap each parsed list in a named pandas Series:
# s1 = protein IDs, s2 = FASTA sequences, s3 = sequence lengths.
s1, s2, s3 = (
    pd.Series(values, name=label)
    for values, label in (
        (protIDs, 'protein'),
        (sequences, 'FASTA'),
        (lengths, 'length'),
    )
)

In [49]:
# Assemble the three Series into one DataFrame; each dictionary key becomes a
# column name and the matching Series supplies that column's values.
protseq = pd.DataFrame({'protein': s1, 'FASTA': s2, 'length': s3})

In [51]:
# Attach the FASTA sequence and length to each unique binding protein. This is
# an inner join (the pandas default), so a protein with no record in protseq
# is dropped.
protm_binding = protm_binding.merge(protseq, how='inner', on='protein')

In [52]:
# Preview the proteins with their FASTA sequences and lengths. A bounded
# head() is used instead of displaying the whole frame (thousands of rows).
protm_binding.head(10)

Unnamed: 0,protein,FASTA,length
0,9606.ENSP00000000233,MGLTVSALFSRIFGKKQMRILMVGLDAAGKTTILYKLKLGEIVTTI...,180
1,9606.ENSP00000000412,MFPFYSCWRTGLLLLLLAVAVRESWQTEEKTCDLVGEKGKESEKEL...,277
2,9606.ENSP00000000442,MSSQVVGIEPLYIKAEPASPDSPKGSSETETEPPVALAPGPAPTRC...,423
3,9606.ENSP00000001008,MTAEEMKATESGAQSAPLPMEGVDISPKQDEGVLKVIKREGTGTEM...,459
4,9606.ENSP00000001146,MLFEGLDLVSALATLAACLVSVTLLLAVSQQLWQLRWAATRDKSCK...,512
5,9606.ENSP00000002125,MSVLLRSGLGPLCAVARAAIPFIWRGKYFSSGNEPAENPVTPMLRH...,441
6,9606.ENSP00000002165,MRPQELPRLAFPLLLLLLLLLPPPPCPAHSATRFDPTWESLDARQL...,467
7,9606.ENSP00000002596,MAALLLGAVLLVAQPQLVPSRPAELGQQELLRKAGTLQDDVRDGVA...,307
8,9606.ENSP00000002829,MLVAGLLLWASLLTGAWPSFPTQDHLPATPRVRLSFKELKATGTAH...,785
9,9606.ENSP00000003084,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...,1480


In [54]:
# The sequence length is not part of the saved table, so drop that column.
protm_binding = protm_binding.drop(columns=['length'])

In [55]:
# Preview the final protein / FASTA table. A bounded head() is used instead
# of displaying the whole frame (thousands of rows).
protm_binding.head(10)

Unnamed: 0,protein,FASTA
0,9606.ENSP00000000233,MGLTVSALFSRIFGKKQMRILMVGLDAAGKTTILYKLKLGEIVTTI...
1,9606.ENSP00000000412,MFPFYSCWRTGLLLLLLAVAVRESWQTEEKTCDLVGEKGKESEKEL...
2,9606.ENSP00000000442,MSSQVVGIEPLYIKAEPASPDSPKGSSETETEPPVALAPGPAPTRC...
3,9606.ENSP00000001008,MTAEEMKATESGAQSAPLPMEGVDISPKQDEGVLKVIKREGTGTEM...
4,9606.ENSP00000001146,MLFEGLDLVSALATLAACLVSVTLLLAVSQQLWQLRWAATRDKSCK...
5,9606.ENSP00000002125,MSVLLRSGLGPLCAVARAAIPFIWRGKYFSSGNEPAENPVTPMLRH...
6,9606.ENSP00000002165,MRPQELPRLAFPLLLLLLLLLPPPPCPAHSATRFDPTWESLDARQL...
7,9606.ENSP00000002596,MAALLLGAVLLVAQPQLVPSRPAELGQQELLRKAGTLQDDVRDGVA...
8,9606.ENSP00000002829,MLVAGLLLWASLLTGAWPSFPTQDHLPATPRVRLSFKELKATGTAH...
9,9606.ENSP00000003084,MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSVDSA...


In [57]:
# Save the protein -> FASTA table as tab-separated text. The row index is
# written as an unlabeled first column (pandas default, spelled out here), so
# readers of this file will see it as 'Unnamed: 0'.
protm_binding.to_csv('protm_binding.tsv', sep='\t', index=True)