In [1]:
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

fasta = open("human_proteome.fasta")
df_drugs = pd.read_csv('cann_mols.csv')  

fasta_list = list(SimpleFastaParser(fasta))

protein_df = pd.DataFrame(fasta_list, columns=['info', 'sequence'])

In [12]:
protein_df

Unnamed: 0,info,sequence
0,sp|A0A0B4J2A2|PAL4C_HUMAN Peptidyl-prolyl cis-...,MVNSVVFFDITVDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...
1,tr|A0A1B0GTG8|A0A1B0GTG8_HUMAN Uncharacterized...,MWPLWHAPSSGEANVTLAMALFTILTSIYFFNKAQQ
2,sp|A1L190|SYCE3_HUMAN Synaptonemal complex cen...,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...
3,sp|A6NC57|ANR62_HUMAN Ankyrin repeat domain-co...,MEVRGSFLAACRRRMATWRKNRDKDGFSNPGYRVRQKDLGMIHKAA...
4,sp|A6NJR5|SPDL3_HUMAN Putative speedy protein-...,MQKHYTVAWFLYSAPGVDPSPPCRSLGWKRKKEWSDESEEEPEKEL...
...,...,...
20593,sp|Q9UBX5|FBLN5_HUMAN Fibulin-5 OS=Homo sapien...,MPGIKRILTVTILALCLPSPGNAQAQCTNGFDLDRQSGQCLDIDEC...
20594,sp|Q9UBY0|SL9A2_HUMAN Sodium/hydrogen exchange...,MEPLGNWRSLRAPLPPMLLLLLLQVAGPVGALAETLLNAPRAMGTS...
20595,sp|Q9UER7|DAXX_HUMAN Death domain-associated p...,MATANSIIVLDDDDEDEAAAQPGPSHPLPNAASPGAEAPSSSEPHG...
20596,"sp|Q9UKI9|PO2F3_HUMAN POU domain, class 2, tra...",MVNLESMHTDIKMSGDVADSTDARSTLSQVEPGNDRNGLDFNRQIK...


In [18]:
def string_splitter(string):
    string = string.split("HUMAN",1)[1]
    string = string.split("OS=Homo sapiens")
    
    return string

def info_parser(dfx):
    df = dfx.copy()
    df = df['info'].str.split('|', expand=True) # split on the "|" character
    df.columns = ['type', 'id', 'info'] # rename the three columns

    return df

def info_pre_processed(dfx):
    df = dfx.copy() # create a copy of df
    list_of_info = list(dfx['info']) # convert column values to list
    list_of_info = [string_splitter(x) for x in list_of_info] # apply string_splitter function to list elements
    df['temp_col'] = list_of_info # create a temporary column from the processed list
    df.drop("info", axis=1, inplace=True) # drop info column
    split_df = pd.DataFrame(df['temp_col'].tolist(), columns=['Protein name', 'info']) # creating this dataframe to merge back onto processed dataframe
    df.drop("temp_col", axis=1, inplace=True) # dropping temp_col
    df = pd.concat([df, split_df], axis=1) # merging both dataframes

    return df
    
def info_processed(dfx):
    df = dfx.copy() # create a copy of the dataframe
    split_df = df['info'].str.split(' ', expand=True) # creating a seperate dataframe to merge with
    df = pd.concat([df, split_df], axis=1) # merge dataframes
    df.drop("info", axis=1, inplace=True) # drop info column
    df.columns = ['type', 'id', 'Protein name', 'drop', 'Species', 'Gene', 'PE', 'Mutation'] # rename columns
    df.drop("drop", axis=1, inplace=True) # drop empty column
    df = pd.concat([df, protein_df], axis=1) # merge dataframe with original df to get sequence
    df.drop("info", axis=1, inplace=True) # drop info column
    df.drop('Species', axis=1, inplace=True) # drop column
    df['Gene'] = df['Gene'].str[3:] # strip first 3 characters
    
    return df

In [14]:
dfa = info_parser(protein_df)

In [15]:
dfb = info_pre_processed(dfa)

In [16]:
dfc = info_processed(dfb)

In [17]:
dfc

Unnamed: 0,type,id,Protein name,Gene,PE,Mutation,sequence
0,sp,A0A0B4J2A2,Peptidyl-prolyl cis-trans isomerase A-like 4C,PPIAL4C,PE=2,SV=1,MVNSVVFFDITVDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...
1,tr,A0A1B0GTG8,Uncharacterized protein,LOC105372440,PE=4,SV=1,MWPLWHAPSSGEANVTLAMALFTILTSIYFFNKAQQ
2,sp,A1L190,Synaptonemal complex central element protein 3,SYCE3,PE=1,SV=1,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...
3,sp,A6NC57,Ankyrin repeat domain-containing protein 62,ANKRD62,PE=2,SV=4,MEVRGSFLAACRRRMATWRKNRDKDGFSNPGYRVRQKDLGMIHKAA...
4,sp,A6NJR5,Putative speedy protein-like protein 3,5,SV=3,,MQKHYTVAWFLYSAPGVDPSPPCRSLGWKRKKEWSDESEEEPEKEL...
...,...,...,...,...,...,...,...
20593,sp,Q9UBX5,Fibulin-5,FBLN5,PE=1,SV=1,MPGIKRILTVTILALCLPSPGNAQAQCTNGFDLDRQSGQCLDIDEC...
20594,sp,Q9UBY0,Sodium/hydrogen exchanger 2,SLC9A2,PE=2,SV=1,MEPLGNWRSLRAPLPPMLLLLLLQVAGPVGALAETLLNAPRAMGTS...
20595,sp,Q9UER7,Death domain-associated protein 6,DAXX,PE=1,SV=2,MATANSIIVLDDDDEDEAAAQPGPSHPLPNAASPGAEAPSSSEPHG...
20596,sp,Q9UKI9,"POU domain, class 2, transcription factor 3",POU2F3,PE=2,SV=3,MVNLESMHTDIKMSGDVADSTDARSTLSQVEPGNDRNGLDFNRQIK...


In [20]:
df_targets = dfc[['Protein name','sequence']].copy()

df_targets['drug_name'] = df_drugs['Name'][0]
df_targets['SMILES'] = df_drugs['SMILES'][0]

In [25]:
df_targets

Unnamed: 0,Protein name,sequence,drug_name,SMILES
0,Peptidyl-prolyl cis-trans isomerase A-like 4C,MVNSVVFFDITVDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
1,Uncharacterized protein,MWPLWHAPSSGEANVTLAMALFTILTSIYFFNKAQQ,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
2,Synaptonemal complex central element protein 3,MDDADPEERNYDNMLKMLSDLNKDLEKLLEEMEKISVQATWMAYDM...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
3,Ankyrin repeat domain-containing protein 62,MEVRGSFLAACRRRMATWRKNRDKDGFSNPGYRVRQKDLGMIHKAA...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
4,Putative speedy protein-like protein 3,MQKHYTVAWFLYSAPGVDPSPPCRSLGWKRKKEWSDESEEEPEKEL...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
...,...,...,...,...
20593,Fibulin-5,MPGIKRILTVTILALCLPSPGNAQAQCTNGFDLDRQSGQCLDIDEC...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
20594,Sodium/hydrogen exchanger 2,MEPLGNWRSLRAPLPPMLLLLLLQVAGPVGALAETLLNAPRAMGTS...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
20595,Death domain-associated protein 6,MATANSIIVLDDDDEDEAAAQPGPSHPLPNAASPGAEAPSSSEPHG...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
20596,"POU domain, class 2, transcription factor 3",MVNLESMHTDIKMSGDVADSTDARSTLSQVEPGNDRNGLDFNRQIK...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...


In [12]:
dfx = dfa.copy() # create a copy of df
list_of_info = list(dfx['info']) # convert column values to list
list_of_info = [string_splitter(x) for x in list_of_info] # apply string_splitter function to list elements
dfx['temp_col'] = list_of_info # create a temporary column from the processed list
dfx.drop("info", axis=1, inplace=True) # drop info column
#split_df = pd.DataFrame(df['temp_col'].tolist(), columns=['Protein name', 'info']) # 
#df.drop("temp_col", axis=1, inplace=True) # dropping temp_col
#df = pd.concat([df, split_df], axis=1) # concatenating both 

dfx


Unnamed: 0,type,id,temp_col
0,sp,A0A0B4J2A2,[ Peptidyl-prolyl cis-trans isomerase A-like 4...
1,tr,A0A1B0GTG8,"[ Uncharacterized protein , OX=9606 GN=LOC105..."
2,sp,A1L190,[ Synaptonemal complex central element protein...
3,sp,A6NC57,[ Ankyrin repeat domain-containing protein 62 ...
4,sp,A6NJR5,"[ Putative speedy protein-like protein 3 , OX..."
...,...,...,...
20593,sp,Q9UBX5,"[ Fibulin-5 , OX=9606 GN=FBLN5 PE=1 SV=1]"
20594,sp,Q9UBY0,"[ Sodium/hydrogen exchanger 2 , OX=9606 GN=SL..."
20595,sp,Q9UER7,"[ Death domain-associated protein 6 , OX=9606..."
20596,sp,Q9UKI9,"[ POU domain, class 2, transcription factor 3 ..."


In [13]:
dfy = dfx.copy()
split_df = pd.DataFrame(dfy['temp_col'].tolist(), columns=['Protein name', 'info']) # 

dfy

Unnamed: 0,type,id,temp_col
0,sp,A0A0B4J2A2,[ Peptidyl-prolyl cis-trans isomerase A-like 4...
1,tr,A0A1B0GTG8,"[ Uncharacterized protein , OX=9606 GN=LOC105..."
2,sp,A1L190,[ Synaptonemal complex central element protein...
3,sp,A6NC57,[ Ankyrin repeat domain-containing protein 62 ...
4,sp,A6NJR5,"[ Putative speedy protein-like protein 3 , OX..."
...,...,...,...
20593,sp,Q9UBX5,"[ Fibulin-5 , OX=9606 GN=FBLN5 PE=1 SV=1]"
20594,sp,Q9UBY0,"[ Sodium/hydrogen exchanger 2 , OX=9606 GN=SL..."
20595,sp,Q9UER7,"[ Death domain-associated protein 6 , OX=9606..."
20596,sp,Q9UKI9,"[ POU domain, class 2, transcription factor 3 ..."


In [14]:

dfz = dfy.copy()
dfz.drop("temp_col", axis=1, inplace=True) # dropping temp_col
dfz = pd.concat([dfz, split_df], axis=1) # concatenating both 
dfz

Unnamed: 0,type,id,Protein name,info
0,sp,A0A0B4J2A2,Peptidyl-prolyl cis-trans isomerase A-like 4C,OX=9606 GN=PPIAL4C PE=2 SV=1
1,tr,A0A1B0GTG8,Uncharacterized protein,OX=9606 GN=LOC105372440 PE=4 SV=1
2,sp,A1L190,Synaptonemal complex central element protein 3,OX=9606 GN=SYCE3 PE=1 SV=1
3,sp,A6NC57,Ankyrin repeat domain-containing protein 62,OX=9606 GN=ANKRD62 PE=2 SV=4
4,sp,A6NJR5,Putative speedy protein-like protein 3,OX=9606 PE=5 SV=3
...,...,...,...,...
20593,sp,Q9UBX5,Fibulin-5,OX=9606 GN=FBLN5 PE=1 SV=1
20594,sp,Q9UBY0,Sodium/hydrogen exchanger 2,OX=9606 GN=SLC9A2 PE=2 SV=1
20595,sp,Q9UER7,Death domain-associated protein 6,OX=9606 GN=DAXX PE=1 SV=2
20596,sp,Q9UKI9,"POU domain, class 2, transcription factor 3",OX=9606 GN=POU2F3 PE=2 SV=3


In [26]:
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

fasta = open("human_proteome.fasta")
df_drugs = pd.read_csv('cann_mols.csv')  

fasta_list = list(SimpleFastaParser(fasta))

protein_df = pd.DataFrame(fasta_list, columns=['info', 'sequence'])

def string_splitter(string):
    string = string.split("HUMAN",1)[1]
    string = string.split("OS=Homo sapiens")
    
    return string

def info_parser(dfx):
    df = dfx.copy()
    df = df['info'].str.split('|', expand=True) # split on the "|" character
    df.columns = ['type', 'id', 'info'] # rename the three columns

    return df

def info_pre_processed(dfx):
    df = dfx.copy() # create a copy of df
    list_of_info = list(dfx['info']) # convert column values to list
    list_of_info = [string_splitter(x) for x in list_of_info] # apply string_splitter function to list elements
    df['temp_col'] = list_of_info # create a temporary column from the processed list
    df.drop("info", axis=1, inplace=True) # drop info column
    split_df = pd.DataFrame(df['temp_col'].tolist(), columns=['Protein name', 'info']) # creating this dataframe to merge back onto processed dataframe
    df.drop("temp_col", axis=1, inplace=True) # dropping temp_col
    df = pd.concat([df, split_df], axis=1) # merging both dataframes

    return df
    
def info_processed(dfx):
    df = dfx.copy() # create a copy of the dataframe
    split_df = df['info'].str.split(' ', expand=True) # creating a seperate dataframe to merge with
    df = pd.concat([df, split_df], axis=1) # merge dataframes
    df.drop("info", axis=1, inplace=True) # drop info column
    df.columns = ['type', 'id', 'Protein name', 'drop', 'Species', 'Gene', 'PE', 'Mutation'] # rename columns
    df.drop("drop", axis=1, inplace=True) # drop empty column
    df = pd.concat([df, protein_df], axis=1) # merge dataframes
    df.drop("info", axis=1, inplace=True) # drop info column
    df.drop('Species', axis=1, inplace=True) # drop column
    df['Gene'] = df['Gene'].str[3:] # strip first 3 characters
    
    return df

clean_proteins =(protein_df.
                 pipe(info_parser).
                 pipe(info_pre_processed).
                 pipe(info_processed))

df_targets = clean_proteins[['Protein name','sequence']].copy() # making a copy of the clean dataframe.

df_targets['drug_name'] = df_drugs['Name'][0] # adding a column with a constant value of "cannaflavin a".
df_targets['SMILES'] = df_drugs['SMILES'][0] # adding a column with a constant value of "cannaflavin a" SMILES string.

target_name = df_targets['Protein name'].tolist()
target = df_targets.sequence.tolist()
drug_name = df_targets.drug_name.tolist()
drug = df_targets.SMILES.tolist()

df_targets.to_csv('proteome_clean.csv') # save dataframe as csv