In [32]:
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

# Parse the FASTA file into (header, sequence) tuples. The context manager
# closes the handle as soon as parsing is finished instead of leaking it for
# the lifetime of the kernel; `fasta` stays bound to the (closed) file object.
with open("uniprot_200k.fasta") as fasta:
    fasta_list = list(SimpleFastaParser(fasta))

# Drug table; later code reads its 'Name' and 'SMILES' columns.
df_drugs = pd.read_csv('cann_mols.csv')

# One row per FASTA record: raw header text in 'info', sequence in 'sequence'.
protein_df = pd.DataFrame(fasta_list, columns=['info', 'sequence'])

def string_splitter(string):
    """Cut a header fragment into the pieces around ``OS=Homo sapiens``.

    Everything up to and including the first ``HUMAN`` is discarded, and the
    remainder is split on every occurrence of ``OS=Homo sapiens``.

    Returns:
        list[str]: the pieces, whitespace untouched (two items when the
        marker occurs exactly once).

    Raises:
        IndexError: if ``HUMAN`` does not occur in ``string``.
    """
    after_human = string.split("HUMAN", 1)[1]
    return after_human.split("OS=Homo sapiens")

def info_parser(dfx):
    """Break the raw header text in ``dfx['info']`` into its '|' fields.

    Returns a new frame with columns 'type', 'id' and 'info' on the same
    index as ``dfx``. ``dfx`` itself is not modified and none of its other
    columns are carried over.

    Raises:
        ValueError: if the split does not yield exactly three columns.
    """
    fields = dfx['info'].str.split('|', expand=True)  # one column per '|' field
    fields.columns = ['type', 'id', 'info']
    return fields

def info_parser_200k(dfx):
    """Split ``dfx['info']`` on '|' and keep the first three fields.

    The previous version required the split to produce exactly four columns
    and dropped the fourth. Selecting the first three columns by position
    gives the same result for that input and also accepts headers that
    split into three, or more than four, fields.

    Returns a new frame with columns 'type', 'id' and 'info' on the same
    index as ``dfx``; ``dfx`` is not modified.

    Raises:
        ValueError: if the split yields fewer than three columns.
    """
    fields = dfx['info'].str.split('|', expand=True)
    fields = fields.iloc[:, :3].copy()  # discard any fields beyond the third
    fields.columns = ['type', 'id', 'info']
    return fields

def info_pre_processed(dfx):
    """Replace the 'info' column with 'Protein name' and a trimmed 'info'.

    Each value of ``dfx['info']`` is passed through ``string_splitter``; the
    first piece becomes 'Protein name' and the second the new 'info'. All
    other columns of ``dfx`` are kept, in their original order, ahead of the
    two new columns. ``dfx`` is not modified.

    Raises:
        ValueError: if any header splits into more than two pieces.
    """
    pieces = [string_splitter(header) for header in dfx['info']]
    # Build the split frame on dfx's own index: concat(axis=1) aligns on index
    # labels, so a fresh 0..n-1 index would misalign rows (and introduce NaN
    # rows) whenever dfx does not carry a default RangeIndex.
    split_df = pd.DataFrame(pieces, columns=['Protein name', 'info'], index=dfx.index)
    return pd.concat([dfx.drop("info", axis=1), split_df], axis=1)
    
def info_processed(dfx):
    """Expand the space-separated 'info' tokens and attach the sequences.

    ``dfx['info']`` is split on single spaces into one column per token and
    appended to the remaining columns of ``dfx``; the combined frame must have
    exactly eight columns, which are renamed to 'type', 'id', 'Protein name',
    'drop', 'Species', 'Gene', 'PE', 'Mutation'. 'drop' and 'Species' are
    discarded, and the first three characters of each 'Gene' value are removed
    (presumably a 'GN=' prefix - confirm against the headers).

    Reads the module-level ``protein_df``: it is joined column-wise (aligned on
    index) and its 'info' column is dropped again. ``dfx`` is not modified.
    """
    tokens = dfx['info'].str.split(' ', expand=True)  # one column per token
    merged = pd.concat([dfx, tokens], axis=1).drop("info", axis=1)
    merged.columns = ['type', 'id', 'Protein name', 'drop', 'Species', 'Gene', 'PE', 'Mutation']
    merged = merged.drop("drop", axis=1)
    # Bring in the sequences, then discard the raw header and the species token.
    merged = pd.concat([merged, protein_df], axis=1)
    merged = merged.drop(["info", "Species"], axis=1)
    merged['Gene'] = merged['Gene'].str[3:]
    return merged

def info_processed_200k(dfx):
    """Reduce the parsed headers to 'id', 'Protein name', 'Gene' plus sequences.

    ``dfx['info']`` is split on single spaces and appended to the remaining
    columns of ``dfx``. Columns are then selected purely by position: the
    combined frame must have exactly 13 columns, of which positions 1, 2 and 5
    survive and are renamed. The first three characters of each 'Gene' value
    are removed (presumably a 'GN=' prefix - confirm against the headers).

    Reads the module-level ``protein_df``: it is joined column-wise (aligned on
    index) and its 'info' column is dropped again. ``dfx`` is not modified.
    """
    tokens = dfx['info'].str.split(' ', expand=True)  # one column per token
    wide = pd.concat([dfx, tokens], axis=1).drop("info", axis=1)

    # Labels of the positions to discard; everything else is kept.
    unwanted = wide.columns[[0, 3, 4, 6, 7, 8, 9, 10, 11, 12]]
    slim = wide.drop(unwanted, axis=1)
    slim.columns = ['id', 'Protein name', 'Gene']

    # Attach the sequences and discard the raw header that comes with them.
    slim = pd.concat([slim, protein_df], axis=1).drop("info", axis=1)
    slim['Gene'] = slim['Gene'].str[3:]
    return slim

# Header-cleaning pipeline for the 200k file, innermost call first:
# '|' split -> protein-name split -> token split + sequence join.
# (info_parser / info_processed are the variants for three-field headers;
# they are not used here.)
clean_proteins = info_processed_200k(
    info_pre_processed(
        info_parser_200k(protein_df)
    )
)

# Protein name + sequence of every cleaned record; base of each per-drug frame.
protein_targets = clean_proteins[['Protein name', 'sequence']].copy()

# Number of leading rows of cann_mols.csv that are paired with every protein.
N_DRUGS = 9

# One frame per drug: the protein table plus constant 'drug_name' / 'SMILES'
# columns taken from that drug's row. Replaces nine copy-pasted stanzas.
per_drug_targets = [
    protein_targets.assign(drug_name=df_drugs['Name'][i], SMILES=df_drugs['SMILES'][i])
    for i in range(N_DRUGS)
]

# `df_targets` keeps its previous meaning: the frame for the first drug.
df_targets = per_drug_targets[0]

# Stack the per-drug frames row-wise (the original row labels repeat per drug).
targets = pd.concat(per_drug_targets, axis=0)

# Parallel lists, one entry per (protein, drug) row.
target_name = targets['Protein name'].tolist()
target = targets.sequence.tolist()
drug_name = targets.drug_name.tolist()
drug = targets.SMILES.tolist()

targets.to_csv('targets_pre_screen.csv')
clean_proteins.to_csv('clean_proteins.csv')

In [31]:
len(target_name)

1844154

In [28]:
clean_proteins
df_targets

Unnamed: 0,Protein name,sequence
0,"T-box 2, isoform CRA_a",MREPALAASAMAYHPFHAPRPADFPMSAFLAAAQPSFFPALALPPG...
1,Receptor protein-tyrosine kinase,MELQAARACFALLWGCALAAAAAAQGKEVVLLDFAAAGGELGWLTH...
2,Battenin,MGGCAGSRRRFSDSEGEETVPEPRLPLLDHQGAHWKNAVGFWLLGL...
3,Sirtuin (Silent mating type information regul...,MADEAALALQPGGSPSAAGADREAASSPAGEPLRKRPRRDGPGLER...
4,Phospholipid phosphatase 1,MFDKTRLPYVALDVLCVLLAGLPFAILTSRHTPFQRGVFCNDESIK...
...,...,...
204901,Probable ATP-dependent RNA helicase DDX5,MSGYSSDRDRGRDRGFGAPRFGGSRAGPLSGKKFGNPGEKLVKKKW...
204902,ELKS/Rab6-interacting/CAST family member 1,MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...
204903,Voltage-dependent calcium channel subunit alp...,MKLEFLQRKFWAATRQCSTVDGPCTQSCEDSDLDCFVIDNNGFILI...
204904,ELKS/Rab6-interacting/CAST family member 1,MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...


In [None]:
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

# Parse the FASTA file into (header, sequence) tuples. The context manager
# closes the handle as soon as parsing is finished instead of leaking it for
# the lifetime of the kernel; `fasta` stays bound to the (closed) file object.
with open("uniprot_200k.fasta") as fasta:
    fasta_list = list(SimpleFastaParser(fasta))

# Drug table; later code reads its 'Name' and 'SMILES' columns.
df_drugs = pd.read_csv('cann_mols.csv')

# One row per FASTA record: raw header text in 'info', sequence in 'sequence'.
protein_df = pd.DataFrame(fasta_list, columns=['info', 'sequence'])

def string_splitter(string):
    """Cut a header fragment into the pieces around ``OS=Homo sapiens``.

    Everything up to and including the first ``HUMAN`` is discarded, and the
    remainder is split on every occurrence of ``OS=Homo sapiens``.

    Returns:
        list[str]: the pieces, whitespace untouched (two items when the
        marker occurs exactly once).

    Raises:
        IndexError: if ``HUMAN`` does not occur in ``string``.
    """
    after_human = string.split("HUMAN", 1)[1]
    return after_human.split("OS=Homo sapiens")

def info_parser(dfx):
    """Break the raw header text in ``dfx['info']`` into its '|' fields.

    Returns a new frame with columns 'type', 'id' and 'info' on the same
    index as ``dfx``. ``dfx`` itself is not modified and none of its other
    columns are carried over.

    Raises:
        ValueError: if the split does not yield exactly three columns.
    """
    fields = dfx['info'].str.split('|', expand=True)  # one column per '|' field
    fields.columns = ['type', 'id', 'info']
    return fields



def info_parser_200k(dfx):
    """Split ``dfx['info']`` on '|' and keep the first three fields.

    The previous version required the split to produce exactly four columns
    and dropped the fourth. Selecting the first three columns by position
    gives the same result for that input and also accepts headers that
    split into three, or more than four, fields.

    Returns a new frame with columns 'type', 'id' and 'info' on the same
    index as ``dfx``; ``dfx`` is not modified.

    Raises:
        ValueError: if the split yields fewer than three columns.
    """
    fields = dfx['info'].str.split('|', expand=True)
    fields = fields.iloc[:, :3].copy()  # discard any fields beyond the third
    fields.columns = ['type', 'id', 'info']
    return fields

def info_pre_processed(dfx):
    """Replace the 'info' column with 'Protein name' and a trimmed 'info'.

    Each value of ``dfx['info']`` is passed through ``string_splitter``; the
    first piece becomes 'Protein name' and the second the new 'info'. All
    other columns of ``dfx`` are kept, in their original order, ahead of the
    two new columns. ``dfx`` is not modified.

    Raises:
        ValueError: if any header splits into more than two pieces.
    """
    pieces = [string_splitter(header) for header in dfx['info']]
    # Build the split frame on dfx's own index: concat(axis=1) aligns on index
    # labels, so a fresh 0..n-1 index would misalign rows (and introduce NaN
    # rows) whenever dfx does not carry a default RangeIndex.
    split_df = pd.DataFrame(pieces, columns=['Protein name', 'info'], index=dfx.index)
    return pd.concat([dfx.drop("info", axis=1), split_df], axis=1)
    
def info_processed(dfx):
    """Expand the space-separated 'info' tokens and attach the sequences.

    ``dfx['info']`` is split on single spaces into one column per token and
    appended to the remaining columns of ``dfx``; the combined frame must have
    exactly eight columns, which are renamed to 'type', 'id', 'Protein name',
    'drop', 'Species', 'Gene', 'PE', 'Mutation'. 'drop' and 'Species' are
    discarded, and the first three characters of each 'Gene' value are removed
    (presumably a 'GN=' prefix - confirm against the headers).

    Reads the module-level ``protein_df``: it is joined column-wise (aligned on
    index) and its 'info' column is dropped again. ``dfx`` is not modified.
    """
    tokens = dfx['info'].str.split(' ', expand=True)  # one column per token
    merged = pd.concat([dfx, tokens], axis=1).drop("info", axis=1)
    merged.columns = ['type', 'id', 'Protein name', 'drop', 'Species', 'Gene', 'PE', 'Mutation']
    merged = merged.drop("drop", axis=1)
    # Bring in the sequences, then discard the raw header and the species token.
    merged = pd.concat([merged, protein_df], axis=1)
    merged = merged.drop(["info", "Species"], axis=1)
    merged['Gene'] = merged['Gene'].str[3:]
    return merged

def info_processed_200k(dfx):
    """Reduce the parsed headers to 'id', 'Protein name', 'Gene' plus sequences.

    ``dfx['info']`` is split on single spaces and appended to the remaining
    columns of ``dfx``. Columns are then selected purely by position: the
    combined frame must have exactly 13 columns, of which positions 1, 2 and 5
    survive and are renamed. The first three characters of each 'Gene' value
    are removed (presumably a 'GN=' prefix - confirm against the headers).

    Reads the module-level ``protein_df``: it is joined column-wise (aligned on
    index) and its 'info' column is dropped again. ``dfx`` is not modified.
    """
    tokens = dfx['info'].str.split(' ', expand=True)  # one column per token
    wide = pd.concat([dfx, tokens], axis=1).drop("info", axis=1)

    # Labels of the positions to discard; everything else is kept.
    unwanted = wide.columns[[0, 3, 4, 6, 7, 8, 9, 10, 11, 12]]
    slim = wide.drop(unwanted, axis=1)
    slim.columns = ['id', 'Protein name', 'Gene']

    # Attach the sequences and discard the raw header that comes with them.
    slim = pd.concat([slim, protein_df], axis=1).drop("info", axis=1)
    slim['Gene'] = slim['Gene'].str[3:]
    return slim

# Header-cleaning pipeline for the 200k file, innermost call first:
# '|' split -> protein-name split -> token split + sequence join.
# (info_parser / info_processed are the variants for three-field headers;
# they are not used here.)
clean_proteins = info_processed_200k(
    info_pre_processed(
        info_parser_200k(protein_df)
    )
)

# Protein name + sequence of every cleaned record; base of each per-drug frame.
protein_targets = clean_proteins[['Protein name', 'sequence']].copy()

# Number of leading rows of cann_mols.csv that are paired with every protein.
N_DRUGS = 9

# One frame per drug: the protein table plus constant 'drug_name' / 'SMILES'
# columns taken from that drug's row. Replaces nine copy-pasted stanzas.
per_drug_targets = [
    protein_targets.assign(drug_name=df_drugs['Name'][i], SMILES=df_drugs['SMILES'][i])
    for i in range(N_DRUGS)
]

# `df_targets` keeps its previous meaning: the frame for the first drug.
df_targets = per_drug_targets[0]

# Stack the per-drug frames row-wise (the original row labels repeat per drug).
targets = pd.concat(per_drug_targets, axis=0)

# Parallel lists, one entry per (protein, drug) row.
target_name = targets['Protein name'].tolist()
target = targets.sequence.tolist()
drug_name = targets.drug_name.tolist()
drug = targets.SMILES.tolist()

# NOTE(review): this rebinds `targets` to whatever is already on disk instead
# of writing the frame built above (the lists were taken from the in-memory
# frame). Confirm whether `targets.to_csv(...)` was intended here.
targets = pd.read_csv('targets_pre_screen.csv')
clean_proteins.to_csv('clean_proteins.csv')

In [None]:
import pandas as pd
from DeepPurpose import DTI as models
# Reload the protein/drug pairing table from disk.
targets = pd.read_csv('targets_pre_screen.csv')

# Parallel lists, one entry per row of the table.
target_name = targets['Protein name'].tolist()
target = targets['sequence'].tolist()
drug_name = targets['drug_name'].tolist()
drug = targets['SMILES'].tolist()

# Virtual screening using the pre-trained 'Morgan_AAC_BindingDB_IC50' model.
net = models.model_pretrained(model = 'Morgan_AAC_BindingDB_IC50')

# The return value is bound to `_` and not used further in this cell.
_ = models.virtual_screening(drug, target, net, drug_name, target_name)

In [35]:
from Bio.SeqIO.FastaIO import SimpleFastaParser

# Parse the FASTA file into (header, sequence) tuples. The context manager
# closes the handle as soon as parsing is finished instead of leaking it for
# the lifetime of the kernel; `fasta` stays bound to the (closed) file object.
with open("uniprot_200k.fasta") as fasta:
    fasta_list = list(SimpleFastaParser(fasta))

# Drug table; later code reads its 'Name' and 'SMILES' columns.
df_drugs = pd.read_csv('cann_mols.csv')

# One row per FASTA record: raw header text in 'info', sequence in 'sequence'.
protein_df = pd.DataFrame(fasta_list, columns=['info', 'sequence'])

def string_splitter(string):
    """Cut a header fragment into the pieces around ``OS=Homo sapiens``.

    Everything up to and including the first ``HUMAN`` is discarded, and the
    remainder is split on every occurrence of ``OS=Homo sapiens``.

    Returns:
        list[str]: the pieces, whitespace untouched (two items when the
        marker occurs exactly once).

    Raises:
        IndexError: if ``HUMAN`` does not occur in ``string``.
    """
    after_human = string.split("HUMAN", 1)[1]
    return after_human.split("OS=Homo sapiens")

def info_parser(dfx):
    """Break the raw header text in ``dfx['info']`` into its '|' fields.

    Returns a new frame with columns 'type', 'id' and 'info' on the same
    index as ``dfx``. ``dfx`` itself is not modified and none of its other
    columns are carried over.

    Raises:
        ValueError: if the split does not yield exactly three columns.
    """
    fields = dfx['info'].str.split('|', expand=True)  # one column per '|' field
    fields.columns = ['type', 'id', 'info']
    return fields



def info_parser_200k(dfx):
    """Split ``dfx['info']`` on '|' and keep the first three fields.

    The previous version required the split to produce exactly four columns
    and dropped the fourth. Selecting the first three columns by position
    gives the same result for that input and also accepts headers that
    split into three, or more than four, fields.

    Returns a new frame with columns 'type', 'id' and 'info' on the same
    index as ``dfx``; ``dfx`` is not modified.

    Raises:
        ValueError: if the split yields fewer than three columns.
    """
    fields = dfx['info'].str.split('|', expand=True)
    fields = fields.iloc[:, :3].copy()  # discard any fields beyond the third
    fields.columns = ['type', 'id', 'info']
    return fields

def info_pre_processed(dfx):
    """Replace the 'info' column with 'Protein name' and a trimmed 'info'.

    Each value of ``dfx['info']`` is passed through ``string_splitter``; the
    first piece becomes 'Protein name' and the second the new 'info'. All
    other columns of ``dfx`` are kept, in their original order, ahead of the
    two new columns. ``dfx`` is not modified.

    Raises:
        ValueError: if any header splits into more than two pieces.
    """
    pieces = [string_splitter(header) for header in dfx['info']]
    # Build the split frame on dfx's own index: concat(axis=1) aligns on index
    # labels, so a fresh 0..n-1 index would misalign rows (and introduce NaN
    # rows) whenever dfx does not carry a default RangeIndex.
    split_df = pd.DataFrame(pieces, columns=['Protein name', 'info'], index=dfx.index)
    return pd.concat([dfx.drop("info", axis=1), split_df], axis=1)
    
def info_processed(dfx):
    """Expand the space-separated 'info' tokens and attach the sequences.

    ``dfx['info']`` is split on single spaces into one column per token and
    appended to the remaining columns of ``dfx``; the combined frame must have
    exactly eight columns, which are renamed to 'type', 'id', 'Protein name',
    'drop', 'Species', 'Gene', 'PE', 'Mutation'. 'drop' and 'Species' are
    discarded, and the first three characters of each 'Gene' value are removed
    (presumably a 'GN=' prefix - confirm against the headers).

    Reads the module-level ``protein_df``: it is joined column-wise (aligned on
    index) and its 'info' column is dropped again. ``dfx`` is not modified.
    """
    tokens = dfx['info'].str.split(' ', expand=True)  # one column per token
    merged = pd.concat([dfx, tokens], axis=1).drop("info", axis=1)
    merged.columns = ['type', 'id', 'Protein name', 'drop', 'Species', 'Gene', 'PE', 'Mutation']
    merged = merged.drop("drop", axis=1)
    # Bring in the sequences, then discard the raw header and the species token.
    merged = pd.concat([merged, protein_df], axis=1)
    merged = merged.drop(["info", "Species"], axis=1)
    merged['Gene'] = merged['Gene'].str[3:]
    return merged

def info_processed_200k(dfx):
    """Reduce the parsed headers to 'id', 'Protein name', 'Gene' plus sequences.

    ``dfx['info']`` is split on single spaces and appended to the remaining
    columns of ``dfx``. Columns are then selected purely by position: the
    combined frame must have exactly 13 columns, of which positions 1, 2 and 5
    survive and are renamed. The first three characters of each 'Gene' value
    are removed (presumably a 'GN=' prefix - confirm against the headers).

    Reads the module-level ``protein_df``: it is joined column-wise (aligned on
    index) and its 'info' column is dropped again. ``dfx`` is not modified.
    """
    tokens = dfx['info'].str.split(' ', expand=True)  # one column per token
    wide = pd.concat([dfx, tokens], axis=1).drop("info", axis=1)

    # Labels of the positions to discard; everything else is kept.
    unwanted = wide.columns[[0, 3, 4, 6, 7, 8, 9, 10, 11, 12]]
    slim = wide.drop(unwanted, axis=1)
    slim.columns = ['id', 'Protein name', 'Gene']

    # Attach the sequences and discard the raw header that comes with them.
    slim = pd.concat([slim, protein_df], axis=1).drop("info", axis=1)
    slim['Gene'] = slim['Gene'].str[3:]
    return slim

# Header-cleaning pipeline for the 200k file, innermost call first:
# '|' split -> protein-name split -> token split + sequence join.
# (info_parser / info_processed are the variants for three-field headers;
# they are not used here.)
clean_proteins = info_processed_200k(
    info_pre_processed(
        info_parser_200k(protein_df)
    )
)

# Pair every cleaned protein with the first drug of cann_mols.csv: the
# protein name/sequence columns plus constant 'drug_name' and 'SMILES' columns.
targets = clean_proteins[['Protein name', 'sequence']].assign(
    drug_name=df_drugs['Name'][0],
    SMILES=df_drugs['SMILES'][0],
)

# Parallel lists, one entry per row of `targets`.
target_name = targets['Protein name'].tolist()
target = targets['sequence'].tolist()
drug_name = targets['drug_name'].tolist()
drug = targets['SMILES'].tolist()

targets.to_csv('targets_pre_screen_cann_a.csv')
# (the clean_proteins.to_csv('clean_proteins.csv') export is disabled in this cell)

Unnamed: 0,Protein name,sequence,drug_name,SMILES
0,"T-box 2, isoform CRA_a",MREPALAASAMAYHPFHAPRPADFPMSAFLAAAQPSFFPALALPPG...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
1,Receptor protein-tyrosine kinase,MELQAARACFALLWGCALAAAAAAQGKEVVLLDFAAAGGELGWLTH...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
2,Battenin,MGGCAGSRRRFSDSEGEETVPEPRLPLLDHQGAHWKNAVGFWLLGL...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
3,Sirtuin (Silent mating type information regul...,MADEAALALQPGGSPSAAGADREAASSPAGEPLRKRPRRDGPGLER...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
4,Phospholipid phosphatase 1,MFDKTRLPYVALDVLCVLLAGLPFAILTSRHTPFQRGVFCNDESIK...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
...,...,...,...,...
204901,Probable ATP-dependent RNA helicase DDX5,MSGYSSDRDRGRDRGFGAPRFGGSRAGPLSGKKFGNPGEKLVKKKW...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
204902,ELKS/Rab6-interacting/CAST family member 1,MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
204903,Voltage-dependent calcium channel subunit alp...,MKLEFLQRKFWAATRQCSTVDGPCTQSCEDSDLDCFVIDNNGFILI...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
204904,ELKS/Rab6-interacting/CAST family member 1,MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...,Cannflavin A,CC(=CCCC(=CCC1=C(C2=C(C=C1O)OC(=CC2=O)C3=CC(=C...
