In [1]:
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

# Parse every FASTA record while the file is open; the context manager closes
# the handle as soon as the records have been materialised into a list.
with open("human_proteome.fasta") as fasta:
    fasta_list = list(SimpleFastaParser(fasta))

# Table of candidate drug molecules (the 'Name' and 'SMILES' columns are read further down).
df_drugs = pd.read_csv('cann_mols.csv')

# One row per FASTA record: the raw header line and the sequence.
protein_df = pd.DataFrame(fasta_list, columns=['info', 'sequence'])

def string_splitter(string):
    """Cut a header string into the pieces around the ``OS=Homo sapiens`` tag.

    Everything up to and including the first ``HUMAN`` is discarded; what is
    left is split at every occurrence of ``OS=Homo sapiens``.

    Returns:
        list[str]: the pieces, whitespace untouched.

    Raises:
        IndexError: if ``HUMAN`` does not occur in ``string``.
    """
    after_marker = string.split("HUMAN", 1)[1]
    return after_marker.split("OS=Homo sapiens")

def info_parser(dfx):
    """Split the ``info`` column on ``|`` into three columns.

    Args:
        dfx: DataFrame with a string column ``info``.

    Returns:
        A new DataFrame with columns ``type``, ``id`` and ``info`` holding the
        three ``|``-separated pieces. Other columns of ``dfx`` are not carried
        over, and ``dfx`` itself is left unchanged.

    Raises:
        ValueError: if the split does not produce exactly three columns.
    """
    pieces = dfx['info'].str.split('|', expand=True)
    pieces.columns = ['type', 'id', 'info']
    return pieces

def info_pre_processed(dfx):
    """Replace ``info`` with a ``Protein name`` column and a trimmed ``info``.

    Each ``info`` value is passed through ``string_splitter``; the resulting
    pieces become the columns ``Protein name`` and ``info`` of a helper frame,
    which is placed to the right of the remaining columns of ``dfx``.

    The helper frame carries a fresh default index, so the column-wise concat
    lines rows up correctly only when ``dfx`` also has a default 0..n-1 index.

    Args:
        dfx: DataFrame containing an ``info`` column of strings.

    Returns:
        A new DataFrame; ``dfx`` is not modified.
    """
    pieces = [string_splitter(header) for header in dfx['info']]
    name_and_rest = pd.DataFrame(pieces, columns=['Protein name', 'info'])
    other_columns = dfx.drop(columns="info")
    return pd.concat([other_columns, name_and_rest], axis=1)
    
def info_processed(dfx, sequences=None):
    """Turn the trailing header tags into columns and attach the sequences.

    The ``info`` column is expected to hold space-separated ``KEY=value`` tags
    (``GN=``, ``PE=``, ``SV=`` are read). Each tag is looked up by its key, so a
    header that lacks one of them gets a missing value in that column instead
    of having the remaining tags shifted into the wrong columns.

    Args:
        dfx: DataFrame with an ``info`` column; all other columns are kept.
        sequences: DataFrame with a ``sequence`` column, aligned to ``dfx`` by
            index. Defaults to the notebook-level ``protein_df``.

    Returns:
        A new DataFrame with ``info`` removed and these columns appended:
        ``Gene`` (value after ``GN=``), ``PE`` (the whole ``PE=...`` token),
        ``Mutation`` (the whole ``SV=...`` token) and ``sequence``.
        ``dfx`` is not modified.
    """
    if sequences is None:
        # Backward-compatible default: the frame built from the FASTA file.
        sequences = protein_df

    tags = dfx['info']
    out = dfx.drop(columns='info')

    # Anchor each key at a token boundary so e.g. 'XGN=' is not mistaken for 'GN='.
    out['Gene'] = tags.str.extract(r'(?:^|\s)GN=(\S+)', expand=False)
    # PE and Mutation keep their 'PE=' / 'SV=' prefix, matching the previous output.
    out['PE'] = tags.str.extract(r'(?:^|\s)(PE=\S+)', expand=False)
    out['Mutation'] = tags.str.extract(r'(?:^|\s)(SV=\S+)', expand=False)

    return pd.concat([out, sequences[['sequence']]], axis=1)

# Run the three header-cleaning stages, in order, on the raw FASTA frame.
clean_proteins = info_processed(info_pre_processed(info_parser(protein_df)))

# Keep only name + sequence and tag every row with the first drug listed in
# df_drugs (its 'Name' and 'SMILES' values at index label 0).
df_targets = clean_proteins[['Protein name', 'sequence']].assign(
    drug_name=df_drugs['Name'][0],
    SMILES=df_drugs['SMILES'][0],
)

# Plain-list views of the four columns.
target_name = df_targets['Protein name'].tolist()
target = df_targets['sequence'].tolist()
drug_name = df_targets['drug_name'].tolist()
drug = df_targets['SMILES'].tolist()

# Persist the target table (the index is written as the first CSV column).
df_targets.to_csv('proteome_clean.csv')

In [2]:
# Read the whole results table as text; the context manager closes the handle.
with open('virtual_screening_Transformer_CNN_BindingDB.txt', 'r') as f:
    file_contents = f.read()

In [9]:
# Turn the ASCII table into rows of cells: skip the '+-' border lines, split
# the remaining lines on '|' and strip the padding. Empty fragments (before the
# first and after the last '|') are dropped so each row keeps four cells.
table_list = [
    [cell.strip() for cell in row_text.split('|') if cell]
    for row_text in file_contents.strip().split('\n')
    if '+-' not in row_text
]

# The first parsed row is dropped (presumably the table's own header line).
df = pd.DataFrame(
    table_list,
    columns=['Rank', 'Drug Name', 'Target Name', 'Binding Score'],
).iloc[1:, :]
df['Binding Score'] = df['Binding Score'].astype(float)  # parsed as text, needed as numbers

In [24]:
# Keep the 1000 rows with the highest 'Binding Score', sorted from highest to lowest.
df_top_1000 = df.nlargest(1000,'Binding Score')

In [27]:
# Save the top-1000 table; the row index is written as the first CSV column.
df_top_1000.to_csv('df_top_1000.csv')

In [25]:
# How many of the top-1000 rows each target appears in, most frequent first.
df_counts = df_top_1000['Target Name'].value_counts()

In [28]:
# Show the 20 most frequent targets among the top-1000 rows.
df_counts.head(20)

P20309    8
P25100    8
P08912    8
P35372    8
P21554    8
P41143    8
P21917    8
Q16790    8
P28221    8
O43603    8
P35367    8
Q9Y5N1    8
Q08345    8
P29275    8
P30989    7
P30559    7
P41146    7
P08173    7
Q9Y5Y4    7
Q04609    7
Name: Target Name, dtype: int64

In [30]:
# Number of distinct targets among the top-1000 rows.
len(set(df_top_1000['Target Name'].tolist()))

312

In [39]:
# Base names (without the '.txt' extension) of the result tables that
# results_reader() loads.
trans_cnn_bdb = 'virtual_screening_Transformer_CNN_BindingDB'
mpnn_cnn_davis = 'virtual_screening_MPNN_CNN_DAVIS'
morgan_cnn_bdb_IC50 = 'virtual_screening_Morgan_CNN_BindingDB_IC50'
morgan_cnn_bdb = 'virtual_screening_Morgan_CNN_BindingDB'
# NOTE(review): the variable name has no '_IC50' suffix but the file name does,
# unlike the two Morgan entries above — confirm this is the intended file.
cnn_cnn_bdb = 'virtual_screening_CNN_CNN_BindingDB_IC50'

In [43]:
def results_reader(result_set, top_n=1000):
    """Load one ASCII results table and return its best-scoring rows.

    The file ``result_set + '.txt'`` is expected to contain a '|'-delimited
    table with '+-' border lines and four cells per row. Border lines are
    skipped, the first remaining row is dropped, and 'Binding Score' is
    converted to float.

    Args:
        result_set: path of the results file without the '.txt' extension.
        top_n: number of highest-scoring rows to return (default 1000).

    Returns:
        DataFrame with columns 'Rank', 'Drug Name', 'Target Name' and
        'Binding Score', sorted by 'Binding Score' from highest to lowest and
        holding at most ``top_n`` rows. 'Rank' and the name columns stay as
        strings.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a row does not have four cells or a score is not numeric.
    """
    # The context manager closes the handle even if reading fails.
    with open(result_set + '.txt', 'r') as handle:
        file_contents = handle.read()

    # Empty fragments (before the first / after the last '|') are dropped
    # before stripping, which keeps every row at four cells.
    rows = [
        [cell.strip() for cell in line.split('|') if cell]
        for line in file_contents.strip().split('\n')
        if '+-' not in line  # skip the table's border lines
    ]

    table = pd.DataFrame(rows, columns=['Rank', 'Drug Name', 'Target Name', 'Binding Score'])
    table = table.iloc[1:, :]  # drop the first parsed row
    table['Binding Score'] = table['Binding Score'].astype(float)

    return table.nlargest(top_n, 'Binding Score')

In [41]:
# Load the top rows of each of the five result sets, in the order listed.
(
    df_virtual_screen_1,
    df_virtual_screen_2,
    df_virtual_screen_3,
    df_virtual_screen_4,
    df_virtual_screen_5,
) = (
    results_reader(result_set)
    for result_set in (
        trans_cnn_bdb,
        mpnn_cnn_davis,
        morgan_cnn_bdb_IC50,
        morgan_cnn_bdb,
        cnn_cnn_bdb,
    )
)

In [42]:
# Stack the five per-model tables row-wise. Each table keeps its own row
# labels, so labels repeat across the combined frame.
virtual_screen_5000 = pd.concat(
    [
        df_virtual_screen_1,
        df_virtual_screen_2,
        df_virtual_screen_3,
        df_virtual_screen_4,
        df_virtual_screen_5,
    ]
)

In [44]:
# Display the combined table of top rows from all five result sets.
virtual_screen_5000

Unnamed: 0,Rank,Drug Name,Target Name,Binding Score
1,1,Cannflavin C,P20309,9.46
2,2,Cannflavin C,P35372,9.38
3,3,Quercetin,P20309,9.31
4,4,Luteolin,P20309,9.25
5,5,Cannflavin C,O94804,9.21
...,...,...,...,...
996,996,Cannflavin C,P13612,6.85
997,997,Cannflavin B,Q8NH73,6.85
998,998,Cannflavin C,P07949,6.85
999,999,Cannflavin A,Q6ZUK4,6.85


In [45]:
# For each drug, count how many times each target occurs in the combined table.
virtual_screen_5000.groupby('Drug Name')['Target Name'].value_counts()

Drug Name           Target Name
6-prenylnaringenin  P0DMS8         3
                    P25103         3
                    P30989         3
                    Q2M2I8         3
                    Q9Y5N1         3
                                  ..
Quercetin           Q9Y2U5         1
                    Q9Y463         1
                    Q9Y4K4         1
                    Q9Y5X5         1
                    Q9Y5X9         1
Name: Target Name, Length: 4478, dtype: int64