In [52]:
import pandas as pd
from Bio.SeqIO.FastaIO import SimpleFastaParser

fasta = open("human_proteome.fasta")
df_drugs = pd.read_csv('cann_mols.csv')  

fasta_list = list(SimpleFastaParser(fasta))

protein_df = pd.DataFrame(fasta_list, columns=['info', 'sequence'])

def string_splitter(string):
    string = string.split("HUMAN",1)[1]
    string = string.split("OS=Homo sapiens")
    
    return string

def info_parser(dfx):
    df = dfx.copy()
    df = df['info'].str.split('|', expand=True) # split on the "|" character
    df.columns = ['type', 'id', 'info'] # rename the three columns

    return df

def info_pre_processed(dfx):
    df = dfx.copy() # create a copy of df
    list_of_info = list(dfx['info']) # convert column values to list
    list_of_info = [string_splitter(x) for x in list_of_info] # apply string_splitter function to list elements
    df['temp_col'] = list_of_info # create a temporary column from the processed list
    df.drop("info", axis=1, inplace=True) # drop info column
    split_df = pd.DataFrame(df['temp_col'].tolist(), columns=['Protein name', 'info']) # creating this dataframe to merge back onto processed dataframe
    df.drop("temp_col", axis=1, inplace=True) # dropping temp_col
    df = pd.concat([df, split_df], axis=1) # merging both dataframes

    return df
    
def info_processed(dfx):
    df = dfx.copy() # create a copy of the dataframe
    split_df = df['info'].str.split(' ', expand=True) # creating a seperate dataframe to merge with
    df = pd.concat([df, split_df], axis=1) # merge dataframes
    df.drop("info", axis=1, inplace=True) # drop info column
    df.columns = ['type', 'id', 'Protein name', 'drop', 'Species', 'Gene', 'PE', 'Mutation'] # rename columns
    df.drop("drop", axis=1, inplace=True) # drop empty column
    df = pd.concat([df, protein_df], axis=1) # merge dataframes
    df.drop("info", axis=1, inplace=True) # drop info column
    df.drop('Species', axis=1, inplace=True) # drop column
    df['Gene'] = df['Gene'].str[3:] # strip first 3 characters
    
    return df

clean_proteins =(protein_df.
                 pipe(info_parser).
                 pipe(info_pre_processed).
                 pipe(info_processed))

df_targets = clean_proteins[['Protein name','sequence']].copy() # making a copy of the clean dataframe.

df_targets['drug_name'] = df_drugs['Name'][0] # adding a column with a constant value of "cannaflavin a".
df_targets['SMILES'] = df_drugs['SMILES'][0] # adding a column with a constant value of "cannaflavin a" SMILES string.

target_name = df_targets['Protein name'].tolist()
target = df_targets.sequence.tolist()
drug_name = df_targets.drug_name.tolist()
drug = df_targets.SMILES.tolist()

df_targets.to_csv('proteome_clean.csv') # save dataframe as csv

In [2]:
f = open('virtual_screening_Transformer_CNN_BindingDB.txt', 'r')
file_contents = f.read()

In [9]:
table_list = [
    [item.strip() for item in line.split('|') if item]  # maintain the number of columns in rows.
    for line in file_contents.strip().split('\n')
    if '+-' not in line  # discard +-
]

df = pd.DataFrame(table_list, columns = ['Rank', 'Drug Name', 'Target Name', 'Binding Score'])
df = df.iloc[1: , :]
df['Binding Score'] = df['Binding Score'].astype(float)

In [24]:
df_top_1000 = df.nlargest(1000,'Binding Score')

In [27]:
df_top_1000.to_csv('df_top_1000.csv')

In [25]:
df_counts = df_top_1000['Target Name'].value_counts()

In [28]:
df_counts.head(20)

P20309    8
P25100    8
P08912    8
P35372    8
P21554    8
P41143    8
P21917    8
Q16790    8
P28221    8
O43603    8
P35367    8
Q9Y5N1    8
Q08345    8
P29275    8
P30989    7
P30559    7
P41146    7
P08173    7
Q9Y5Y4    7
Q04609    7
Name: Target Name, dtype: int64

In [30]:
len(set(df_top_1000['Target Name'].tolist()))

312

In [53]:
trans_cnn_bdb = 'virtual_screening_Transformer_CNN_BindingDB'
mpnn_cnn_davis = 'virtual_screening_MPNN_CNN_DAVIS'
morgan_cnn_bdb_IC50 = 'virtual_screening_Morgan_CNN_BindingDB_IC50'
morgan_cnn_bdb = 'virtual_screening_Morgan_CNN_BindingDB'
cnn_cnn_bdb = 'virtual_screening_CNN_CNN_BindingDB_IC50'

In [77]:
def results_reader(result_set):
    f = open(result_set+'.txt', 'r')
    file_contents = f.read()

    table_list = [
    [item.strip() for item in line.split('|') if item]  # maintain the number of columns in rows.
    
    for line in file_contents.strip().split('\n')
    if '+-' not in line  # discard +-
    ]
        
    df = pd.DataFrame(table_list, columns = ['Rank', 'Drug Name', 'Target Name', 'Binding Score'])
    df = df.iloc[1: , :]
    df['Binding Score'] = df['Binding Score'].astype(float)
    
    df_top_20k = df.nlargest(20000,'Binding Score')
    
    #df_top_1000.to_csv('df_top_1000_'+result_set+'.csv')
    
    return df_top_20k

In [78]:
df_virtual_screen_1 = results_reader(trans_cnn_bdb)
df_virtual_screen_2 = results_reader(mpnn_cnn_davis)
df_virtual_screen_3 = results_reader(morgan_cnn_bdb_IC50)
df_virtual_screen_4 = results_reader(morgan_cnn_bdb)
df_virtual_screen_5 = results_reader(cnn_cnn_bdb)

In [69]:
virtual_screen_5000 = pd.concat([df_virtual_screen_1, 
                                 df_virtual_screen_2, 
                                 df_virtual_screen_3, 
                                 df_virtual_screen_4, 
                                 df_virtual_screen_5,], axis=0)

In [79]:
virtual_screen_100k = pd.concat([df_virtual_screen_1, 
                                   df_virtual_screen_2, 
                                   df_virtual_screen_3, 
                                   df_virtual_screen_4, 
                                   df_virtual_screen_5,], axis=0)

In [65]:
virtual_screen_5000

Unnamed: 0,Rank,Drug Name,Target Name,Binding Score
1,1,Cannflavin C,P20309,9.46
2,2,Cannflavin C,P35372,9.38
3,3,Quercetin,P20309,9.31
4,4,Luteolin,P20309,9.25
5,5,Cannflavin C,O94804,9.21
...,...,...,...,...
9996,9996,Cannflavin B,P01861,6.37
9997,9997,Cannflavin C,Q8N9V3,6.37
9998,9998,Cannflavin A,Q9HD64,6.37
9999,9999,Cannflavin B,Q969S8,6.37


In [98]:
def protein_filter(df):
    hesper_df = df[df['Drug Name'] == 'Hesperetin']
    cann_a_df = df[df['Drug Name'] == 'Cannflavin A']
    cann_b_df = df[df['Drug Name'] == 'Cannflavin B']
    cann_c_df = df[df['Drug Name'] == 'Cannflavin C']
    Cannip_df = df[df['Drug Name'] == 'Canniprene']
    Querce_df = df[df['Drug Name'] == 'Quercetin']
    Luteol_df = df[df['Drug Name'] == 'Luteolin']
    prenyl_df = df[df['Drug Name'] == '6-prenylnaringenin']
    Cannab_df = df[df['Drug Name'] == 'Cannabistilbene I']
    
    hesper_counts = (hesper_df['Target Name'].value_counts()).head(20).to_frame()
    cann_a_counts = (cann_a_df['Target Name'].value_counts()).head(20).to_frame()
    cann_b_counts = (cann_b_df['Target Name'].value_counts()).head(20).to_frame()
    cann_c_counts = (cann_c_df['Target Name'].value_counts()).head(20).to_frame()
    Cannip_counts = (Cannip_df['Target Name'].value_counts()).head(20).to_frame()
    Querce_counts = (Querce_df['Target Name'].value_counts()).head(20).to_frame()
    Luteol_counts = (Luteol_df['Target Name'].value_counts()).head(20).to_frame()
    prenyl_counts = (prenyl_df['Target Name'].value_counts()).head(20).to_frame()
    Cannab_counts = (Cannab_df['Target Name'].value_counts()).head(20).to_frame()
    #hesper_counts = hesper_counts.head(20).to_frame()
    #hesper_counts_final = pd.DataFrame(hesper_counts, columns=['Protein', 'Count'])
    proteins_180 = pd.concat([hesper_counts,
                              cann_a_counts,
                              cann_b_counts,
                              cann_c_counts,
                              Cannip_counts,
                              Querce_counts,
                              Luteol_counts,
                              prenyl_counts,
                              Cannab_counts
                             ], axis=0) # merging both dataframes
    return proteins_180

In [112]:
test = protein_filter(virtual_screen_100k)

In [113]:
test = test.reset_index()

In [116]:
test.rename(columns = {'index':'Protein', 'Target Name':'Count'}, inplace = True)

In [118]:
len(set(test.Protein.tolist()))

101

In [119]:
test.Protein.value_counts()

P25103    7
Q13639    7
Q16790    6
Q9Y5N1    6
P0DMS8    6
         ..
Q16620    1
P09874    1
P35462    1
Q17RY6    1
P49286    1
Name: Protein, Length: 101, dtype: int64

In [59]:
virtual_screen_5000.groupby('Target Name')['Drug Name'].value_counts()

Target Name  Drug Name   
A0A024RCN7   Cannflavin C    1
A0A075B6I1   Cannflavin C    1
A0A075B6K5   Cannflavin C    1
A0A075B6Q5   Cannflavin A    1
A0A075B6S6   Canniprene      1
                            ..
Q9Y6F6       Cannflavin C    1
             Canniprene      1
Q9Y6K8       Hesperetin      1
Q9Y6N6       Hesperetin      1
Q9Y6Q5       Hesperetin      1
Name: Drug Name, Length: 4478, dtype: int64

In [60]:
counts = virtual_screen_5000['Drug Name'].value_counts()

In [70]:
counts_100k = virtual_screen_5000['Drug Name'].value_counts()
counts_100k

Hesperetin            22410
Cannflavin C          16871
Luteolin              16706
Quercetin             15628
Cannflavin A           9596
6-prenylnaringenin     6656
Cannflavin B           6524
Canniprene             3297
Cannabistilbene I      2312
Name: Drug Name, dtype: int64

In [66]:
counts_50k = virtual_screen_5000['Drug Name'].value_counts()
counts_50k

Hesperetin            10303
Cannflavin C           8958
Luteolin               8786
Quercetin              7436
Cannflavin A           4894
6-prenylnaringenin     4836
Cannflavin B           2789
Canniprene             1174
Cannabistilbene I       824
Name: Drug Name, dtype: int64

In [61]:
counts

Cannflavin C          1170
Hesperetin             985
Luteolin               683
Quercetin              630
Cannflavin A           551
Canniprene             517
Cannflavin B           222
6-prenylnaringenin     181
Cannabistilbene I       61
Name: Drug Name, dtype: int64

In [72]:
hesp_df = virtual_screen_5000[virtual_screen_5000['Drug Name'] == 'Hesperetin']
hesp_df

Unnamed: 0,Rank,Drug Name,Target Name,Binding Score
13,13,Hesperetin,P35372,9.06
18,18,Hesperetin,P20309,8.99
27,27,Hesperetin,Q9Y5N1,8.89
42,42,Hesperetin,P41143,8.80
45,45,Hesperetin,P28221,8.79
...,...,...,...,...
19318,19318,Hesperetin,P31213,6.15
19375,19375,Hesperetin,P61204,6.15
19504,19504,Hesperetin,P00918,6.14
19651,19651,Hesperetin,P00491,6.14


In [73]:
hesp_counts = hesp_df['Target Name'].value_counts()

In [76]:
hesp_counts.head(20)

P21453    5
P08908    5
O43614    5
O43193    4
Q92847    4
Q99572    4
P46089    4
P25103    4
P23975    4
Q15761    4
Q13639    4
P32238    4
Q6W5P4    4
P21452    4
Q01959    4
P26678    4
Q99720    4
P34998    4
P0DMS8    4
Q13093    4
Name: Target Name, dtype: int64

In [126]:
import openpyxl

xls = pd.ExcelFile('uniprot_diseases_biomarkers.xlsx') 
sheets_names = ['Inflammation', 
                'Autoinflammation', 
                'Chronic Inflammation', 
                'Rheumatoid Arthritis', 
                'IBD', 
                'Cardiovascular', 
                'Influenza', 
                'COVID-19']
dfs = []
for sheet_name in sheets_names: 
    df[i] = pd.read_excel(xls, sheet_name)

NameError: name 'i' is not defined

In [128]:
import pandas as pd
xlsx = pd.read_excel('uniprot_diseases_biomarkers.xlsx', sheet_name=None, header=None)
for sheet in xlsx.keys(): xlsx[sheet].to_excel(sheet+'.xlsx', header=False, index=False)

In [141]:
df_inflammation = xlsx['Inflammation']

In [142]:
df_inflammation.columns = df_inflammation.iloc[0]
df_inflammation = df_inflammation[1:]

In [143]:
df_inflammation

Unnamed: 0,Entry,Reviewed,Entry Name,Protein names,Gene Names,Interacts with,Gene Ontology (biological process),Involvement in disease
1,A1A4Y4,reviewed,IRGM_HUMAN,"Immunity-related GTPase family M protein, EC 3...",IRGM IFI1 IRGM1 LRG47,Q96CV9,autophagosome assembly [GO:0000045]; CAMKK-AMP...,DISEASE: Inflammatory bowel disease 19 (IBD19)...
2,O00187,reviewed,MASP2_HUMAN,"Mannan-binding lectin serine protease 2, EC 3....",MASP2,P11226; P05155; PRO_0000042699 [P0C0L5]; P4874...,cell surface pattern recognition receptor sign...,DISEASE: MASP2 deficiency (MASPD) [MIM:613791]...
3,O00329,reviewed,PK3CD_HUMAN,"Phosphatidylinositol 4,5-bisphosphate 3-kinase...",PIK3CD,P01112; P27986; P27986-2; Q92569; P01112,adaptive immune response [GO:0002250]; B cell ...,"DISEASE: Immunodeficiency 14A, autosomal domin..."
4,O15143,reviewed,ARC1B_HUMAN,Actin-related protein 2/3 complex subunit 1B (...,ARPC1B ARC41,Q5ZTM4,Arp2/3 complex-mediated actin nucleation [GO:0...,DISEASE: Immunodeficiency 71 with inflammatory...
5,O15484,reviewed,CAN5_HUMAN,"Calpain-5, EC 3.4.22.- (Calpain htra-3) (New c...",CAPN5 NCL3,,proteolysis [GO:0006508]; signal transduction ...,"DISEASE: Vitreoretinopathy, neovascular inflam..."
...,...,...,...,...,...,...,...,...
126,Q9Y244,reviewed,POMP_HUMAN,Proteasome maturation protein (Proteassemblin)...,POMP C13orf12 UMP1 HSPC014 HSPC036 PNAS-110,Q15777; P20618; P49720; P28074; Q99436; P28065,proteasome assembly [GO:0043248],DISEASE: Keratosis linearis with ichthyosis co...
127,Q9Y2G2,reviewed,CARD8_HUMAN,Caspase recruitment domain-containing protein ...,CARD8 DACAR KIAA0955 NDPP1,,activation of cysteine-type endopeptidase acti...,DISEASE: Inflammatory bowel disease 30 (IBD30)...
128,Q9Y2R2,reviewed,PTN22_HUMAN,Tyrosine-protein phosphatase non-receptor type...,PTPN22 PTPN8,P20963; P00533; P62993; P06239; O43586; O75563...,autophagy [GO:0006914]; cellular response to m...,DISEASE: Systemic lupus erythematosus (SLE) [M...
129,Q9Y5S8,reviewed,NOX1_HUMAN,"NADPH oxidase 1, NOX-1, EC 1.-.-.- (Mitogenic ...",NOX1 MOX1 NOH1,Q0VAB0,angiogenesis [GO:0001525]; cell migration [GO:...,DISEASE: Note=Defects in NOX1 may play a role ...
