# Drug-drug similarity for ctd dataset
- https://www.nature.com/articles/nprot.2014.151


In [None]:
import csv
import subprocess
import re
import os
import pandas as pd
from itertools import chain
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
import pickle

In [None]:
# Tanimoto coefficient (TC) cutoff. Only pairs whose TC is at or above this
# value are kept, which is useful when only similar molecules are of interest.
# Set T_CUTOFF = 0 to keep the TC of every pair.
T_CUTOFF = 0
# Open Babel fingerprint type: 'FP2', 'FP3', 'FP4' or 'MACCS'.
FINGERPRINT = 'MACCS'
# Directory prefix for every data file read or written by this notebook.
# Paths are built as f'{MAINDIR}/<file>', so the empty string resolves them
# from the filesystem root (e.g. '/temp_SMILES.txt').
MAINDIR = ''

In [None]:
def _parse_babel_similarities(babel_output):
    """Extract ``(molecule_id, TC)`` pairs from Open Babel ``-ofpt`` text output.

    A line counts as a similarity record when splitting it on ``>`` or ``=``
    yields more than two pieces; the first whitespace-separated token of the
    second piece is taken as the molecule id and the third piece as the TC.
    """
    pairs = []
    for line in babel_output.splitlines():
        pieces = re.split('>|=', line)
        if len(pieces) <= 2:
            continue
        name_tokens = pieces[1].split()
        if not name_tokens:
            # No identifier on this line, so there is nothing to report.
            continue
        pairs.append((name_tokens[0], float(pieces[2])))
    return pairs


def sim_one_ChEMBL(FINGERPRINT, T_CUTOFF):
    """
    Calculate the similarity of one drug against the other drugs.

    The query drug is the last valid entry of ``{MAINDIR}/temp_SMILES.txt``
    (whitespace-separated ``SMILES id`` lines; lines that do not have exactly
    two fields are ignored). Open Babel compares it against every molecule in
    that file, and the resulting pairs are also written to
    ``{MAINDIR}/TC_results.csv``, which is overwritten on every call.

    Args:
        FINGERPRINT: Fingerprint ('FP2', 'FP3', 'FP4' or 'MACCS')
        T_CUTOFF: The established cutoff; pairs with a TC below it are dropped.

    Returns: A list of ``(chemical1, chemical2, TC)`` tuples, where
        ``chemical1`` is the query drug. The pair of the query drug with
        itself is excluded. The list is empty when the input file holds no
        valid entry.

    Raises:
        FileNotFoundError: If the input file or the ``obabel`` executable
            cannot be found.
    """
    smiles_path = f'{MAINDIR}/temp_SMILES.txt'
    results_path = f'{MAINDIR}/TC_results.csv'

    # Create a dictionary of chemicals to be compared (id -> SMILES):
    input_dict = {}
    with open(smiles_path, 'r') as input_temp:
        print('pass-1')
        for line in input_temp:
            fields = line.split()
            if len(fields) != 2:
                continue
            smiles, chem_id = fields
            input_dict[chem_id] = smiles

    TC_list_tmp = []

    # newline='' lets the csv module control line endings on every platform.
    with open(results_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['chemical1', 'chemical2', 'TC'])

        if not input_dict:
            # Nothing to compare: leave a header-only results file.
            return TC_list_tmp

        # Only the last chemical is the query. Launching Open Babel for every
        # entry and reading just the final process would leak the others.
        chemical1 = list(input_dict)[-1]

        # An argument list (no shell) keeps characters in the SMILES string
        # from being interpreted by a shell.
        babel_command = [
            'obabel',
            '-ismi',
            f'-:{input_dict[chemical1]}',
            smiles_path,
            '-ofpt',
            f'-xf{FINGERPRINT}',
        ]
        # run() drains both pipes, so a chatty stderr cannot block the child.
        completed = subprocess.run(
            babel_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        babel_output = completed.stdout.decode('ascii', errors='replace')

        # Write the TCs reaching the cutoff to the output file
        for chemical2, TC in _parse_babel_similarities(babel_output):
            if TC >= T_CUTOFF and chemical1 != chemical2:
                writer.writerow([chemical1, chemical2, TC])
                TC_list_tmp.append((chemical1, chemical2, TC))

    return TC_list_tmp

In [None]:
# Load the master table (tab-separated, no header row).
df_SMILES_main = pd.read_csv(
    f'{MAINDIR}/temp_SMILES_main.txt',
    header=None,
    sep='\t',
)

# Working copy of all SMILES; it is rewritten step by step while the
# drug-drug similarity (DDS) matrix is being built.
df_SMILES_main.to_csv(
    f'{MAINDIR}/temp_SMILES.txt',
    header=False,
    index=False,
    sep='\t',
)

# First column of the table, in reverse row order.
SMILES_id = list(reversed(df_SMILES_main[0].to_list()))
print(f"Number of drug: {len(SMILES_id)}\n")

df_SMILES_main.head()

In [None]:
i = 0  # Number of drugs processed so far; also part of the checkpoint file name.
TC_list_final = []  # All drug-drug similarity: one list of tuples per processed drug.

# One pass per drug except the last one: every pass handles the drug on the
# last line of temp_SMILES.txt, so the final remaining drug has already been
# paired with all the others.
for i, _ in enumerate(SMILES_id[:-1], start=1):
    TC_list_tmp = sim_one_ChEMBL(FINGERPRINT, T_CUTOFF)
    TC_list_final.append(TC_list_tmp)

    # After calculating the similarity of the current drug it is removed from
    # the working file (its similarity with all remaining drugs is recorded).
    df = pd.read_csv(f'{MAINDIR}/temp_SMILES.txt', skipfooter=1, engine='python', sep='\t', header=None)
    df.to_csv(f'{MAINDIR}/temp_SMILES.txt', sep='\t', index=False, header=False)
    print(i)

    # Because the data is big, it is collected gradually: a cumulative
    # checkpoint is written every 100 drugs and on each of the last passes.
    if i % 100 == 0 or i > len(SMILES_id) - 10:
        all_pairs = list(chain.from_iterable(TC_list_final))
        # With no pairs collected (e.g. a high cutoff) the frame would have no
        # columns and the column lookups below would raise KeyError.
        if all_pairs:
            df_TC_list_final = pd.DataFrame(all_pairs)
            df_TC_list_final[0] = df_TC_list_final[0].str.replace('CHEMBL', '', regex=False)
            df_TC_list_final[1] = df_TC_list_final[1].str.replace('CHEMBL', '', regex=False)
            df_TC_list_final.to_csv(f'{MAINDIR}/DDS_known_ChEMBLid{i}_T{len(SMILES_id)}.csv')

In [None]:
# Read the last checkpoint file, which holds all collected similarities.
# NOTE(review): the file name is hard-coded to 289 processed drugs out of 290
# (checkpoints are written as DDS_known_ChEMBLid{i}_T{len(SMILES_id)}.csv);
# adjust it when the input set changes.
DDS_tmp = pd.read_csv(f'{MAINDIR}/DDS_known_ChEMBLid289_T290.csv', index_col=0)
DDS_tmp.head()

In [None]:
# Rename the columns to edge-list names. The keys are the strings '0', '1'
# and '2' - presumably the default integer column labels of the checkpoint
# frame as read back from the CSV header (TODO confirm).
DDS = DDS_tmp.rename(columns={'0':'from','1':'to', '2':'weight'})
# Persist the renamed table as a pickle next to the other data files.
with open(f"{MAINDIR}/DDS.pkl", "wb") as fp:
    pickle.dump(DDS, fp)