In [1]:
from collections import Counter
import pandas as pd
import numpy as np
import tqdm
import pickle
import matplotlib.pyplot as plt
from Bio.PDB import *
from rdkit import Chem
import os
import time
import shutil
from Bio.SeqUtils import seq1
import gzip
import tarfile
import sys
import random

In [2]:
def good_ligand(inchi, min_carbons=7):
    """Return True when the molecule described by `inchi` has enough carbon atoms.

    Parameters
    ----------
    inchi : str
        InChI string to parse with RDKit (`Chem.MolFromInchi`).
    min_carbons : int, optional
        Minimum number of carbon atoms (atomic number 6) required.
        Defaults to 7.

    Returns
    -------
    bool
        True if the parsed molecule contains at least `min_carbons` carbon
        atoms. False if it contains fewer, or if the InChI cannot be parsed
        (the parser raised or returned None).
    """
    try:
        mol = Chem.MolFromInchi(inchi)
    except Exception:
        # Unparsable input (including non-string values such as NaN) is simply
        # "not a good ligand". `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still stop a long-running loop.
        return False

    # The parser signals failure by returning None instead of a molecule.
    if mol is None:
        return False

    n_carbons = 0
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() == 6:
            n_carbons += 1
            # Stop scanning as soon as the threshold is reached.
            if n_carbons >= min_carbons:
                return True
    return n_carbons >= min_carbons
    

# Three-letter residue codes of the 20 natural amino acids
nat_aa = {
    'ALA', 'ARG', 'ASN', 'ASP', 'CYS',
    'GLN', 'GLU', 'GLY', 'HIS', 'ILE',
    'LEU', 'LYS', 'MET', 'PHE', 'PRO',
    'SER', 'THR', 'TRP', 'TYR', 'VAL',
}

# Residue names from the Maestro blacklist CSV. The column header in that file
# carries a leading space (' Residue'); each value is whitespace-stripped.
_maestro_blacklist_table = pd.read_csv("/aloy/home/acomajuncosa/PocketVec_v2/kinase/PDB/LIG/preprocess/maestro_blacklist.csv")
maestro_blacklist = {residue.strip() for residue in _maestro_blacklist_table[' Residue']}
# uninteresting_ligands = pickle.load(open("/aloy/home/acomajuncosa/Apps_PocketVec/HT/PDB/uninteresting_ligands.pkl", "rb")) ## Not using it, for the moment

In [34]:
### GET INTERESTING LIGANDS

# Tab-separated component table without a header row; only the first two
# columns (InChI string, component ID) are loaded.
# dtype=str + keep_default_na=False keep every field exactly as written in the
# file: the component ID "NA" is no longer parsed as NaN (which previously had
# to be patched back by comparing against the string 'nan'), and all-digit IDs
# can never be inferred as integers and lose leading zeros.
components_inchi = pd.read_csv(
    "../data/PDB/mapping/Components-inchi.ich.txt",
    sep="\t",
    header=None,
    names=['inchi', 'PDB', 'name'],
    usecols=[0, 1],
    dtype=str,
    keep_default_na=False,
)

# Component ID -> InChI. If an ID appears more than once, the last row wins.
pdbcode_to_inchi = dict(zip(components_inchi['PDB'], components_inchi['inchi']))
# A row with a blank component ID cannot be referenced by any PDB entry.
pdbcode_to_inchi.pop("", None)

interesting_ligands = set()

for lig in tqdm.tqdm(sorted(pdbcode_to_inchi)):
    inchi = pdbcode_to_inchi[lig]
    # Discard IDs containing "-", blacklisted residues and natural amino acids
    # before running the (comparatively expensive) InChI parsing.
    if "-" in lig or lig in maestro_blacklist or lig in nat_aa:
        continue
    if good_ligand(inchi):
        interesting_ligands.add(lig)

  0%|          | 183/37360 [00:00<00:20, 1823.05it/s]RDKit ERROR: [20:19:35] ERROR: Explicit valence for atom # 19 C, 5, is greater than permitted
RDKit ERROR: [20:25:43] Explicit valence for atom # 27 Be, 4, is greater than permitted
  3%|▎         | 1039/37360 [00:00<00:22, 1650.78it/s]RDKit ERROR: [20:25:43] ERROR: Explicit valence for atom # 27 Be, 4, is greater than permitted
RDKit ERROR: [20:25:43] Explicit valence for atom # 15 C, 6, is greater than permitted
RDKit ERROR: [20:25:43] ERROR: Explicit valence for atom # 15 C, 6, is greater than permitted
RDKit ERROR: [20:25:43] Explicit valence for atom # 15 C, 6, is greater than permitted
  4%|▎         | 1392/37360 [00:00<00:21, 1682.13it/s]RDKit ERROR: [20:25:43] ERROR: Explicit valence for atom # 15 C, 6, is greater than permitted
RDKit ERROR: [20:25:44] Explicit valence for atom # 16 N, 5, is greater than permitted
  5%|▍         | 1727/37360 [00:01<00:21, 1647.66it/s]RDKit ERROR: [20:25:44] ERROR: Explicit valence for atom # 

In [35]:
# Number of component IDs collected in `interesting_ligands`
len(interesting_ligands)

33133

In [93]:
### FIND INTERESTING PDBS

# PDB ID -> list of its ligands that belong to `interesting_ligands`.
# Entries without any interesting ligand are not stored.
pdbs = {}

with open("../data/PDB/mapping/lig_pairs.lst.txt", "r") as f:
    for line in f:
        # Expected layout per line: "<pdb id> : <lig>; <lig>; ...".
        # Lines without a ":" separator (e.g. blank lines) hold no record.
        pdb, sep, lig_field = line.partition(":")
        if not sep:
            continue
        pdb = pdb.strip()
        # Drop empty tokens instead of unconditionally discarding the last
        # one, so the final ligand is kept whether or not the line ends
        # with a trailing ";".
        ligs = [token.strip() for token in lig_field.split(";")]
        ligs = [code for code in ligs if code and code in interesting_ligands]
        if ligs:
            pdbs[pdb] = ligs

In [95]:
# Number of PDB entries that have at least one interesting ligand
len(pdbs)

72576

In [83]:
# Debug inspection: `ligs` is whatever the last iteration of the PDB-scanning
# loop left behind (relies on leaked loop state, not on a defined result).
ligs

[]

In [84]:
# Spot check: interesting ligands recorded for PDB entry 7t46
pdbs['7t46']

['F8C']

In [85]:
# Spot check: is component F5L part of the interesting-ligand set?
"F5L" in interesting_ligands

True