In [1]:
import pandas as pd

In [None]:
# Loads what looks like a previous export (file name dated 12_12_2019) into `df`.
# NOTE(review): this cell has no execution count and `df` is not referenced by
# any later cell visible here — confirm whether it is still needed.
df = pd.read_json("json_files/pdb_bind_12_12_2019.zip")

In [13]:
def parse_pdb_bind(file_name):
    """Parse a PDBbind-style index text file into a DataFrame.

    Lines starting with ``#`` are treated as comments and blank lines are
    ignored. Every remaining line is split on double spaces into at most five
    fields. When fewer than five fields result (presumably because the value
    and the ``//`` reference marker are separated by a single space), the last
    field is split once more on ``//``. The reference file name is then dropped
    from the fifth field by keeping only the text that follows the first
    ``".pdf "``.

    Parameters
    ----------
    file_name : str or path-like
        Path of the index file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record with the string columns ``pdb_code``,
        ``Resolution``, ``pub_year``, ``Value`` and ``lig_name``.

    Notes
    -----
    Records whose fifth field is missing or contains no ``".pdf "`` marker are
    printed for inspection and kept in the result as they are.
    """
    columns = ["pdb_code", "Resolution", "pub_year", "Value", "lig_name"]

    # Read everything up front so the file handle is released before parsing.
    with open(file_name) as pdbbind:
        raw_lines = pdbbind.readlines()

    records = []
    for raw_line in raw_lines:
        if raw_line.startswith("#"):
            continue
        line = raw_line.strip()
        if not line:
            # A blank line carries no record; without this guard it would end
            # up as a row holding an empty pdb_code and nothing else.
            continue

        fields = line.split("  ", 4)
        if len(fields) < 5:
            # Recover the reference/ligand part glued to the previous field.
            pieces = fields[-1].split("//")
            if len(pieces) == 2:
                fields[-1] = pieces[0].strip()
                fields.append(pieces[1].strip())

        try:
            # maxsplit=1 keeps the whole description even when the description
            # itself contains ".pdf ".
            fields[4] = fields[4].strip().split(".pdf ", 1)[1]
        except IndexError:
            # Surface records that do not match the expected layout.
            print(fields)

        records.append(fields)

    return pd.DataFrame(records, columns=columns)

In [14]:
# Parse the four index files in a fixed order:
# nucleic acid-ligand, protein-ligand, protein-nucleic acid, protein-protein.
df_NL, df_PL, df_PN, df_PP = map(
    parse_pdb_bind,
    (
        "source_files/INDEX_general_NL.2019",
        "source_files/INDEX_general_PL.2019",
        "source_files/INDEX_general_PN.2019",
        "source_files/INDEX_general_PP.2019",
    ),
)

In [15]:
# Display the parsed protein-protein table to eyeball the five parsed columns.
df_PP

Unnamed: 0,pdb_code,Resolution,pub_year,Value,lig_name
0,1fc2,2.80,1981,Kd=22.5nM,"(224-mer) Human Fc fragment, Kd=22.5+/-4.6nM, ..."
1,3sgb,1.80,1983,Kd=17.9pM,"(56-mer) TURKEY OVOMUCOID INHIBITOR (OMTKY3), ..."
2,2tgp,1.90,1983,Kd=2.4uM,"(58-mer) TRYPSIN INHIBITOR, 2.4 x 10-6M"
3,2ptc,1.90,1983,Kd=60fM,(58-mer) Kd=6x10-14M is for trypsin and the pa...
4,2sni,2.10,1988,Kd=2pM,"(83-mer) CHYMOTRYPSIN INHIBITOR 2, Kd=2 x 10-12M"
5,1atn,2.80,1992,Kd=0.45nM,(260-mer) bovine pancreatic deoxyribonuclease ...
6,1gla,2.60,1993,Kd=0.18uM,"(168-mer) Glucose specific phosphocarrier, 1.8..."
7,1acb,2.00,1993,Kd=0.2nM,(70-mer) leech (Hirudo medicinalis) protein pr...
8,2pcc,2.30,1993,Kd=1.6uM,"(108-mer) yeast iso-1-cytochrome c, Kd=1.6uM, ..."
9,2pcb,2.80,1993,Kd=10uM,"(104-mer) cytochrome c, Ka=10^5M-1, Kd=10-5M"


In [16]:
# Splits the raw ``Value`` string (e.g. "Kd=22.5nM") into four capture groups:
# a leading word, one or two non-word characters, a decimal number and a
# trailing word. ``str.extract`` returns them as columns named 0-3, which are
# joined onto each frame.
# Raw string: ``\w``, ``\W`` and ``\d`` are regex escapes, not Python string
# escapes; a plain literal triggers an invalid-escape-sequence warning on
# recent Python versions.
BINDING_VALUE_PATTERN = r"(\w*)(\W{1,2})(\d+[.]?\d*)(\w*)"

df_NL = df_NL.join(df_NL.Value.str.extract(BINDING_VALUE_PATTERN))
df_PL = df_PL.join(df_PL.Value.str.extract(BINDING_VALUE_PATTERN))
df_PN = df_PN.join(df_PN.Value.str.extract(BINDING_VALUE_PATTERN))
df_PP = df_PP.join(df_PP.Value.str.extract(BINDING_VALUE_PATTERN))

In [17]:
# Descriptive labels for the integer-named columns 0-3, listed in positional
# order so that each label's index is the column it replaces.
col_rename = dict(
    enumerate(
        [
            "binding_type",
            "binding_operator",
            "binding_value",
            "binding_units",
        ]
    )
)

In [18]:
# Columns carried into the final tables, in output order.
keep = [
    "binding_operator",
    "binding_type",
    "binding_units",
    "binding_value",
    "lig_name",
    "pdb_code",
    "pub_year",
    "type",
    "version",
]

# For each table: label the extracted columns, tag the rows with the complex
# category and the dataset version, then keep only the columns listed above.
df_NL = df_NL.rename(columns=col_rename).assign(
    type="Nucleic acid - Ligand", version="PDBBind2019"
)[keep]
df_PL = df_PL.rename(columns=col_rename).assign(
    type="Protein - Ligand", version="PDBBind2019"
)[keep]
df_PN = df_PN.rename(columns=col_rename).assign(
    type="Protein - Nucleic acid", version="PDBBind2019"
)[keep]
df_PP = df_PP.rename(columns=col_rename).assign(
    type="Protein - Protein", version="PDBBind2019"
)[keep]

In [19]:
# Stack the four per-category tables into one frame; ignore_index=True discards
# the per-table row labels and renumbers the rows from 0.
df_all = pd.concat([df_NL, df_PL, df_PN, df_PP], ignore_index=True)

In [20]:
# Normalise PDB identifiers to upper case.
df_all["pdb_code"] = df_all["pdb_code"].str.upper()

In [21]:
# Write the combined table to a JSON file using pandas' default orientation.
df_all.to_json("json_files/pdb_bind_21_01_2020.json")

In [22]:
import sqlite3

# NOTE(review): absolute, machine-specific database path.
con = sqlite3.connect(
    "/data/sdecesco/databases/druggability/DB_dumps/TargetDB_20_12_19.db"
)
# if_exists="append" adds rows to an existing `pdb_bind` table, so running this
# cell more than once inserts the same records again. The DataFrame index is
# not written (index=False).
# NOTE(review): `con` is not closed in this cell — confirm it is closed later.
df_all.to_sql("pdb_bind", con=con, if_exists="append", index=False)