In [77]:
import pandas as pd


df = pd.read_json("final_json.json")
df.head()

Unnamed: 0,molecule_name,protein_target_name,binding_metric,value,unit,patent_number,target,off_target,note,time_constants,cell_line,values,binding_metrics,concentrations
0,PLX3397,CSF1 receptor,KD,5.31 ±0.51,nM,WO-2019204604-A1,,,,,,,,
1,0.5M Arg,,,0.2,,US-20170232112-A1,,,,,,,,
2,0.5M Arg,,,4.7,,US-20170232112-A1,,,,,,,,
3,"0.5M Arg, 6% hpβCD",,,3.0,,US-20170232112-A1,,,,,,,,
4,"6% hpβCD, 20 mM KPhos",,,0.6,,US-20170232112-A1,,,,,,,,


### Filter

In [78]:
required_columns = [
    "molecule_name",
    "protein_target_name",
    "binding_metric",
    "value",
    "unit",
    "patent_number"
]

# Remove rows with any null in these columns
df_clean = df.dropna(subset=required_columns)

print(f"All rows: {len(df)}")
print(f"Filtered rows: {len(df) - len(df_clean)} removed, {len(df_clean)} remain.")


All rows: 47637
Filtered rows: 37542 removed, 10095 remain.


### Find strange units

In [79]:
def normalize_unit(unit):
    # Lowercase, remove spaces, replace Greek μ with 'u'
    return str(unit).lower().replace(' ', '').replace('μ', 'u').replace('µ', 'u')

# Make a normalized unit column for comparison
df['unit_norm'] = df['unit'].apply(normalize_unit)

# Get unique units that are not 'nm'
not_nm_units = df.loc[df['unit_norm'] != 'nm', 'unit'].dropna().unique()

print("Unique unit values (not nM):")
for u in not_nm_units:
    print(f"- {u}")

Unique unit values (not nM):
- mM
- uM
- µM
- M
- pM
- %
- μM
- mU/ml
- µg/mL
- μΜ
- fM
- μg/ml
- mg ml-1
- reciprocal dilution
- micrograms per microlitre
- μg ml-1
- mg/ml
- Mu.M
- ng/mL
- minutes
- hours
- mcg/ml
- mg/kg
- μM and nM
- mu.g/mg
- mu.g/L
- microM
- μm
- mol/L
- μL
- mins
- ° C
- picomolar
- ng/ml
- Ci/mmole
- μL/min
- mmol/L
- RU
- L/mole
- nanomolar range
- Da
- μmol/kg
- nM to μM
- M^-1s^-1
- micromolar, nanomolar, picomolar
- micromolar
- nanomolar
- low nanomolar
- pM/nM
- chemiluminescent units
- %ID/g
- vs. 4.3
- %/h
- vs. 11.0
- vs. 10.3
- vs. 9.2
- vs. 4.27
- vs. 4.7
- M^-1
- μg/g
- um
- nmol/L
- min
- μmol · min/L
- % DM
- 1/x DM
- 1/X
- 1/Y
- 2.5-40nM
- mu.g/mL
- mg/mL
- kD
- μmol
- mg
- hr
- fold
- 1/M * s
- 1/Ms
- μ g/mL
- pMol/million cells/min
- 1g/kg
- 1.0g/kg
- 300U/kg
- mm
- M−1 s−1
- pg / ml
- nmol/l
- μg
- μl
- mCi
- µmol/L
- mg / l
- L/day
- L
- days
- μg/mL
- kb
- 30µM
- 10µM
- 1µM
- 0.1µM
- °C
- microliter
- μm of ol/kg
- μ
- Kcal/mole
- moles/lit

### Find strange binding_metrics

In [80]:
def normalize_metric(metric):
    # Lowercase, remove spaces
    return str(metric).strip().lower().replace(' ', '')

allowed_metrics = {'ki', 'kd', 'ic50', 'ec50'}

# Add a normalized column for comparison
df['binding_metric_norm'] = df['binding_metric'].apply(normalize_metric)

# Find unique "strange" metrics
strange_metrics = (
    df.loc[~df['binding_metric_norm'].isin(allowed_metrics), 'binding_metric']
    .dropna()
    .unique()
)

print("Strange binding_metric values (not Ki, Kd, IC50, EC50):")
for metric in strange_metrics:
    print(f"- {metric}")


Strange binding_metric values (not Ki, Kd, IC50, EC50):
- Kb
- affinity
- logIC50
- IC-50
- pEC50
- dissociation constant
- internalisation rate constant
- ED50
- pSTAT5
- pIC50
- IC₅₀
- IC_50
- K_i
- K_a
- pKi
- Emax
- inhibition
- concentration
- Km
- kcat/Km
- 2nd order rate constant
- Ka
- K_d
- Cmax
- Tmax
- AUC 0-90 min
- K_D
- IC50/Ki
- DPPH assay
- ORAC assay
- binding
- pKw
- K
- binding constant
- EC
- half life
- AUC
- Kd'
- C50
- Vmax
- Ki(app)
- binding affinity
- LD50/ED50
- pIC₅₀
- p
- Kd or Ka
- %
- MTD
- relative affinity
- lmax
- EC50 ratio
- EC80
- inhibition rate
- K_I
- on-rate
- specific activity
- apparent molecular weight
- TEPITOPE score
- koff
- kon
- Kp
- protein binding rate
- IC 90
- Kon
- K_off
- fpKi
- EC90
- IC90
- MIC
- MBC
- MBC/MIC
- K₄
- Kₑ
- 2XIC50
- K1
- DC50
- KDp
- half-life
- Vss
- CL
- T1/2
- AUClast
- ka
- k
- Survival %
- Kd1
- Kd2
- Kd, IC50
- Ks
- inhibition constant
- Bmax
- % control remaining
- zeta potential
- pKa
- endotherm
- water ac

### Value

In [81]:
import re
import json

# Функция очистки одного значения
def clean_to_string(x):
    """
    Преобразует значение к строке и убирает символы < > ~.
    Для dict / list / set сначала сериализуем в JSON‑строку.
    """
    if isinstance(x, (dict, list, set)):
        s = json.dumps(x, ensure_ascii=False)
    else:
        s = str(x)
    # удаляем символы <, >, ~
    return re.sub(r'[<>~]', '', s)

# 1. Очищаем колонку и получаем строковое представление
cleaned = df['value'].apply(clean_to_string)

# 2. Пробуем привести к числу
numeric = pd.to_numeric(cleaned, errors='coerce')

# 3. Маска «не удалось преобразовать»
mask_non_numeric = numeric.isna()

# 4. Уникальные «странные» значения
strange_values = cleaned[mask_non_numeric].dropna().unique()

print("Ненумерические значения в df['value'] (после удаления <, >, ~):")
for v in strange_values:
    print(" •", v)

Ненумерические значения в df['value'] (после удаления <, >, ~):
 • 5.31 ±0.51
 • None
 • ≤ 1
 • ≤ 100
 • ≤ 10
 • ≤ 0.1
 • ≤ 0.01
 • ≤ 0.001
 • 10^{-8}
 • less than about 1.0
 • less than about 500
 • less than about 100
 • less than about 50
 • less than about 25
 • less than about 10
 • less than about 5
 • less than about 1
 • less than about 900
 • less than about 800
 • less than about 700
 • less than about 600
 • less than about 400
 • less than about 350
 • less than about 300
 • less than about 250
 • less than about 200
 • less than about 150
 • less than about 95
 • less than about 90
 • less than about 85
 • less than about 80
 • less than about 75
 • less than about 70
 • less than about 65
 • less than about 60
 • less than about 55
 • less than about 45
 • less than about 40
 • less than about 35
 • less than about 30
 • less than about 20
 • less than about 15
 • less than about 12.5
 • less than about 6.25
 • less than about 4
 • less than about 3
 • ∼0.01
 • ∼0.7
 • 50

In [74]:
import pandas as pd

df1 = pd.read_csv("./csv/bindb_1.csv") # 24.07 Вика
df2 = pd.read_csv("./csv/ft2.csv") # Кристина
df3 = pd.read_csv("./csv/ft3.csv") # Айбулат
df4 = pd.read_csv("./csv/bindb_2.csv")  # Результаты 22.07
df5 = pd.read_csv("./csv/bindb_3.csv")  # Результаты 24.07
df6 = pd.read_csv("./csv/bindb_4.csv")  # Результаты 24.07
df7 = pd.read_csv("./csv/bindb_5.csv")  # Результаты 24.07


dfs = [df7]
df4["patent_number"] = None
for df in dfs:
    print(df.shape)
df = pd.concat(dfs, ignore_index=True)
print("+++Combined+++")
print(df.shape)

(40, 11)
+++Combined+++
(40, 11)


In [77]:

value_cols = ["Ki (nM)", "IC50 (nM)", "Kd (nM)", "EC50 (nM)"]
group_cols = ["Ligand InChI Key", "Sequence"]
extra_cols = ["protein_target_name", "patent_number", "molecule_name"]

# Convert all value columns to float; non-convertible values become NaN
df[value_cols] = df[value_cols].apply(pd.to_numeric, errors='coerce')

dfs = []
for metric in value_cols:
    # Filter for non-null and range-limited values
    filtered = df[
        df[metric].notnull()
    ].copy()
    # Columns to keep: group_cols + metric + any available in extra_cols
    cols_to_include = group_cols + [metric]
    # Find which extra columns exist in the current DataFrame
    present_cols = [col for col in extra_cols if col in df.columns]
    cols_to_include += present_cols
    # Group by group_cols, take min of metric, and also keep extra cols using 'first'
    agg = filtered.groupby(group_cols, as_index=False).agg(
        {metric: 'min', **{col: 'first' for col in present_cols}}
    )
    # For any missing extra_cols, add them as None
    for col in extra_cols:
        if col not in agg.columns:
            agg[col] = None
    # Mark the metric column
    agg['Metric'] = metric
    dfs.append(agg)

# Combine all results into one DataFrame
result = pd.concat(dfs, ignore_index=True)
result["Ligand SMILES"] = None
print(result.shape)
result.to_csv("./csv/result.csv", index=False)


(29, 11)
