In [54]:
import pandas as pd


df = pd.read_json("final_json.json")
df.head()

Unnamed: 0,molecule_name,protein_target_name,binding_metric,value,unit,is_logarithmic,patent_number
0,PLX3397,CSF1 receptor,KD,5.31 ±0.51,nM,False,WO-2019204604-A1
1,Exemplified compounds,,Ki,<1,µM,False,EP-2089397-B1
2,Example 37,MCHR1,Ki,4,nM,False,EP-2089397-B1
3,Exemplified compounds,,Kb,<1,µM,False,EP-2089397-B1
4,Example 28,MCHR1,Kb,6,nM,False,EP-2089397-B1


### Filter

In [55]:
required_columns = [
    "molecule_name",
    "protein_target_name",
    "binding_metric",
    "value",
    "unit",
    "patent_number"
]

# Remove rows with any null in these columns
df_clean = df.dropna(subset=required_columns)

print(f"Filtered rows: {len(df) - len(df_clean)} removed, {len(df_clean)} remain.")


Filtered rows: 11515 removed, 3357 remain.


### Find strange units

In [56]:
def normalize_unit(unit):
    # Lowercase, remove spaces, replace Greek μ with 'u'
    return str(unit).lower().replace(' ', '').replace('μ', 'u').replace('µ', 'u')

# Make a normalized unit column for comparison
df['unit_norm'] = df['unit'].apply(normalize_unit)

# Get unique units that are not 'nm'
not_nm_units = df.loc[df['unit_norm'] != 'nm', 'unit'].dropna().unique()

print("Unique unit values (not nM):")
for u in not_nm_units:
    print(f"- {u}")

Unique unit values (not nM):
- µM
- M
- mM
- pM
- nM, μM
- μM
- fM
- Mu.M
- %
- ng/mL
- µmol/L
- nmol/L
- microM
- μmol/kg
- mg/kg
- fold
- pM/nM
- ng/ml
- pM to nM
- %ID/g
- μg
- 1/Ms
- 1/s
- % DM
- 1/x DM
- mg/ml
- pg / ml
- M^-1
- uM
- μg/kg
- ng
- μm
- picomolar
- cpm/100 μL
- mol/L
- μL
- µg/mL
- mg/kg/day
- pg/mL
- μg/ml
- μg/kd/days
- μg/mL
- mmoles
- sec−1
- M−1sec−1
- µg/ml
- M−1S−1
- °C
- ug/mL
- h
- UM
- m
- μ Ci/ μ g
- micromole
- μΜ
- cm/sec
- min
- kD
- dl/g
- µg/g
- μmol/L
- ppm
- mg/mL
- kDa
- micron
- relative fluorescence units
- μg / min
- ng/well
- OmM
- micromolar
- pmol/L
- nanomolar
- s−1
- M−1s−1
- micromol
- units
- % (v/v)
- null
- mol/l
- ug/kg
- ug/kg/hr
- hours
- ml/kg
- nmol/500,000 cells/24 h
- mV
- mg/Kg
- µg
- kcal/mol
- μM/nM
- M−1
- Liter/mol
- units/ml
- mu.g/ml
- kcal
- M^1
- s^1
- sec
- Seconds
- moles/liter
- M^−1 s^−1
- cP
- KD
- mg/l
- Mu.g/mL
- times
- hr
- U/mg
- LD50 U/mg
- mg
- mL
- molar
- μmol CB/min, μmol enzyme
- ug/ml
- M per second
- n

### Find strange binding_metrics

In [57]:
def normalize_metric(metric):
    # Lowercase, remove spaces
    return str(metric).strip().lower().replace(' ', '')

allowed_metrics = {'ki', 'kd', 'ic50', 'ec50'}

# Add a normalized column for comparison
df['binding_metric_norm'] = df['binding_metric'].apply(normalize_metric)

# Find unique "strange" metrics
strange_metrics = (
    df.loc[~df['binding_metric_norm'].isin(allowed_metrics), 'binding_metric']
    .dropna()
    .unique()
)

print("Strange binding_metric values (not Ki, Kd, IC50, EC50):")
for metric in strange_metrics:
    print(f"- {metric}")


Strange binding_metric values (not Ki, Kd, IC50, EC50):
- Kb
- affinity
- dissociation constant
- internalisation rate constant
- ED50
- K_i
- MED
- i
- K_D
- k_a
- k_d
- 1/X
- 1/Y
- Km
- Kᵢ
- LD50/ED50
- pIC50
- IC₅₀
- IC-50
- Ka
- koff
- kon
- metric
- pKi
- pEC50
- K<sub>i</sub>
- Kₑ
- IC50S
- 2XIC50
- on-rate
- KₜD
- half-life
- K3
- pAC6
- pAC7
- pAC12
- pAC13
- Kₛ
- specific activity
- LD50
- cLogP
- K<sub>i </sub>
- inhibition rate
- DC50
- nM1/2
- nM10
- GI50
- LC50
- binding
- Emax/EC50
- K
- Koff
- ID50
- kdis
- plC50
- null
- CALP affinity
- binding affinity
- % inhibition
- potency
- zeta potential
- zeta potential increase
- activity retention
- PLS(H/M)T1/2
- ICso
- IC
- EC
- Kj
- ΔG°
- kofl
- APTT Clotting Time
- Kd/Ka
- ka
- fpKi
- dissociation constants
- EC50 (CB1 ) / EC50 (CB2)
- Kdapp
- extinction coefficient
- enzyme activity
- pK<sub>a </sub>
- p-gp
- K_B
- viscosity
- Bloom strength
- Bloom value
- Imax
- New
- 20pM
- Alphascreen
- K;
- RBA0HT
- RBAOHT
- MIC
- K_

### Value

In [65]:
import re
import json

# Функция очистки одного значения
def clean_to_string(x):
    """
    Преобразует значение к строке и убирает символы < > ~.
    Для dict / list / set сначала сериализуем в JSON‑строку.
    """
    if isinstance(x, (dict, list, set)):
        s = json.dumps(x, ensure_ascii=False)
    else:
        s = str(x)
    # удаляем символы <, >, ~
    return re.sub(r'[<>~]', '', s)

# 1. Очищаем колонку и получаем строковое представление
cleaned = df['value'].apply(clean_to_string)

# 2. Пробуем привести к числу
numeric = pd.to_numeric(cleaned, errors='coerce')

# 3. Маска «не удалось преобразовать»
mask_non_numeric = numeric.isna()

# 4. Уникальные «странные» значения
strange_values = cleaned[mask_non_numeric].dropna().unique()

print("Ненумерические значения в df['value'] (после удаления <, >, ~):")
for v in strange_values:
    print(" •", v)

Ненумерические значения в df['value'] (после удаления <, >, ~):
 • 5.31 ±0.51
 • ≤ 1
 • ≤ 100
 • ≤ 10
 • ≤ 0.1
 • ≤ 0.01
 • ≤ 0.001
 • 10^{-8}
 • None
 • 80 and 45
 • 13 ± 10
 • 10.27%
 • 38.09%
 • 41.99%
 • between 70 μM and 1 nM
 • 1.8, 20, 3000
 • between 100nM and 1μM
 • between 1μM and 6μM
 • 0.1 to 200pM
 • 0.1 to 100pM
 • 0.1 to 50pM
 • 1pM to 5nM
 • 1pM to 3nM
 • 1pM to 2nM
 • 0.01 to 5ng/ml
 • 0.01 to 3ng/ml
 • 0.01 to 1.5ng/ml
 • 0.01 to 10ng/ml
 • 0.01 to 2.5ng/ml
 • 5nM or less
 • 5pM to 5nM
 • 5pM to 2nM
 • 5 to 1000pM
 • 0.1 to 30ng/ml
 • 0.1 to 20ng/ml
 • 0.1 to 10ng/ml
 • 0.2 to 6ng/ml
 • = 1000
 • 1 to 5
 • 5 to 5
 • = 20
 • 5 to 20
 • 0.2 to 200
 • =5
 • =20
 • 0.1 to 30
 • 0.1 to 20
 • 0.1 to 10
 • 0.1 to 6
 • 1x10^6
 • 1x10^7
 • 1x10^8
 • 1x10^9
 • 1x10^10
 • 1x10^11
 • 1x10^-12
 • 4x10^-10
 • 99%
 • 97%
 • 10 picomolar to 1 micromolar
 • 10 to 500 micromolar
 • 1 to 10 micromolar
 • 0.5 to 1 micromolar
 • 10 to 500 nanomolar
 • 1 to 10 nanomolar
 • 50 picomolar to 