In [1]:
import pandas as pd


# Read the JSON export into a DataFrame and preview its first five rows.
df = pd.read_json(path_or_buf="final_json.json")
df.head(n=5)

Unnamed: 0,molecule_name,protein_target_name,binding_metric,value,unit,is_logarithmic,patent_number,protein_uniprot_id,protein_seq_id
0,PCSK9,,Kd,17.0,Kd,0.0,EP-2296694-A2,,
1,PCSK9,,Kd,65.0,Kd,0.0,EP-2296694-A2,,
2,15u,,,,,,WO-2020237014-A1,,
3,15w,,,,,,WO-2020237014-A1,,
4,thienopyridines,NFKB,IC50,,,,WO-2020237014-A1,,


### Filter out rows with missing required fields

In [2]:
required_columns = [
    "molecule_name",
    "protein_target_name",
    "binding_metric",
    "value",
    "unit",
    "patent_number"
]


def _is_blank_text(value):
    """Return True for strings that are empty or contain only whitespace."""
    return isinstance(value, str) and not value.strip()


# A required field counts as missing when it is null (NaN/None) OR a blank
# string. `dropna` alone only catches the former, so blank text values would
# otherwise pass through as if they were valid.
required_values = df[required_columns]
is_blank = required_values.apply(lambda col: col.map(_is_blank_text))
is_missing = required_values.isna() | is_blank

# Keep only rows where every required column is populated; copy so that
# df_clean is an independent frame, as it was when produced by dropna.
df_clean = df.loc[~is_missing.any(axis=1)].copy()

print(f"Filtered rows: {len(df) - len(df_clean)} removed, {len(df_clean)} remain.")


Filtered rows: 11546 removed, 3619 remain.


### Find strange units

In [3]:
def normalize_unit(unit):
    """Return a canonical lowercase form of a unit label for comparison.

    The value is converted with ``str`` (so a missing value becomes text such
    as ``'nan'`` or ``'none'``), look-alike characters are mapped to ASCII,
    the result is lowercased and all ASCII spaces are removed.

    Parameters
    ----------
    unit : object
        Raw unit label; any object accepted by ``str``.

    Returns
    -------
    str
        Normalized label, e.g. ``'um'`` for every spelling of micromolar that
        uses a Greek mu or the micro sign.
    """
    # The mapping must run BEFORE lowercasing: Greek capital Mu (U+039C) is a
    # homoglyph of Latin 'M', but str.lower() turns it into Greek small mu,
    # which would then be rewritten to 'u' (micro + Greek-Mu "molar" -> 'uu').
    lookalikes = str.maketrans({
        '\u039c': 'M',  # Greek capital Mu, visually identical to Latin 'M'
        '\u03bc': 'u',  # Greek small mu
        '\u00b5': 'u',  # micro sign
    })
    return str(unit).translate(lookalikes).lower().replace(' ', '')

# Store the normalized unit beside the raw one so the two can be compared
df['unit_norm'] = df['unit'].apply(normalize_unit)

# Raw, non-null unit labels whose normalized form is anything other than 'nm'
is_not_nm = df['unit_norm'].ne('nm')
not_nm_units = df.loc[is_not_nm, 'unit'].dropna().unique()

print("Unique unit values (not nM):")
for u in not_nm_units:
    print(f"- {u}")

Unique unit values (not nM):
- Kd
- mg/kg
- %
- μM
- uM
- M
- pM
- mg/animal
- M⁻¹min⁻¹
- μg/ml
- µM
- mol/L
- μm
- μΜ
- μM/nM
- M-1
- mM
- fold
- mg/kg p.o.
- times
- nM.h
- micromolar
- sec−1
- M−1 sec−1
- pg/mL
- KD
- μg/mouse/day
- μg
- pM-nM
- cpm/pmol
- Kd app. MW
- μl/ml
- millimolar
- μCi/μg protein
- M^-1
- pmol/g
- ng/mL
- μ mol/L
- °
- μg/kg/day
- ng/ml
- kD
- μmol/L
- none
- nmol/L
- nmol/500,000 cells/24 h
- μg/kg
- % ID/cm^3
- min
- L
- micromolar or sub-micromolar
- s-1
- Pa
- s
- kPa
- mmol/L
- ug/ml
- s−1
- nM/pM
- M−1s−1
- pg/head
- cells/mL
- cells/50 pL/mouse
- hours
- kcal/mol
- picomolar
- μg/mL
- °C
- μg PL/mL
- μg /PL/mL
- /second
- M^-1/second
- mL
- g/mL
- mCi/μmol
- μg / ml
- Molar concentration
- s^-1
- x
- ug/mL
- mg/ml
- μg protein/mL
- pmol/mg/min
- Ci/mmol
- μg mL^-1
- microL
- days
- cell/ml
- μMol
- TIU/mg protein
- 1/Ms
- 1/s
- 1/RUs
- ml/min/kg
- mg/dL of FPG per h
- h−1
- pg/μl
- nM, μM
- mg/mL
- μg per gram of gel
- μg per tooth
- m
- L/mole
- mg/k

### Find unexpected `binding_metric` values

In [4]:
def normalize_metric(metric):
    """Return the metric label trimmed, lowercased and with spaces removed.

    The input is first converted with ``str``, so a missing value such as
    ``None`` becomes the literal text ``'none'``.
    """
    lowered = str(metric).strip().lower()
    # Splitting on the single space character and re-joining drops every
    # ASCII space while leaving other characters (tabs, underscores) alone.
    return ''.join(lowered.split(' '))

# Normalized spellings of the metrics that are considered expected
allowed_metrics = {'ki', 'kd', 'ic50', 'ec50'}

# Store the normalized metric beside the raw one for comparison
df['binding_metric_norm'] = df['binding_metric'].apply(normalize_metric)

# Raw, non-null metric labels whose normalized form is not in the allowed set
is_strange_metric = ~df['binding_metric_norm'].isin(allowed_metrics)
strange_metrics = df.loc[is_strange_metric, 'binding_metric'].dropna().unique()

print("Strange binding_metric values (not Ki, Kd, IC50, EC50):")
for metric in strange_metrics:
    print(f"- {metric}")


Strange binding_metric values (not Ki, Kd, IC50, EC50):
- POC
- Kds
- K_i
- K_i, EC50
- ED50
- LD50
- kₑ
- MIC
- pK
- IC
- MIC50
- IC-50
- MICso
- ICeo
- K_D
- association constant
- K
- MED
- EC50s
- Koff
- Kon
- penetration
- potency
- growth reduction
- measurable response
- specific activity
- pKw
- binding constant
- molecular weight
- minimum inhibitory concentration
- concentration
- inhibition
- pIC50
- incorporation
- IC30
- pKa
- Ki(app)
- affinity
- apparent association constant (aKa)
- EC50 ratio
- Equilibrium dissociation constant
- half-time
- steady distribution volume
- koff
- EC50 or IC50
- pEC50
- G′max
- Relaxation time
- Swelling ratio
- G′ max
- relaxation time
- G″max
- CC50
- dissociation constant
- Kj
- killing EC50
- ki*
- K<sub>i </sub>
- TI
- pKi
- ND50
- LLOQ
- Cmax
- AUC
- T1/2
- Exposure (Cmax, AUC)
- Ctrough
- k(off)
- avidity
- Kₜ
- binding constants
- binding affinity
- % inhibition
- k(on)
- K₇D
- SI
- equilibrium dissociation constant
- IC90
- IC99
- 