In [None]:
# ============================================================
# columns.py
# ============================================================
import glob, pandas as pd
from pathlib import Path

# Directory holding the cleaned per-receptor CSV files; the trimmed copies are
# written into a "trimmed" subdirectory of it.
CSV_DIR = "../Cleaned_data"  # pathname
OUT_DIR = Path(CSV_DIR) / "trimmed"
OUT_DIR.mkdir(exist_ok=True)

# Columns carried over into the trimmed files. A file lacking any of them is skipped.
KEEP_COLS = ["Molecule ChEMBL ID", "Smiles",
             "Standard Type", "Standard Relation", "Standard Units",
             "Receptor", "Standard Value"]

# sorted(): glob returns matches in arbitrary order; sorting keeps the
# processing order (and the log below) reproducible between runs.
for fp in sorted(glob.glob(f"{CSV_DIR}/cleaned_5HT*.csv")):
    # glob.glob yields plain strings, so the file name has to be taken via Path;
    # the original `fp.name` raised AttributeError on the skip branch.
    fname = Path(fp).name
    df = pd.read_csv(fp)
    if not set(KEEP_COLS).issubset(df.columns):
        print(f"Skip {fname}: missing columns")
        continue
    # Keep only the selected columns and drop rows without a SMILES string
    # or without a measured value, then write under the same file name.
    (df[KEEP_COLS]
        .dropna(subset=["Smiles", "Standard Value"])
        .to_csv(OUT_DIR / fname, index=False))
    print(f"  {fname} → trimmed.")

  cleaned_5HT5A.csv → trimmed.
  cleaned_5HT7.csv → trimmed.
  cleaned_5HT6.csv → trimmed.
  cleaned_5HT2A.csv → trimmed.
  cleaned_5HT2B.csv → trimmed.
  cleaned_5HT1A.csv → trimmed.


In [None]:
# ============================================================
# concat_long.py
# Combine all trimmed cleaned_5HT*.csv into a single Parquet file
# ============================================================

import glob, pandas as pd
from pathlib import Path

# Data directory (relative to the notebook) and the subdirectory holding the
# trimmed per-receptor CSV files.
SCRIPT_DIR = Path("..")/"Cleaned_data"
TRIM_DIR   = SCRIPT_DIR / "trimmed"
TRIM_DIR.mkdir(exist_ok=True)

# Destination of the combined long-format table.
out_long = SCRIPT_DIR / "ki_long.parquet"

# sorted(): glob order is arbitrary, and drop_duplicates below keeps the first
# occurrence, so a fixed file order makes the saved table reproducible.
trimmed_files = sorted(glob.glob(f"{TRIM_DIR}/cleaned_5HT*.csv"))
if not trimmed_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError(
        f"No cleaned_5HT*.csv files found in {TRIM_DIR}")

dfs = [pd.read_csv(fp) for fp in trimmed_files]
# Stack all files and keep one row per (Smiles, Receptor) pair
# (the first one encountered).
long_df = (pd.concat(dfs, ignore_index=True)
             .drop_duplicates(subset=["Smiles", "Receptor"]))

long_df.to_parquet(out_long, index=False)
print(f"  saved {len(long_df):,} rows to {out_long}")
 

  saved 15,975 rows to /Users/anastazja/magisterka 2024:2025/sem 1/phyton/Project/5HT5A/ML_LAB/5HT/ki_long.parquet


In [None]:
# ============================================================
# common_smiles.py 
# Extract SMILES present across all receptor datasets from a concatenated Parquet file
# ============================================================
import pandas as pd
from pathlib import Path

# Data directory (relative to the notebook) and the combined long-format table.
SCRIPT_DIR = Path("..")/"Cleaned_data"
LONG_PATH  = SCRIPT_DIR / "ki_long.parquet"

df = pd.read_parquet(LONG_PATH)

# dropna(): Series.unique() reports NaN as a value, whereas the nunique() used
# below ignores it. Without dropping NaN here, a single missing Receptor entry
# would inflate len(receptors) and no SMILES could ever match every receptor.
receptors = df["Receptor"].dropna().unique()
print("Receptors in dataset:", receptors)

# Count how many distinct receptors each SMILES appears in
counts = (df.groupby("Smiles")["Receptor"]
            .nunique()
            .reset_index(name="n_receptors"))

# A SMILES is "common" when it occurs for every receptor present in the table.
common = counts[counts["n_receptors"] == len(receptors)]["Smiles"]
print(f"SMILES in all receptors: {len(common)}")

# Written next to the Parquet file as a one-column CSV (header "Smiles").
out = Path(LONG_PATH).parent / "ligands_for_inference.csv"
common.to_csv(out, index=False)
print(f"Saved common SMILES list to  {out}")

Receptors in dataset: ['5HT5A' '5HT7' '5HT6' '5HT2A' '5HT2B' '5HT1A']
SMILES in all receptors: 49
Saved common SMILES list to  /Users/anastazja/magisterka 2024:2025/sem 1/phyton/Project/5HT5A/ML_LAB/5HT/ligands_for_inference.csv
