In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import rankdata

# =========================
# Project paths
# =========================
# The project root is taken to be the parent of the current working
# directory, so this cell expects the kernel to be started from a folder
# located directly below that root.
ROOT = Path().resolve().parent

# Data folders, nested under the root.
DATA_DIR = ROOT.joinpath("2.1_data")
RAW_DIR = DATA_DIR.joinpath("2.1.1_raw")
PROC_DIR = DATA_DIR.joinpath("2.1.2_processed")

# Input file.
RAW_CSV = RAW_DIR.joinpath("series4_raw.csv")

# Output files: the full processed table and the two modelling tables.
OUT_MAIN = PROC_DIR.joinpath("01_series4_processed.csv")
OUT_CLS = PROC_DIR.joinpath("02_series4_processed_classification.csv")
OUT_REG = PROC_DIR.joinpath("03_series4_processed_regression.csv")

# Create the output folder (and any missing parents) before writing to it;
# an already existing folder is not an error.
PROC_DIR.mkdir(parents=True, exist_ok=True)

print("Reading raw data from:", RAW_CSV)

# =========================
# 1) Load the raw table and keep usable rows
# =========================
df = pd.read_csv(RAW_CSV)

# Columns taken over from the Series 4 master list.
kept_columns = [
    "OSM Number",
    "SMILES",
    "PfaI EC50 uMol (Mean)",
    "PfaI EC50 uMol (Mean) Qualifier",
]

df_proc = (
    df[kept_columns]
    # A row is only usable if it has both a structure and a reported value.
    .dropna(subset=["SMILES", "PfaI EC50 uMol (Mean)"])
    # Entries that cannot be parsed as numbers are turned into NaN ...
    .assign(
        activity=lambda frame: pd.to_numeric(
            frame["PfaI EC50 uMol (Mean)"], errors="coerce"
        )
    )
    # ... and those rows are discarded as well.
    .dropna(subset=["activity"])
)

# =========================
# 2) Normalisation (norm_activity in (0, 1], 1 = most potent)
# =========================
# EC50 values in µM, parsed from the "PfaI EC50 uMol (Mean)" column.
y = df_proc["activity"].to_numpy()

# Rank-based scaling. Negating y gives the lowest EC50 (most potent
# compound) the highest rank, so it maps to exactly 1.0, while the highest
# EC50 maps to 1 / len(y); the value 0 is never produced.
# method="ordinal" assigns every row a distinct rank: rows with identical
# EC50 values are ranked in order of appearance and therefore receive
# slightly different norm_activity values.
norm_activity = rankdata(-y, method="ordinal") / len(y)

# =========================
# 3) Binary label (bin_activity)
# =========================
# Potency cut-off in µM: EC50 <= cut-off -> active (1), otherwise inactive (0).
# Kept as a named constant so the cut-off is visible and changed in one place.
ACTIVE_THRESHOLD_UM = 2.5

# NOTE(review): the qualifier column is not consulted here, so rows carrying
# a qualifier are thresholded on their numeric value alone — confirm that
# this is intended.
bin_activity = (y <= ACTIVE_THRESHOLD_UM).astype(int)

# =========================
# 4) Assemble the processed table and export all three files
# =========================
def _save_and_preview(frame, destination, message):
    """Write ``frame`` to ``destination`` as CSV (without the index), then
    print ``message``, the first rows of the frame, and its shape."""
    frame.to_csv(destination, index=False)
    print(message)
    print(frame.head())
    print("Shape:", frame.shape)


# Column order of the main output file is fixed by the order of this dict.
processed_columns = {
    "osm_number": df_proc["OSM Number"].values,
    "smiles": df_proc["SMILES"].values,
    "activity": y,                   # parsed numeric activity
    "norm_activity": norm_activity,  # rank-scaled activity
    "bin_activity": bin_activity,    # binary label
    "ec50_qualifier": df_proc["PfaI EC50 uMol (Mean) Qualifier"].values,
}
s4_df = pd.DataFrame(processed_columns)

# Main processed file.
_save_and_preview(s4_df, OUT_MAIN, f"\n✅ Saved cleaned dataset to: {OUT_MAIN}")

# =========================
# 5) Classification table: structure + binary label
# =========================
cls_df = s4_df.loc[:, ["smiles", "bin_activity"]].copy()
_save_and_preview(cls_df, OUT_CLS, f"\n✅ Saved classification dataset to: {OUT_CLS}")

# Class balance of the binary label.
print("\nLabel counts:")
label_counts = cls_df["bin_activity"].value_counts()
print(label_counts)

# =========================
# 6) Regression table: structure + normalised activity
# =========================
reg_df = s4_df.loc[:, ["smiles", "norm_activity"]].copy()
_save_and_preview(reg_df, OUT_REG, f"\n✅ Saved regression dataset to: {OUT_REG}")


Reading raw data from: /Users/ellen/Desktop/DSML/AISD Coursework 2/AISD_Coursework_2_Drug_discovery/2_BaselineReproduction_malaria/2.1_data/2.1.1_raw/series4_raw.csv

✅ Saved cleaned dataset to: /Users/ellen/Desktop/DSML/AISD Coursework 2/AISD_Coursework_2_Drug_discovery/2_BaselineReproduction_malaria/2.1_data/2.1.2_processed/01_series4_processed.csv
  osm_number                                             smiles  activity  \
0    OSM-A-1  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(S(=O)(N...    3.7145   
1    OSM-A-2  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(C#N)C=C...    1.2015   
2    OSM-A-3  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(I)C=C3)...    0.1190   
3   OSM-E-15  FC(C(F)=C1)=CC=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N...    3.5000   
4   OSM-E-16  ClC(C=CC=C1)=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N)C...    5.2000   

   norm_activity  bin_activity ec50_qualifier  
0       0.296178             0            NaN  
1       0.452229             1            NaN  
2       0.796178             1          

***Two-Column Data for Classification Modelling (Smiles/Bin_Activity)***

In [4]:
from pathlib import Path
import pandas as pd

# Project root: the parent of the current working directory.
ROOT = Path().resolve().parent

# Input and output both live in the processed-data folder.
processed_dir = ROOT / "2.1_data" / "2.1.2_processed"
proc_csv = processed_dir / "01_series4_processed.csv"
out_cls = processed_dir / "02_series4_processed_classification.csv"

print("Reading:", proc_csv)

# Load the full processed table.
df = pd.read_csv(proc_csv)

# Keep only the structure and the binary label, then write them out
# without the index column.
cls_df = df.loc[:, ["smiles", "bin_activity"]].copy()
cls_df.to_csv(out_cls, index=False)

print(f"Saved classification dataset to: {out_cls}")
print(cls_df.head())
print("Shape:", cls_df.shape)



Reading: /Users/ellen/Desktop/DSML/AISD Coursework 2/AISD_Coursework_2_Drug_discovery/2_BaselineReproduction_malaria/2.1_data/2.1.2_processed/01_series4_processed.csv
Saved classification dataset to: /Users/ellen/Desktop/DSML/AISD Coursework 2/AISD_Coursework_2_Drug_discovery/2_BaselineReproduction_malaria/2.1_data/2.1.2_processed/02_series4_processed_classification.csv
                                              smiles  bin_activity
0  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(S(=O)(N...             0
1  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(C#N)C=C...             1
2  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(I)C=C3)...             1
3  FC(C(F)=C1)=CC=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N...             0
4  ClC(C=CC=C1)=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N)C...             0
Shape: (314, 2)


***Two-Column Data for Regression Modelling (Smiles/Norm_Activity)***


In [5]:
from pathlib import Path

import pandas as pd

# Resolve the processed-data folder the same way the other cells do
# (parent of the working directory / 2.1_data / 2.1.2_processed), instead of
# a path relative to the working directory with different file names.
# This way the cell reads the main processed file written by the pipeline
# and writes the regression file next to it.
ROOT = Path().resolve().parent
processed_dir = ROOT / "2.1_data" / "2.1.2_processed"

proc_path = processed_dir / "01_series4_processed.csv"
df = pd.read_csv(proc_path)

# For regression: keep only the structure and the normalised activity.
reg_df = df[["smiles", "norm_activity"]].copy()

out_reg = processed_dir / "03_series4_processed_regression.csv"
reg_df.to_csv(out_reg, index=False)

print(f"Saved regression dataset to: {out_reg}")
print(reg_df.head())
print("Shape:", reg_df.shape)

Saved regression dataset to: 2.1.2 processed/series4_processed_regression.csv
                                              smiles  norm_activity
0  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(S(=O)(N...       0.296178
1  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(C#N)C=C...       0.452229
2  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(I)C=C3)...       0.796178
3  FC(C(F)=C1)=CC=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N...       0.305732
4  ClC(C=CC=C1)=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N)C...       0.242038
Shape: (314, 2)


In [2]:
from pathlib import Path
import pandas as pd

# Project root: the parent of the current working directory
# (presumably the notebook folder sits directly below it — verify).
ROOT = Path().resolve().parent

# Location of the two-column classification table.
cls_path = ROOT.joinpath(
    "2.1_data", "2.1.2_processed", "02_series4_processed_classification.csv"
)

print("Loading file from:", cls_path)

# Load the table and show its first rows.
df = pd.read_csv(cls_path)
print(df.head())

# Class balance of the binary label.
print("\nLabel counts:")
label_counts = df["bin_activity"].value_counts()
print(label_counts)


Loading file from: /Users/ellen/Desktop/DSML/AISD Coursework 2/AISD_Coursework_2_Drug_discovery/2_BaselineReproduction_malaria/2.1_data/2.1.2_processed/02_series4_processed_classification.csv
                                              smiles  bin_activity
0  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(S(=O)(N...             0
1  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(C#N)C=C...             1
2  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(I)C=C3)...             1
3  FC(C(F)=C1)=CC=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N...             0
4  ClC(C=CC=C1)=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N)C...             0

Label counts:
bin_activity
1    207
0    107
Name: count, dtype: int64
