***Data Curation (removed unwanted columns and rows)***

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.stats import rankdata

# ==== Name ====
INPUT_CSV = "2.1.1 raw/series4_raw.csv"
OUTPUT_DIR = "2.1.2 processed"
OUTPUT_CSV = os.path.join(OUTPUT_DIR, "series4_processed.csv")

os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==== Read file ====
df = pd.read_csv(INPUT_CSV)

# Keep relevant columns from the Series 4 master list
df_proc = df[
    [
        "OSM Number",
        "SMILES",
        "PfaI EC50 uMol (Mean)",
        "PfaI EC50 uMol (Mean) Qualifier",
    ]
].copy()

# remove rows without SMILES or activity
df_proc = df_proc.dropna(subset=["SMILES", "PfaI EC50 uMol (Mean)"])

df_proc["activity"] = pd.to_numeric(df_proc["PfaI EC50 uMol (Mean)"], errors="coerce")
df_proc = df_proc.dropna(subset=["activity"])

# ==== Normalisation (norm_activity, 0–1, 1 = strongest) ====
y = df_proc["activity"].values  # IC50 in µM
norm_activity = rankdata(-y, method="ordinal") / len(y)

# ==== Binary sorting (bin_activity) ====
# Same threshold style as your previous code: IC50 <= 2.5 µM = active (1)
bin_activity = (y <= 2.5).astype(int)

# ==== final DataFrame ====
s4_df = pd.DataFrame(
    {
        "osm_number": df_proc["OSM Number"].values,
        "smiles": df_proc["SMILES"].values,
        "activity": y,                               # IC50 in µM
        "norm_activity": norm_activity,              # [0,1], 1 = strongest
        "bin_activity": bin_activity,                # binary label
        "ec50_qualifier": df_proc["PfaI EC50 uMol (Mean) Qualifier"].values,
    }
)

# ==== Output result ====
s4_df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved cleaned dataset to: {OUTPUT_CSV}")
print(s4_df.head())
print("Shape:", s4_df.shape)


Saved cleaned dataset to: 2.1.2 processed/series4_processed.csv
  osm_number                                             smiles  activity  \
0    OSM-A-1  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(S(=O)(N...    3.7145   
1    OSM-A-2  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(C#N)C=C...    1.2015   
2    OSM-A-3  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(I)C=C3)...    0.1190   
3   OSM-E-15  FC(C(F)=C1)=CC=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N...    3.5000   
4   OSM-E-16  ClC(C=CC=C1)=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N)C...    5.2000   

   norm_activity  bin_activity ec50_qualifier  
0       0.296178             0            NaN  
1       0.452229             1            NaN  
2       0.796178             1            NaN  
3       0.305732             0            NaN  
4       0.242038             0            NaN  
Shape: (314, 6)


***Two-Column Data for Classification Modelling (Smiles/Bin_Activity)***

In [6]:
import pandas as pd

proc_path = "2.1.2 processed/series4_processed.csv"
df = pd.read_csv(proc_path)

# For classification: keep ID (optional), smiles, and binary label
cls_df = df[["smiles", "bin_activity"]].copy()

out_cls = "2.1.2 processed/series4_processed_classification.csv"
cls_df.to_csv(out_cls, index=False)

print(f"Saved classification dataset to: {out_cls}")
print(cls_df.head())
print("Shape:", cls_df.shape)


Saved classification dataset to: 2.1.2 processed/series4_processed_classification.csv
                                              smiles  bin_activity
0  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(S(=O)(N...             0
1  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(C#N)C=C...             1
2  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(I)C=C3)...             1
3  FC(C(F)=C1)=CC=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N...             0
4  ClC(C=CC=C1)=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N)C...             0
Shape: (314, 2)


***Two-Column Data for Regression Modelling (Smiles/Bin_Activity)***


In [5]:
import pandas as pd

proc_path = "2.1.2 processed/series4_processed.csv"
df = pd.read_csv(proc_path)

# For regression: keep ID (optional), smiles, and normalized activity
reg_df = df[["smiles", "norm_activity"]].copy()

out_reg = "2.1.2 processed/series4_processed_regression.csv"
reg_df.to_csv(out_reg, index=False)

print(f"Saved regression dataset to: {out_reg}")
print(reg_df.head())
print("Shape:", reg_df.shape)

Saved regression dataset to: 2.1.2 processed/series4_processed_regression.csv
                                              smiles  norm_activity
0  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(S(=O)(N...       0.296178
1  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(C#N)C=C...       0.452229
2  O=C(/C(S/1)=C/C2=C(C)N(C(C)=C2)C3=CC=C(I)C=C3)...       0.796178
3  FC(C(F)=C1)=CC=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N...       0.305732
4  ClC(C=CC=C1)=C1CCOC2=CC=CC3=NN=C(C4=CC=C(C#N)C...       0.242038
Shape: (314, 2)


In [9]:
import pandas as pd

df = pd.read_csv("2.1.2 processed/series4_processed_classificatin.csv")
print(df['bin_activity'].value_counts())

FileNotFoundError: [Errno 2] No such file or directory: '2.1.2 processed/series4_processed_classificatin.csv'