In [1]:
# Mount Google Drive into the Colab filesystem so the project files under
# /content/drive/MyDrive can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install duckdb rdkit --quiet

In [3]:
import duckdb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pyarrow as pa
import pyarrow.parquet as pq

from tqdm import tqdm
from pathlib import Path
import itertools
from collections import defaultdict

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from collections import Counter


# Project directory on the mounted Google Drive. Every path below is derived
# from it so the location is spelled out in exactly one place.
SAVEDIR = Path("/content/drive/MyDrive/DS5500/")

# Kept as plain strings because they are interpolated into DuckDB SQL text.
train_path = str(SAVEDIR / "train.parquet")
test_path = str(SAVEDIR / "test.parquet")

# Handle on the training parquet file.
train_pq = pq.ParquetFile(train_path)

# Make sure the output directory exists before later cells write into it
# (a no-op when it is already there).
SAVEDIR.mkdir(exist_ok=True)

# NOTE(review): not read by any of the cells visible here; confirm whether a
# later cell uses it before removing.
DRAFT_MODE = False

# DuckDB connection used to query the parquet files (no database file is
# given, so DuckDB uses an in-memory database).
con = duckdb.connect()

In [4]:
# Scan the whole training parquet file through DuckDB and materialise the
# result as a pandas DataFrame (long format: one row per molecule/protein pair).
train_scan_sql = f"""(SELECT * FROM parquet_scan('{train_path}'))"""
train_df = con.query(train_scan_sql).df()

# Preview the first rows to check the columns that were loaded.
train_df.head()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,0,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,BRD4,0
1,1,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,HSA,0
2,2,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,sEH,0
3,3,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,BRD4,0
4,4,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,HSA,0


In [5]:
# Target proteins; each becomes one label column in the wide table.
protein_names = ["BRD4", "HSA", "sEH"]

# SMILES string columns: the three building blocks and the full molecule.
smiles_col_names = [
    "buildingblock1_smiles",
    "buildingblock2_smiles",
    "buildingblock3_smiles",
    "molecule_smiles",
]

# Arrow schema for the wide layout: the four SMILES columns as strings,
# followed by one int8 column per protein.
smiles_fields = [(name, pa.string()) for name in smiles_col_names]
target_fields = [(name, pa.int8()) for name in protein_names]
schema_mixed_target = pa.schema(smiles_fields + target_fields)

In [6]:
# Display the schema to confirm the column names and their Arrow types.
schema_mixed_target

buildingblock1_smiles: string
buildingblock2_smiles: string
buildingblock3_smiles: string
molecule_smiles: string
BRD4: int8
HSA: int8
sEH: int8

In [15]:
# Print the first six long-format rows (molecule, protein, label) to show that
# each molecule appears once per protein.
# This cell deliberately leaves `train_df_wide` alone: resetting it to an empty
# DataFrame here would wipe the pivoted table whenever the cell is re-run after
# the pivot. `head(6)` is positional, so it does not depend on the index labels.
for row in train_df.head(6).itertuples(index=False):
    print(row.molecule_smiles, row.protein_name, row.binds)


100%|██████████| 6/6 [00:00<00:00, 1690.12it/s]

C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H](CC#C)CC(=O)N[Dy])n2)cc1 BRD4 0
C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H](CC#C)CC(=O)N[Dy])n2)cc1 HSA 0
C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H](CC#C)CC(=O)N[Dy])n2)cc1 sEH 0
C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC#C)CC(=O)N[Dy])n2)cc1 BRD4 0
C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC#C)CC(=O)N[Dy])n2)cc1 HSA 0
C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC#C)CC(=O)N[Dy])n2)cc1 sEH 0





In [10]:
# Small slice of the long-format frame for quick inspection. `.loc` slicing is
# label-based and includes the end label, so labels 0 through 12 are selected.
subset_train_df = train_df.loc[0:12, :]
subset_train_df.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds
0,0,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,BRD4,0
1,1,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,HSA,0
2,2,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,sEH,0
3,3,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,BRD4,0
4,4,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,HSA,0


In [13]:
# Reshape from long format (one row per molecule/protein pair) to wide format
# (one row per molecule, one label column per protein).
#
# aggfunc="max" is deliberate: pivot_table defaults to the mean, so a molecule
# listed more than once for the same protein with conflicting labels would get
# a fractional value that the integer cast below truncates to 0. Taking the max
# keeps the positive label instead. With one row per pair the result is the
# same as with the default.
train_df_wide = train_df.pivot_table(
    index=[
        "buildingblock1_smiles",
        "buildingblock2_smiles",
        "buildingblock3_smiles",
        "molecule_smiles",
    ],
    columns="protein_name",
    values="binds",
    aggfunc="max",
    fill_value=0,
).reset_index()

# Molecule/protein pairs missing from the data were filled with 0 above; cast
# the label columns to integers. The membership check tolerates a protein that
# never appears in the data.
train_df_wide = train_df_wide.astype(
    {col: "int32" for col in train_df_wide.columns if col in ["BRD4", "HSA", "sEH"]}
)

In [14]:
# First rows of the wide table: one row per molecule with BRD4/HSA/sEH columns.
train_df_wide.head()

protein_name,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,BRD4,HSA,sEH
0,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,0,0,0
1,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,0,0,0
2,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,C#CCOc1ccc(CN)cc1.Cl,C#CCOc1ccc(CNc2nc(NCc3ccc(OCC#C)cc3)nc(N[C@@H]...,0,0,0
3,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,C=C(C)C(=O)NCCN.Cl,C#CCOc1ccc(CNc2nc(NCCNC(=O)C(=C)C)nc(N[C@@H](C...,0,0,0
4,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,C=CCNC(=O)CN.Cl,C#CCOc1ccc(CNc2nc(NCC(=O)NCC=C)nc(N[C@@H](CC#C...,0,0,0


In [15]:
# Last rows of the wide table.
train_df_wide.tail()

protein_name,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,BRD4,HSA,sEH
98415605,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,Nc1ncnc2c1ncn2C1CCCCO1,[N-]=[N+]=NCCC[C@H](Nc1nc(Nc2noc3ccc(F)cc23)nc...,0,0,0
98415606,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,Nc1nnc(S)s1,[N-]=[N+]=NCCC[C@H](Nc1nc(Nc2nnc(S)s2)nc(Nc2no...,0,0,0
98415607,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,Nc1nncs1,[N-]=[N+]=NCCC[C@H](Nc1nc(Nc2nncs2)nc(Nc2noc3c...,0,0,0
98415608,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,Nc1nnn[nH]1,[N-]=[N+]=NCCC[C@H](Nc1nc(Nc2nnn[nH]2)nc(Nc2no...,0,0,0
98415609,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,Nc1noc2ccc(F)cc12,[N-]=[N+]=NCCC[C@H](Nc1nc(Nc2noc3ccc(F)cc23)nc...,0,0,0


In [17]:
import zipfile
import os

# Locations on Drive for the wide table: the plain CSV and its ZIP archive.
csv_file_path = "/content/drive/MyDrive/DS5500/train_df_wide.csv"
zip_file_path = "/content/drive/MyDrive/DS5500/train_df_wide.zip"

# Write the CSV only when it is not there yet; an existing file is reused as-is.
csv_missing = not os.path.exists(csv_file_path)
if csv_missing:
    print(f"{csv_file_path} does not exist. Creating the file...")
    # index=False: the row index is not written to the CSV.
    train_df_wide.to_csv(csv_file_path, index=False)
    print(f"File {csv_file_path} created successfully.")

# (Re)build the archive on every run, storing the CSV under its bare file name.
with zipfile.ZipFile(
    zip_file_path, mode="w", compression=zipfile.ZIP_DEFLATED
) as archive:
    archive.write(csv_file_path, arcname="train_df_wide.csv")

print(f"CSV file has been compressed and saved as {zip_file_path}")

/content/drive/MyDrive/DS5500/train_df_wide.csv does not exist. Creating the file...
File /content/drive/MyDrive/DS5500/train_df_wide.csv created successfully.
CSV file has been compressed and saved as /content/drive/MyDrive/DS5500/train_df_wide.zip


In [28]:
# Molecules with no positive label: BRD4, HSA and sEH are all 0.
all_zeros_train_df = train_df_wide.loc[
    train_df_wide["BRD4"].eq(0)
    & train_df_wide["HSA"].eq(0)
    & train_df_wide["sEH"].eq(0)
]


In [29]:
# Molecules with at least one positive label. Despite the "all_ones" name the
# conditions are OR-ed, so a row qualifies when any of BRD4, HSA or sEH is 1.
all_ones_train_df = train_df_wide.loc[
    train_df_wide["BRD4"].eq(1)
    | train_df_wide["HSA"].eq(1)
    | train_df_wide["sEH"].eq(1)
]

In [31]:
# Twice the number of rows in all_ones_train_df; this is the sample size used
# when drawing rows from all_zeros_train_df.
2 * all_ones_train_df.shape[0]

3019558

In [33]:
# Downsample the all-zero molecules to twice the number of molecules that have
# at least one positive label; the fixed seed makes the draw repeatable.
all_zeros_subset_df = all_zeros_train_df.sample(
    n=len(all_ones_train_df) * 2,
    random_state=42,
)

In [34]:
# Display the sampled all-zero molecules.
all_zeros_subset_df

protein_name,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,BRD4,HSA,sEH
14320392,CSc1ncc(NC(=O)OCC2c3ccccc3-c3ccccc32)c(C(=O)O)n1,Cl.Cl.NCCC(=O)Nc1ccncc1,Cl.NCc1ccsc1C(F)(F)F,CSc1ncc(Nc2nc(NCCC(=O)Nc3ccncc3)nc(NCc3ccsc3C(...,0,0,0
76633618,O=C(Nc1ncc(Br)nc1C(=O)O)OCC1c2ccccc2-c2ccccc21,CC(=O)c1ccc(N)c(F)c1,NC/C=C/Br,CC(=O)c1ccc(Nc2nc(NC/C=C/Br)nc(Nc3ncc(Br)nc3C(...,0,0,0
10438210,COc1cc(C(=O)O)c(NC(=O)OCC2c3ccccc3-c3ccccc32)cn1,NCc1cccc(C(F)(F)F)c1,Nc1cc(C2CC2)[nH]n1,COc1cc(C(=O)N[Dy])c(Nc2nc(NCc3cccc(C(F)(F)F)c3...,0,0,0
11867361,COc1ccc(C[C@H](NC(=O)OCC2c3ccccc3-c3ccccc32)C(...,NCC1(O)CC1,N#Cc1nc[nH]c1N,COc1ccc(C[C@H](Nc2nc(NCC3(O)CC3)nc(Nc3[nH]cnc3...,0,0,0
83782661,O=C(O)C[C@@H](Cc1ccc([N+](=O)[O-])cc1)NC(=O)OC...,NCCCCN1CCCC1,CC1CC(CN)C(C)O1,CC1CC(CNc2nc(NCCCCN3CCCC3)nc(N[C@@H](CC(=O)N[D...,0,0,0
...,...,...,...,...,...,...,...
32291291,O=C(N[C@@H](Cc1ccc(Cl)cc1Cl)C(=O)O)OCC1c2ccccc...,Nc1cccnc1[N+](=O)[O-],Nc1nc2ccc(Cl)cc2s1,O=C(N[Dy])[C@H](Cc1ccc(Cl)cc1Cl)Nc1nc(Nc2nc3cc...,0,0,0
44933697,O=C(Nc1c(Br)cc(F)cc1C(=O)O)OCC1c2ccccc2-c2ccccc21,NCc1cc(-c2ccccc2)[nH]n1,COC(=O)c1occc1N,COC(=O)c1occc1Nc1nc(NCc2cc(-c3ccccc3)[nH]n2)nc...,0,0,0
22163036,N#Cc1ccc(C[C@H](NC(=O)OCC2c3ccccc3-c3ccccc32)C...,CC(C)(C)OC(=O)N1CCN(c2ccccc2N)CC1,CC(CN)c1c(Cl)cccc1Cl,CC(CNc1nc(Nc2ccccc2N2CCN(C(=O)OC(C)(C)C)CC2)nc...,0,0,0
47975283,O=C(Nc1c(Cl)c(Cl)nc(C(=O)O)c1Cl)OCC1c2ccccc2-c...,CCON(C)C(=O)CN.Cl,CC(O)CCN,CCON(C)C(=O)CNc1nc(NCCC(C)O)nc(Nc2c(Cl)c(Cl)nc...,0,0,0


In [35]:
# Display the molecules that have a 1 for at least one of BRD4, HSA or sEH.
all_ones_train_df

protein_name,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,BRD4,HSA,sEH
155,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Cc1cc2cc(CN)ccc2[nH]1,C#CCOc1ccc(CNc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(N[C...,0,1,1
227,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Cl.NCC12CC3CC(CC(C3)C1)C2,C#CCOc1ccc(CNc2nc(NCC34CC5CC(CC(C5)C3)C4)nc(N[...,0,0,1
440,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Nc1n[nH]c2ncccc12,C#CCOc1ccc(CNc2nc(Nc3n[nH]c4ncccc34)nc(N[C@@H]...,0,1,0
713,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1cccc(CN)c1.Cl,Cc1cc2cc(CN)ccc2[nH]1,C#CCOc1cccc(CNc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(N[...,0,0,1
1246,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#Cc1ccc(N)cc1,CCOC(=O)c1nonc1N,C#CC[C@@H](CC(=O)N[Dy])Nc1nc(Nc2ccc(C#C)cc2)nc...,0,1,0
...,...,...,...,...,...,...,...
98414388,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1nnc(S)s1,NCc1cnc(-c2ccccc2)s1,[N-]=[N+]=NCCC[C@H](Nc1nc(NCc2cnc(-c3ccccc3)s2...,0,1,0
98414617,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1nncs1,Cc1cc(N)n[nH]1,Cc1cc(Nc2nc(Nc3nncs3)nc(N[C@@H](CCCN=[N+]=[N-]...,0,1,0
98414696,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1nncs1,Cl.Cn1cc(N)ccc1=O,Cn1cc(Nc2nc(Nc3nncs3)nc(N[C@@H](CCCN=[N+]=[N-]...,1,0,0
98414813,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1nncs1,NCCc1nccc(C2CC2)n1,[N-]=[N+]=NCCC[C@H](Nc1nc(NCCc2nccc(C3CC3)n2)n...,0,1,0


In [37]:
# Draw 75% of each group for training. Both draws use the same fixed seed and
# are made independently, zeros first and then ones.
train_zeros, train_ones = (
    frame.sample(frac=0.75, random_state=42)
    for frame in (all_zeros_subset_df, all_ones_train_df)
)

In [38]:
# Validation rows are whatever was not drawn for training: remove the training
# index labels from each group and keep the remainder in its existing order.
val_zeros = all_zeros_subset_df.drop(index=train_zeros.index)
val_ones = all_ones_train_df.drop(index=train_ones.index)

In [39]:
# Stack the sampled all-zero rows on top of the rows with a positive label to
# form the training and validation frames (rows are not shuffled here).
# NOTE(review): this rebinds `train_df`, which until now held the long-format
# rows loaded from the parquet file. Earlier cells that read the long-format
# columns (for example the pivot) will not work if re-run after this cell
# without reloading the data first.
train_df = pd.concat([train_zeros, train_ones])
val_df = pd.concat([val_zeros, val_ones])

In [47]:
# Validation molecules that are labelled 1 for all three proteins.
val_df.loc[
    val_df["BRD4"].eq(1)
    & val_df["HSA"].eq(1)
    & val_df["sEH"].eq(1)
]

protein_name,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,BRD4,HSA,sEH
6065263,CC(C)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,NCCC1CSC1,Cl.Cl.NCc1ccc(-n2cncn2)cc1,CC(C)CC(Nc1nc(NCCC2CSC2)nc(NCc2ccc(-n3cncn3)cc...,1,1,1
6428656,CC(OC(C)(C)C)C(NC(=O)OCC1c2ccccc2-c2ccccc21)C(...,NCCC1CSC1,Nc1ccc(F)cc1F,CC(OC(C)(C)C)C(Nc1nc(NCCC2CSC2)nc(Nc2ccc(F)cc2...,1,1,1
24057835,O=C(NC(CC1CCCCC1)C(=O)O)OCC1c2ccccc2-c2ccccc21,COc1ccc(N)cc1OC,Cc1cc2cc(CN)ccc2[nH]1,COc1ccc(Nc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(NC(CC3C...,1,1,1
24781104,O=C(NC(Cc1ccccc1)C(=O)O)OCC1c2ccccc2-c2ccccc21,COc1ccc(CN)c(C)c1OC,Cc1cc2cc(CN)ccc2[nH]1,COc1ccc(CNc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(NC(Cc3...,1,1,1
24786958,O=C(NC(Cc1ccccc1)C(=O)O)OCC1c2ccccc2-c2ccccc21,COc1ccc(OC)c(N)c1,Cc1cc2cc(CN)ccc2[nH]1,COc1ccc(OC)c(Nc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(NC...,1,1,1
27157224,O=C(NC1(C(=O)O)CCc2ccccc21)OCC1c2ccccc2-c2ccccc21,Nc1cc(Cl)c(O)c(Cl)c1,Cc1cc2cc(CN)ccc2[nH]1,Cc1cc2cc(CNc3nc(Nc4cc(Cl)c(O)c(Cl)c4)nc(NC4(C(...,1,1,1
62589725,O=C(Nc1ccc(C(=O)O)cc1)OCC1c2ccccc2-c2ccccc21,Cc1cc(N)nnc1Cl,NCC1CN2CCN1CC2,Cc1cc(Nc2nc(NCC3CN4CCN3CC4)nc(Nc3ccc(C(=O)N[Dy...,1,1,1
62667782,O=C(Nc1ccc(C(=O)O)cc1)OCC1c2ccccc2-c2ccccc21,Cl.NCCNC(=O)c1ccccc1F,Cc1cc2cc(CN)ccc2[nH]1,Cc1cc2cc(CNc3nc(NCCNC(=O)c4ccccc4F)nc(Nc4ccc(C...,1,1,1
71306089,O=C(Nc1cccc(I)c1C(=O)O)OCC1c2ccccc2-c2ccccc21,Cc1cc(N)nnc1Cl,NCC1CN2CCN1CC2,Cc1cc(Nc2nc(NCC3CN4CCN3CC4)nc(Nc3cccc(I)c3C(=O...,1,1,1


In [43]:
# Preview the first rows of the validation split.
val_df.head()

protein_name,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,BRD4,HSA,sEH
83782661,O=C(O)C[C@@H](Cc1ccc([N+](=O)[O-])cc1)NC(=O)OC...,NCCCCN1CCCC1,CC1CC(CN)C(C)O1,CC1CC(CNc2nc(NCCCCN3CCCC3)nc(N[C@@H](CC(=O)N[D...,0,0,0
15474540,Cc1c(Br)ccc(C(=O)O)c1NC(=O)OCC1c2ccccc2-c2ccccc21,Cl.NCc1nc2ccccc2o1,Cl.NCc1nc(-c2ccco2)n[nH]1,Cc1c(Br)ccc(C(=O)N[Dy])c1Nc1nc(NCc2nc(-c3ccco3...,0,0,0
46606236,O=C(Nc1c(C(=O)O)cccc1C(=O)O)OCC1c2ccccc2-c2ccc...,Cc1cc(N)n(-c2ccccc2)n1,COc1ccc([N+](=O)[O-])c(N)n1,COc1ccc([N+](=O)[O-])c(Nc2nc(Nc3c(C(=O)O)cccc3...,0,0,0
68170437,O=C(Nc1ccc([N+](=O)[O-])c(C(=O)O)c1)OCC1c2cccc...,NCCN1CCC2(C1)OCCO2,Cc1n[nH]c(N)c1C,Cc1n[nH]c(Nc2nc(NCCN3CCC4(C3)OCCO4)nc(Nc3ccc([...,0,0,0
17865158,Cc1ccc(C(=O)O)c(NC(=O)OCC2c3ccccc3-c3ccccc32)c1C,COC(CN)CC(N)=O.Cl,Cl.Nc1ccc(O)cc1Cl,COC(CNc1nc(Nc2ccc(O)cc2Cl)nc(Nc2c(C(=O)N[Dy])c...,0,0,0


In [48]:
# Destination CSVs on Drive for the sampled training and validation splits.
train_csv_file_path = "/content/drive/MyDrive/DS5500/train_df_subset.csv"
val_csv_file_path = "/content/drive/MyDrive/DS5500/val_df_subset.csv"

# Write each split once. An existing file is left untouched, and that is
# reported so a stale export does not go unnoticed; delete the file to
# regenerate it after changing the split.
for split_path, split_df in (
    (train_csv_file_path, train_df),
    (val_csv_file_path, val_df),
):
    if os.path.exists(split_path):
        print(f"{split_path} already exists. Skipping write.")
        continue
    print(f"{split_path} does not exist. Creating the file...")
    # index=False: the row index is not written to the CSV.
    split_df.to_csv(split_path, index=False)
    print(f"File {split_path} created successfully.")

/content/drive/MyDrive/DS5500/train_df_subset.csv does not exist. Creating the file...
File /content/drive/MyDrive/DS5500/train_df_subset.csv created successfully.
/content/drive/MyDrive/DS5500/val_df_subset.csv does not exist. Creating the file...
File /content/drive/MyDrive/DS5500/val_df_subset.csv created successfully.
