In [9]:
!conda install -c conda-forge rdkit -y

Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [2]:
import os
print(os.getcwd())

C:\Users\emmad\OneDrive - University of Leeds\leeds admin\chem\3650\coding


In [4]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem


# Load dataset
csv_file = "filtered_data_updated_final2.csv"
output_folder = "xyz"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Read the CSV file
df = pd.read_csv(csv_file)

# Assuming the SMILES column is named 'solute_smiles'
if 'solute_smiles' not in df.columns:
    raise ValueError("CSV file must contain a column named 'solute_smiles'")

def smiles_to_xyz(solute_smiles, filename):
    """Converts a SMILES string to an XYZ file and saves it."""
    mol = Chem.MolFromSmiles(solute_smiles)
    if mol is None:
        print(f"Skipping invalid SMILES: {solute_smiles}")
        return

    # Add Hydrogens
    mol = Chem.AddHs(mol)

    # Generate 3D coordinates
    AllChem.EmbedMolecule(mol, AllChem.ETKDG())
    AllChem.UFFOptimizeMolecule(mol)

    # Extract atom coordinates
    conf = mol.GetConformer()
    num_atoms = mol.GetNumAtoms()

    with open(filename, "w") as f:
        f.write(f"{num_atoms}\nGenerated by RDKit\n")
        for i in range(num_atoms):
            atom = mol.GetAtomWithIdx(i)
            pos = conf.GetAtomPosition(i)
            f.write(f"{atom.GetSymbol()} {pos.x:.6f} {pos.y:.6f} {pos.z:.6f}\n")

# Process each SMILES entry
for index, row in df.iterrows():
    solute_smiles = row['solute_smiles']
    solute_inchikey = row['solute_inchikey'] 
    output_file = os.path.join(output_folder, f"{solute_inchikey}.xyz")
    smiles_to_xyz(solute_smiles, output_file)

print(f"XYZ files saved in '{output_folder}'")



[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[15:07:03] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4 5 6 7 8
[15:07:03] Can't kekulize mol.  Unkekulized a

Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n1cnnc1
Skipping invalid SMILES: n2c1ccccc1nc2C
Skipping invalid SMILES: n2c1ccccc1nc2C
Skipping invalid SMILES: n2c1ccccc1nc2C
Skipping invalid SMILES: n2c1ccccc1nc2C
Skipping invalid SMILES: n2c1ccccc1nc2C
Skipping invalid SMILES: n2c1ccccc1nc2C
Skipping invalid SMILES: n2c1ccccc1nc2C
Skipping invalid SMILES: n2c1ccccc1nc2C


[15:07:05] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13 14 15 16 17
[15:07:05] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13 14 15 16 17


Skipping invalid SMILES: [O-][N+](=O)c3ncn(c3Sc1ncnc2ncnc12)C
Skipping invalid SMILES: [O-][N+](=O)c3ncn(c3Sc1ncnc2ncnc12)C


[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15
[15:07:06] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15


Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C
Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C


[15:07:08] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19 20 21 22 23
[15:07:08] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19 20 21 22 23
[15:07:08] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19 20 21 22 23
[15:07:08] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19 20 21 22 23
[15:07:08] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19 20 21 22 23
[15:07:08] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19 20 21 22 23
[15:07:08] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19 20 21 22 23
[15:07:08] Can't kekulize mol.  Unkekulized atoms: 15 16 17 18 19 20 21 22 23


Skipping invalid SMILES: FC(F)(F)COc1c(c(ncc1)CS(=O)c3nc2ccccc2n3)C
Skipping invalid SMILES: FC(F)(F)COc1c(c(ncc1)CS(=O)c3nc2ccccc2n3)C
Skipping invalid SMILES: FC(F)(F)COc1c(c(ncc1)CS(=O)c3nc2ccccc2n3)C
Skipping invalid SMILES: FC(F)(F)COc1c(c(ncc1)CS(=O)c3nc2ccccc2n3)C
Skipping invalid SMILES: FC(F)(F)COc1c(c(ncc1)CS(=O)c3nc2ccccc2n3)C
Skipping invalid SMILES: FC(F)(F)COc1c(c(ncc1)CS(=O)c3nc2ccccc2n3)C
Skipping invalid SMILES: FC(F)(F)COc1c(c(ncc1)CS(=O)c3nc2ccccc2n3)C
Skipping invalid SMILES: FC(F)(F)COc1c(c(ncc1)CS(=O)c3nc2ccccc2n3)C


[15:08:21] Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 26
[15:08:21] Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 26
[15:08:21] Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 26
[15:08:21] Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 26


Skipping invalid SMILES: O=C(O)[C@@H](N(C(=O)CCCC)Cc3ccc(c1ccccc1c2nnnn2)cc3)C(C)C
Skipping invalid SMILES: O=C(O)[C@@H](N(C(=O)CCCC)Cc3ccc(c1ccccc1c2nnnn2)cc3)C(C)C
Skipping invalid SMILES: O=C(O)[C@@H](N(C(=O)CCCC)Cc3ccc(c1ccccc1c2nnnn2)cc3)C(C)C
Skipping invalid SMILES: O=C(O)[C@@H](N(C(=O)CCCC)Cc3ccc(c1ccccc1c2nnnn2)cc3)C(C)C


[15:09:16] Can't kekulize mol.  Unkekulized atoms: 11 12 13 14 15


Skipping invalid SMILES: N#CN\C(=N/C)NCCSCc1ncnc1C


[15:10:23] Can't kekulize mol.  Unkekulized atoms: 22 23 24 25 26


Skipping invalid SMILES: O=C(O)[C@@H](N(C(=O)CCCC)Cc3ccc(c1ccccc1c2nnnn2)cc3)C(C)C
XYZ files saved in 'xyz'


In [17]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('filtered_data_updated2.csv')

# Define the list of SMILES strings to filter
smiles_to_keep = [
    r'n1cnnc1',  # Example SMILES string
    r'n2c1ccccc1nc2C',  
    r'[O-][N+](=O)c3ncn(c3Sc1ncnc2ncnc12)C',
    r'N#CN\C(=N/C)NCCSCc1ncnc1C',
    r'FC(F)(F)COc1c(c(ncc1)CS(=O)c3nc2ccccc2n3)C',
    r'O=C(O)[C@@H](N(C(=O)CCCC)Cc3ccc(c1ccccc1c2nnnn2)cc3)C(C)C',
    r'N#CN\C(=N/C)NCCSCc1ncnc1C',
    r'O=C(O)[C@@H](N(C(=O)CCCC)Cc3ccc(c1ccccc1c2nnnn2)cc3)C(C)C'
]

# Filter the DataFrame
filtered_df = df[df['solute_smiles'].isin(smiles_to_keep)]

# Save to a new CSV file
filtered_df.to_csv('failed_smiles.csv', index=False)

print("Filtered CSV file created successfully!")


Filtered CSV file created successfully!


In [9]:
import sys
print(sys.prefix)

C:\Users\emmad\anaconda3


In [21]:
!conda install -c conda-forge openbabel

Channels:
 - conda-forge
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [23]:
!pip install requests



In [1]:
import pandas as pd
import openbabel
import pybel

# Function to convert InChIKey to SMILES using OpenBabel
def inchikey_to_smiles(inchikey):
    try:
        # Create an OpenBabel molecule from the InChIKey
        mol = pybel.readstring("inchi", f"InChI=1S/{inchikey}/")
        
        # Return the SMILES representation of the molecule
        if mol:
            return mol.write("smiles").strip()
        else:
            return None  # Return None if InChIKey is not valid
    except Exception as e:
        # Return None if there's an error in conversion
        print(f"Error converting InChIKey {inchikey}: {e}")
        return None

# Load the CSV file containing InChIKeys
df = pd.read_csv('failed_smiles.csv')

# Convert InChIKeys to SMILES (using the 'solute_inchikeys' column)
df['solute_smiles'] = df['solute_inchikeys'].apply(inchikey_to_smiles)

# Drop rows where SMILES couldn't be retrieved (i.e., None)
df = df.dropna(subset=['solute_smiles'])

# Save the updated CSV file with the SMILES
df.to_csv('resolved_smiles.csv', index=False)

print("Conversion complete. The new file is 'resolved_smiles.csv'.")



ModuleNotFoundError: No module named 'openbabel'

In [None]:
for index, row in df.iterrows():
    smiles = row['SMILES']
    molecule_name = f"molecule_{index}.xyz"
    
    # Run Open Babel command to convert SMILES to XYZ
    command = f'obabel -:"{smiles}" -O {molecule_name} --gen3d'
    subprocess.run(command, shell=True)

print("XYZ file generation complete!")