<a href="https://colab.research.google.com/github/danicardonaibz/molecular_equilibrium/blob/main/ML_Material_Science.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project in machine Learning for Materials Science

https://github.com/tonyreina/chemistry/blob/main/chemistry_predict_logP_tensorflow.ipynb

In [2]:
# First we will install some dependencies
%pip install rdkit
%pip install py3Dmol
%pip install git+https://github.com/samoturk/mol2vec

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.5 MB)
[K     |████████████████████████████████| 29.5 MB 1.4 MB/s 
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting py3Dmol
  Downloading py3Dmol-1.8.1-py2.py3-none-any.whl (6.5 kB)
Installing collected packages: py3Dmol
Successfully installed py3Dmol-1.8.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/samoturk/mol2vec
  Cloning https://github.com/samoturk/mol2vec to /tmp/pip-req-build-hpg3o_yk
  Running command git clone -q https://github.com/samoturk/mol2vec /tmp/pip-req-build-hpg3o_yk
Collecting jedi>=0.10
  Downloading jedi-0.18.1-py2.py3-none-any.whl (1.6 MB)

In [3]:
import pandas as pd
import numpy as np
import re
from google.colab import drive
from itertools import zip_longest
from collections import defaultdict

import json
from matplotlib import pyplot as plt
import seaborn as sns
from rdkit import Chem

## Step 1: Loading our data

In [4]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df = pd.read_json("/content/drive/MyDrive/m1507656/df_62k.json", orient='split')

In [6]:
df.head(4)

Unnamed: 0,refcode_csd,canonical_smiles,inchi,number_of_atoms,xyz_pbe_relaxed,energies_occ_pbe,energies_occ_pbe0_vac_tier2,energies_occ_pbe0_water,energies_occ_pbe0_vac_tzvp,energies_occ_pbe0_vac_qzvp,...,energies_unocc_gw_qzvp,cbs_unocc_gw,total_energy_pbe,total_energy_pbe0_vac_tier2,total_energy_pbe0_water,total_energy_pbe0_vac_tzvp,total_energy_pbe0_vac_qzvp,hirshfeld_pbe,hirshfeld_pbe0_vac_tier2,hirshfeld_pbe0_water
0,ABAFEQ,Cc1ccc(cc1)S(=O)(=O)N[C@H](c1nnc(o1)SCc1ccc(cc...,InChI=1S/C18H18ClN3O3S2/c1-12-3-9-16(10-4-12)2...,45,45\n\nO 39.55724515 33.75271314 ...,"[-2759.71553, -2416.30896, -2412.55736, -513.9...","[-2788.05127, -2443.36662, -2439.20276, -526.4...",,,,...,,,-63908.261677,-63911.48593,,,,"[-0.27470711000000003, 0.44228032, -0.26425776...","[-0.29802765000000003, 0.47691104, -0.28757556...",
1,ABEDOC,N#CC(=C(c1ccccc1)c1ccccc1)[C@H](c1ccccc1)NS(=O...,InChI=1S/C28H22N2O2S/c29-21-26(27(22-13-5-1-6-...,55,55\n\nC 8.74281024 13.44863575 ...,"[-2416.32097, -510.97599, -510.93427, -382.106...","[-2443.36734, -523.36541, -523.31866, -392.998...",,,,...,,,-47344.131203,-47346.537633,,,,"[-0.04383635, 0.0489517, -0.04317942, 0.049465...","[-0.04511281, 0.05043578, -0.04467607, 0.05086...",
2,LODZOT,n1ccc(cc1)c1nnc(o1)c1cccs1\t\n,InChI=1S/C11H7N3OS/c1-2-9(16-7-1)11-14-13-10(1...,23,23\n\nN 23.84904338 42.50577669 ...,"[-2412.75371, -513.88451, -382.11294, -382.041...","[-2439.38001, -526.39732, -393.09445, -393.002...",,,,...,,,-28915.979909,-28916.82663,,,,"[-0.15366001, 0.01755584, 0.05299325, -0.04774...","[-0.16233532, 0.02121239, 0.05594731, -0.06621...",
3,LUSREW,CC/C=C(/S(=O)(=O)c1ccc(cc1)C)\F\t\n,"InChI=1S/C11H13FO2S/c1-3-4-11(12)15(13,14)10-7...",28,28\n\nC 36.52340453 39.64067030 ...,"[-2416.01387, -662.1882, -510.96036, -510.9032...","[-2443.06569, -676.06906, -523.38098, -523.310...","[-2443.54535, -676.3259, -523.94531, -523.9294...",,,...,,,-29310.5254,-29311.929019,-29312.152528,,,"[-0.10553088000000001, 0.052016810000000004, 0...","[-0.10641357, 0.053639570000000004, 0.05095191...","[-0.10396546, 0.06349725, 0.0567905, 0.0529875..."


In [7]:
df.reset_index(inplace=True, drop = True)

In [8]:
df.columns.tolist()

['refcode_csd',
 'canonical_smiles',
 'inchi',
 'number_of_atoms',
 'xyz_pbe_relaxed',
 'energies_occ_pbe',
 'energies_occ_pbe0_vac_tier2',
 'energies_occ_pbe0_water',
 'energies_occ_pbe0_vac_tzvp',
 'energies_occ_pbe0_vac_qzvp',
 'energies_occ_gw_tzvp',
 'energies_occ_gw_qzvp',
 'cbs_occ_gw',
 'energies_unocc_pbe',
 'energies_unocc_pbe0_vac_tier2',
 'energies_unocc_pbe0_water',
 'energies_unocc_pbe0_vac_tzvp',
 'energies_unocc_pbe0_vac_qzvp',
 'energies_unocc_gw_tzvp',
 'energies_unocc_gw_qzvp',
 'cbs_unocc_gw',
 'total_energy_pbe',
 'total_energy_pbe0_vac_tier2',
 'total_energy_pbe0_water',
 'total_energy_pbe0_vac_tzvp',
 'total_energy_pbe0_vac_qzvp',
 'hirshfeld_pbe',
 'hirshfeld_pbe0_vac_tier2',
 'hirshfeld_pbe0_water']

## Step 2: Preprocessing the data

### 2.1: Subsetting the dataframe

In [9]:
relevant_columns = ['refcode_csd', 'canonical_smiles', 'xyz_pbe_relaxed', 'total_energy_pbe']
df_subset = df[relevant_columns]
df_subset.head()

Unnamed: 0,refcode_csd,canonical_smiles,xyz_pbe_relaxed,total_energy_pbe
0,ABAFEQ,Cc1ccc(cc1)S(=O)(=O)N[C@H](c1nnc(o1)SCc1ccc(cc...,45\n\nO 39.55724515 33.75271314 ...,-63908.261677
1,ABEDOC,N#CC(=C(c1ccccc1)c1ccccc1)[C@H](c1ccccc1)NS(=O...,55\n\nC 8.74281024 13.44863575 ...,-47344.131203
2,LODZOT,n1ccc(cc1)c1nnc(o1)c1cccs1\t\n,23\n\nN 23.84904338 42.50577669 ...,-28915.979909
3,LUSREW,CC/C=C(/S(=O)(=O)c1ccc(cc1)C)\F\t\n,28\n\nC 36.52340453 39.64067030 ...,-29310.5254
4,NOMBEA01,OC(=O)c1cc(N(=O)=O)c(c(c1)N(=O)=O)C\t\n,22\n\nC 3.95256786 17.67855833 24.82877401 \nN...,-23659.206412


### 2.2: Cleaning canonical SMILES
They happen to have \t\n characters to indicate end of line. These are going to be removed using a regex expression to match the rest of the strings.

In [10]:
df_subset["canonical_smiles"]

0        Cc1ccc(cc1)S(=O)(=O)N[C@H](c1nnc(o1)SCc1ccc(cc...
1        N#CC(=C(c1ccccc1)c1ccccc1)[C@H](c1ccccc1)NS(=O...
2                           n1ccc(cc1)c1nnc(o1)c1cccs1\t\n
3                      CC/C=C(/S(=O)(=O)c1ccc(cc1)C)\F\t\n
4                  OC(=O)c1cc(N(=O)=O)c(c(c1)N(=O)=O)C\t\n
                               ...                        
61484    COC(=O)c1c(c(C(=O)OC)c(c(c1C(C)(C)C)C(=O)OC)C(...
61485             O=C(c1ccccc1O)N/N=C/c1ccc(c(c1)Cl)Cl\t\n
61486         C1=CC=C2C[C]([CH]1)[C]1[CH]C=CC=C(C1)CC2\t\n
61487    Cc1ccc(cc1)S(=O)(=O)N[C@H](C(=O)N1C(=O)OC[C@@H...
61488    OC[C@H]1O[C@H]([C@@H]([C@@H]1O)O)c1n[nH]c2c1nc...
Name: canonical_smiles, Length: 61489, dtype: object

In [11]:
regex = "\\t\\n"
df_subset["canonical_smiles"]=df_subset["canonical_smiles"].apply(lambda x: re.sub(regex, "", x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
df_subset.head(2)

Unnamed: 0,refcode_csd,canonical_smiles,xyz_pbe_relaxed,total_energy_pbe
0,ABAFEQ,Cc1ccc(cc1)S(=O)(=O)N[C@H](c1nnc(o1)SCc1ccc(cc...,45\n\nO 39.55724515 33.75271314 ...,-63908.261677
1,ABEDOC,N#CC(=C(c1ccccc1)c1ccccc1)[C@H](c1ccccc1)NS(=O...,55\n\nC 8.74281024 13.44863575 ...,-47344.131203


###2.2: Converting SMILES strings to RD-Kit molecules

In [13]:
df_subset["mol"] = df_subset["canonical_smiles"].apply(lambda x: (Chem.MolFromSmiles(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


RD-Kit outputs NoneType when fails to parse a SMILES string. Thus, to prevent exceptions when applying further transformations to the dataset, conflicting molecules are going to be removed.

In [14]:
df_subset_clean = df_subset[df_subset['mol'].apply(lambda x: x is not None)].reset_index(drop=True)
len(df_subset), len(df_subset_clean)

(61489, 61037)

In [15]:
# Adding hydrogen atoms to the molecules
df_subset_clean['mol'] = df_subset_clean['mol'].apply(lambda x: Chem.AddHs(x))

In [16]:
df_subset_clean.head(2)

Unnamed: 0,refcode_csd,canonical_smiles,xyz_pbe_relaxed,total_energy_pbe,mol
0,ABAFEQ,Cc1ccc(cc1)S(=O)(=O)N[C@H](c1nnc(o1)SCc1ccc(cc...,45\n\nO 39.55724515 33.75271314 ...,-63908.261677,<rdkit.Chem.rdchem.Mol object at 0x7f8c2d432940>
1,ABEDOC,N#CC(=C(c1ccccc1)c1ccccc1)[C@H](c1ccccc1)NS(=O...,55\n\nC 8.74281024 13.44863575 ...,-47344.131203,<rdkit.Chem.rdchem.Mol object at 0x7f8c2d432a30>


###2.3: Reshaping XYZ coordinates
TODO: apply MBTR to xyz to make a proper descriptor out of the atomic positions


In [41]:
df.iloc[0].xyz_pbe_relaxed

'45\n\nO      39.55724515      33.75271314      38.40917207 \nS      40.85906611      34.05164612      38.96498639 \nO      42.08066828      33.95284000      38.19595313 \nC      42.32571194      32.73322762      40.89834532 \nH      43.19968501      32.99032401      40.30295436 \nCl      45.20041227      29.36338131      46.43043646 \nC      41.05184934      33.09086636      40.45377016 \nC      42.45668317      32.04677947      42.10437138 \nH      43.45084814      31.76806349      42.45397500 \nC      41.33623713      31.70625411      42.87422993 \nC      40.06720668      32.07018496      42.39744704 \nH      39.18097405      31.80058002      42.97364913 \nC      39.91665528      32.75758494      41.19816166 \nH      38.92751728      33.01994634      40.82523353 \nC      41.48361320      30.98558291      44.18381422 \nH      42.41118162      30.40352859      44.22479339 \nH      40.63985814      30.30766109      44.36544785 \nH      41.51361125      31.70368231      45.01623603 \nC 

In [17]:
df_subset_clean.iloc[0].xyz_pbe_relaxed.split()[1:9]

['O',
 '39.55724515',
 '33.75271314',
 '38.40917207',
 'S',
 '40.85906611',
 '34.05164612',
 '38.96498639']

In [18]:
df_subset_clean.iloc[0].xyz_pbe_relaxed.split()[:10]

['45',
 'O',
 '39.55724515',
 '33.75271314',
 '38.40917207',
 'S',
 '40.85906611',
 '34.05164612',
 '38.96498639',
 'O']

In [19]:
def grouper(iterable, n, fillvalue=None):
  """
  This function splits the xyz array into N groups of 3 components, where N is
  the number of atoms in the molecule
  """
  args = [iter(iterable)] * n
  return list(zip_longest(*args, fillvalue=fillvalue))

In [20]:
def positions_to_dict(raw_xyz):
  """
  Takes in a string of xyz atom positions and returns a dict of atom positions
  """
  # We use defaultdict dict subclass because instead of raising a KeyError provides 
  # a default value for the key that does not exist. In this case, we want a list-like
  # object
  data_dict = defaultdict(list) # Specify what are the value types

  iter_array =  [iter(raw_xyz.split()[1:])] * 4 # Multiplied by 4 because [atom name, x, y z]
  grouped_xyz = list(zip_longest(*iter_array, fillvalue=None))

  for i, element in enumerate(grouped_xyz):
    data_dict[element[0]].append(list(element[1:]))
  
  return dict(data_dict)

In [21]:
df_subset_clean['xyz_pbe_relaxed'] = df_subset_clean['xyz_pbe_relaxed'].apply(positions_to_dict)

In [22]:
df_subset_clean['xyz_pbe_relaxed'].iloc[1]

{'C': [['8.74281024', '13.44863575', '23.16378372'],
  ['10.11074978', '13.19356923', '23.08694135'],
  ['10.78160584', '11.71087995', '26.56899447'],
  ['10.09912389', '12.35237176', '25.36230584'],
  ['8.72448552', '12.61600039', '25.43519330'],
  ['8.05098811', '13.15651134', '24.34266310'],
  ['10.78568512', '12.64534155', '24.18069782'],
  ['11.67094309', '6.71025632', '26.43329001'],
  ['10.61726558', '6.69117798', '25.51596520'],
  ['10.31077367', '7.84372826', '24.79244796'],
  ['11.03704148', '9.01628435', '24.99640142'],
  ['12.07051104', '9.77255700', '29.68626470'],
  ['13.29606260', '9.28321813', '30.13860887'],
  ['14.23360836', '10.14988375', '30.70242936'],
  ['13.95141000', '11.51436592', '30.81173061'],
  ['12.73005466', '12.01524457', '30.36397898'],
  ['11.79679134', '11.13595322', '29.80757459'],
  ['12.28663726', '11.49056361', '26.40928961'],
  ['13.05277476', '12.67860849', '26.58351282'],
  ['12.87960329', '10.27851286', '26.18676214'],
  ['12.08815605', '9.053

In [23]:
df_subset_clean.head(2)

Unnamed: 0,refcode_csd,canonical_smiles,xyz_pbe_relaxed,total_energy_pbe,mol
0,ABAFEQ,Cc1ccc(cc1)S(=O)(=O)N[C@H](c1nnc(o1)SCc1ccc(cc...,"{'O': [['39.55724515', '33.75271314', '38.4091...",-63908.261677,<rdkit.Chem.rdchem.Mol object at 0x7f8c2d432940>
1,ABEDOC,N#CC(=C(c1ccccc1)c1ccccc1)[C@H](c1ccccc1)NS(=O...,"{'C': [['8.74281024', '13.44863575', '23.16378...",-47344.131203,<rdkit.Chem.rdchem.Mol object at 0x7f8c2d432a30>


### 2.4: Enriching the data with more features

In [24]:

from rdkit.Chem import Descriptors

df_subset_clean['tpsa'] = df_subset_clean['mol'].apply(lambda x: Descriptors.TPSA(x)) #https://en.wikipedia.org/wiki/Polar_surface_area
df_subset_clean['mol_w'] = df_subset_clean['mol'].apply(lambda x: Descriptors.ExactMolWt(x)) # https://en.wikipedia.org/wiki/Molecular_mass
df_subset_clean['num_valence_electrons'] = df_subset_clean['mol'].apply(lambda x: Descriptors.NumValenceElectrons(x)) # https://en.wikipedia.org/wiki/Valence_electron
df_subset_clean['num_heteroatoms'] = df_subset_clean['mol'].apply(lambda x: Descriptors.NumHeteroatoms(x))

In [25]:
df_subset_clean.head()

Unnamed: 0,refcode_csd,canonical_smiles,xyz_pbe_relaxed,total_energy_pbe,mol,tpsa,mol_w,num_valence_electrons,num_heteroatoms
0,ABAFEQ,Cc1ccc(cc1)S(=O)(=O)N[C@H](c1nnc(o1)SCc1ccc(cc...,"{'O': [['39.55724515', '33.75271314', '38.4091...",-63908.261677,<rdkit.Chem.rdchem.Mol object at 0x7f8c2d432940>,85.09,423.047811,142,9
1,ABEDOC,N#CC(=C(c1ccccc1)c1ccccc1)[C@H](c1ccccc1)NS(=O...,"{'C': [['8.74281024', '13.44863575', '23.16378...",-47344.131203,<rdkit.Chem.rdchem.Mol object at 0x7f8c2d432a30>,69.96,450.140199,162,5
2,LODZOT,n1ccc(cc1)c1nnc(o1)c1cccs1,"{'N': [['23.84904338', '42.50577669', '35.0388...",-28915.979909,<rdkit.Chem.rdchem.Mol object at 0x7f8c2d4329e0>,51.81,229.030983,78,5
3,LUSREW,CC/C=C(/S(=O)(=O)c1ccc(cc1)C)\F,"{'C': [['36.52340453', '39.64067030', '25.1692...",-29310.5254,<rdkit.Chem.rdchem.Mol object at 0x7f8c2d4328f0>,34.14,228.062029,82,4
4,NOMBEA01,OC(=O)c1cc(N(=O)=O)c(c(c1)N(=O)=O)C,"{'C': [['3.95256786', '17.67855833', '24.82877...",-23659.206412,<rdkit.Chem.rdchem.Mol object at 0x7f8c2d432a80>,123.58,226.022586,84,8


In [26]:
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D

import random

In [27]:
# Plotting some random molecules of the dataset
dopts = rdMolDraw2D.MolDrawOptions()
filename = 'mymolecules'

number_of_molecules = 6
to_draw = random.sample(range(0, len(df_subset_clean)), number_of_molecules)

mols = df_subset_clean["mol"][to_draw]
img = Draw.MolsToGridImage(mols, molsPerRow=3, useSVG=True,
                           legends=list(df["canonical_smiles"][to_draw].values),
                           drawOptions=dopts)

# Saving the file
if filename is not None:
    if not filename.endswith('.svg'):
        filename += '.svg'
    with open(filename, 'w') as f:
        f.write(img)

## Step 3: Deep Learning

### 3.1: Embedding SMILES for DL processing
TODO: Put this process in a preprocessing Keras layer

First, we're going to embed the molecules using a pre-trained model

In [28]:
import requests

url = 'https://github.com/samoturk/mol2vec_notebooks/raw/master/Notebooks/model_300dim.pkl'
file_to_download = requests.get(url, allow_redirects=True)

open('model_300dim.pkl', 'wb').write(file_to_download.content)

26567327

In [29]:
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec

In [30]:
#Loading pre-trained model via word2vec
from gensim.models import word2vec  # https://radimrehurek.com/gensim/index.html#install
w2vec_model = word2vec.Word2Vec.load('model_300dim.pkl') # Download from https://github.com/samoturk/mol2vec_notebo

In [31]:
#Constructing sentences
df_subset_clean['sentence'] = df_subset_clean.apply(lambda x: MolSentence(mol2alt_sentence(x['mol'], 1)), axis=1)

#Extracting embeddings to a numpy.array
#Note that we always should mark unseen='UNK' in sentence2vec() so that model is taught how to handle unknown substructures
df_subset_clean['mol2vec'] = [DfVec(x) for x in sentences2vec(df_subset_clean['sentence'], w2vec_model, unseen='UNK')]

In [32]:
df_subset_clean['mol2vec']=df_subset_clean['mol2vec'].apply(lambda x: x.vec)

In [33]:
df_subset_clean.columns.values.tolist()

['refcode_csd',
 'canonical_smiles',
 'xyz_pbe_relaxed',
 'total_energy_pbe',
 'mol',
 'tpsa',
 'mol_w',
 'num_valence_electrons',
 'num_heteroatoms',
 'sentence',
 'mol2vec']

In [34]:
final_dataset_columns=['refcode_csd',
 'canonical_smiles',
 'xyz_pbe_relaxed',
 'tpsa',
 'mol_w',
 'num_valence_electrons',
 'num_heteroatoms',
 'mol2vec',
'total_energy_pbe']

final_dataset = df_subset_clean[final_dataset_columns]
final_dataset

Unnamed: 0,refcode_csd,canonical_smiles,xyz_pbe_relaxed,tpsa,mol_w,num_valence_electrons,num_heteroatoms,mol2vec,total_energy_pbe
0,ABAFEQ,Cc1ccc(cc1)S(=O)(=O)N[C@H](c1nnc(o1)SCc1ccc(cc...,"{'O': [['39.55724515', '33.75271314', '38.4091...",85.09,423.047811,142,9,"[-9.885212, 7.283, -11.909391, 2.714392, -7.30...",-63908.261677
1,ABEDOC,N#CC(=C(c1ccccc1)c1ccccc1)[C@H](c1ccccc1)NS(=O...,"{'C': [['8.74281024', '13.44863575', '23.16378...",69.96,450.140199,162,5,"[-10.063227, 10.888024, -20.65452, 0.42244783,...",-47344.131203
2,LODZOT,n1ccc(cc1)c1nnc(o1)c1cccs1,"{'N': [['23.84904338', '42.50577669', '35.0388...",51.81,229.030983,78,5,"[-5.338256, 3.8927343, -6.896216, -0.27919152,...",-28915.979909
3,LUSREW,CC/C=C(/S(=O)(=O)c1ccc(cc1)C)\F,"{'C': [['36.52340453', '39.64067030', '25.1692...",34.14,228.062029,82,4,"[-4.85507, 5.27727, -9.770724, 2.1801903, -2.5...",-29310.525400
4,NOMBEA01,OC(=O)c1cc(N(=O)=O)c(c(c1)N(=O)=O)C,"{'C': [['3.95256786', '17.67855833', '24.82877...",123.58,226.022586,84,8,"[-3.7424397, 6.9630003, -7.5874896, 3.2785816,...",-23659.206412
...,...,...,...,...,...,...,...,...,...
61032,VIGDOE,COC(=O)c1c(c(C(=O)OC)c(c(c1C(C)(C)C)C(=O)OC)C(...,"{'C': [['22.26969809', '41.92804704', '35.4693...",78.90,420.251189,168,6,"[-11.613864, 11.018429, -15.99478, 4.44409, -5...",-37759.078750
61033,FOMSEG,O=C(c1ccccc1O)N/N=C/c1ccc(c(c1)Cl)Cl,"{'O': [['33.93699398', '43.81820335', '40.7349...",61.69,308.011933,102,6,"[-6.1590996, 6.9757314, -8.373731, 0.45949158,...",-46914.426056
61034,FOPCOC,C1=CC=C2C[C]([CH]1)[C]1[CH]C=CC=C(C1)CC2,"{'C': [['22.97323141', '39.02044458', '27.9314...",0.00,208.125201,80,0,"[-10.404796, 8.853079, -10.362222, -3.2704961,...",-16847.126386
61035,VINMOV,Cc1ccc(cc1)S(=O)(=O)N[C@H](C(=O)N1C(=O)OC[C@@H...,"{'O': [['56.17414946', '59.56569287', '49.1532...",92.78,576.012133,182,10,"[-10.337876, 9.518248, -17.89608, 0.178429, -7...",-134363.234235


# DFT calculations are expensive, so we would like to built a model capable of infering the molecule energy from their atom positions and species