#### Concern: elements that appear only a handful of times

The FreeSolv database contains a few elements that appear in just a handful of molecules, and may appear as outliers. It might be a good idea to generate a series of mini-freesolvs, formed by filtering by common elements.

For example, {C, H, O} covers about 47% of the dataset and {C, H, N, O, S, Cl} covers about 87% of the dataset. Bromine, fluorine, phosphorus, and iodine each appear in only a small number of molecules, and compounds that contain these elements may appear as outliers.

In [1]:
import pandas as pd

# Load the FreeSolv table from a local HDF5 file. The cells below read its
# 'offmol' column, whose entries expose `.atoms`.
df = pd.read_hdf('freesolv.h5')



In [2]:
# Map each molecule key to the set of element symbols found among its atoms.
elements = {
    key: {atom.element.symbol for atom in df['offmol'][key].atoms}
    for key in df.index
}

In [3]:
# Union of the per-molecule element sets: every element seen in the dataset.
all_elements = set().union(*elements.values())
all_elements

{'Br', 'C', 'Cl', 'F', 'H', 'I', 'N', 'O', 'P', 'S'}

#### Sort elements by frequency

In [4]:
# Rank the elements by the number of molecules that contain them, most
# common first.
in_descending_order_of_popularity = sorted(
    all_elements,
    key=lambda symbol: sum(symbol in present for present in elements.values()),
    reverse=True,
)

In [5]:
# Two-column table: element symbol, then the number of molecules containing it.
print(f"{'element':<10}", '# of molecules containing it')
for element in in_descending_order_of_popularity:
    n_containing = sum(element in present for present in elements.values())
    print(f'{element:<10}', n_containing)

element    # of molecules containing it
C          639
H          629
O          344
N          169
Cl         114
S          40
F          35
Br         25
P          14
I          12


In [6]:
from typing import Set, List
def form_mini_freesolv(allowed_elements: Set[str]) -> List[str]:
    """Return the keys of the molecules built only from ``allowed_elements``.

    Walks the index of the module-level ``df`` and keeps every key whose
    ``offmol`` entry has no atom with an element symbol outside
    ``allowed_elements``. Keys come back in index order.
    """
    offmols = df['offmol']
    return [
        key
        for key in df.index
        if {atom.element.symbol for atom in offmols[key].atoms}.issubset(allowed_elements)
    ]

In [7]:
# Sanity check: number of molecules made up of C, H and O atoms only.
len(form_mini_freesolv({'C', 'H', 'O'}))

300

In [8]:
# Grow the allowed-element set one element at a time (most common first,
# starting from the top two) and record which molecules each set admits.
print('elements'.ljust(40), '# molecules'.ljust(15), 'coverage')
mini_free_solvs = dict()
n_total = len(df)
for n_allowed in range(2, len(in_descending_order_of_popularity) + 1):
    element_subset = set(in_descending_order_of_popularity[:n_allowed])
    name = ', '.join(sorted(element_subset))
    members = form_mini_freesolv(element_subset)
    mini_free_solvs[name] = members

    # One table row: the element set, its molecule count, and the share of df.
    row = (
        ('{' + name + '}').ljust(40),
        f'{len(members)}'.ljust(15),
        f'{(100 * len(members) / n_total):.1f}%',
    )
    print(*row)

elements                                 # molecules     coverage
{C, H}                                   103             16.0%
{C, H, O}                                300             46.7%
{C, H, N, O}                             431             67.1%
{C, Cl, H, N, O}                         529             82.4%
{C, Cl, H, N, O, S}                      559             87.1%
{C, Cl, F, H, N, O, S}                   591             92.1%
{Br, C, Cl, F, H, N, O, S}               616             96.0%
{Br, C, Cl, F, H, N, O, P, S}            630             98.1%
{Br, C, Cl, F, H, I, N, O, P, S}         642             100.0%
