In [1]:
!pip install selfies

Collecting selfies
  Downloading selfies-2.0.0-py3-none-any.whl (33 kB)
Installing collected packages: selfies
Successfully installed selfies-2.0.0


In [3]:
import selfies as sf

# Aromatic SMILES string for benzene, translated to SELFIES below.
benzene = "c1ccccc1"
sf.encoder(smiles=benzene)

'[C][=C][C][=C][C][=C][Ring1][=Branch1]'

In [9]:
import selfies as sf

dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"]

# Collect every symbol that occurs in the dataset, then add the padding symbol.
alphabet = sf.get_alphabet_from_selfies(dataset)
alphabet.add("[nop]")  # [nop] is a special padding symbol
alphabet = sorted(alphabet)  # ['[=O]', '[C]', '[F]', '[O]', '[nop]']

pad_to_len = max(sf.len_selfies(s) for s in dataset)  # 5
symbol_to_idx = {s: i for i, s in enumerate(alphabet)}

dimethyl_ether = dataset[0]  # [C][O][C] (not the molecule encoded below)
difluoromethane = dataset[1]  # [F][C][F] -- the molecule that is actually encoded

# enc_type="both" returns the integer label encoding and the one-hot encoding.
label, one_hot = sf.selfies_to_encoding(
   selfies=difluoromethane,
   vocab_stoi=symbol_to_idx,
   pad_to_len=pad_to_len,
   enc_type="both"
)

label

[2, 1, 2, 4, 4]

In [12]:
# Tokenise the SELFIES string and print one symbol per line.
print(*sf.split_selfies("[C][O][C]"), sep="\n")

[C]
[O]
[C]


In [None]:
# Label-encode dimethyl ether ([C][O][C]) with the vocabulary built earlier in
# this notebook. The function requires its arguments; calling it bare raises
# TypeError and would stop a Restart & Run All.
sf.selfies_to_encoding(
    selfies=dataset[0],
    vocab_stoi=symbol_to_idx,
    pad_to_len=pad_to_len,
    enc_type="label",
)

In [2]:
# Import only the helper this cell uses instead of star-importing the module,
# so it is clear where `state_versions` comes from.
from wwf.utils import state_versions

# Report the installed versions of the libraries this notebook depends on.
state_versions(['fastai', 'fastcore'])


---
This article is also a Jupyter Notebook available to be run from the top down. There
will be code snippets that you can then run in any environment.

Below are the versions of `fastai` and `fastcore` currently running at the time of writing this:
* `fastai` : 2.5.3 
* `fastcore` : 1.3.27 
---

In [3]:
from fastai.tabular.all import *

In [4]:
# Fetch the Adult sample dataset via fastai's `untar_data`; `path` is the
# directory that holds the extracted files.
path = untar_data(URLs.ADULT_SAMPLE)
# Load the table that every later cell works from.
df = pd.read_csv(path/'adult.csv')

In [8]:
# Categorical input columns.
cat_names = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
]
# Continuous input columns.
cont_names = [
    'age',
    'fnlwgt',
    'education-num',
]
# Preprocessing steps handed to TabularPandas.
procs = [
    Categorify,
    FillMissing,
    Normalize,
]
# Target column and the block describing it.
y_names = 'salary'
y_block = CategoryBlock()
# Random train/validation split over the row indices of `df` (no seed is set here).
splits = RandomSplitter()(range_of(df))

In [11]:
# Build the processed tabular dataset from the settings defined in the
# configuration cell (procs, y_names, y_block) rather than repeating literals
# that could drift out of sync with them.
to = TabularPandas(df, procs=procs, cat_names=cat_names, cont_names=cont_names,
                   splits=splits, y_names=y_names, y_block=y_block)

In [13]:
# Create the DataLoaders for the processed table with a batch size of 1024.
dls = to.dataloaders(bs=1024)

In [15]:
# Baseline classifier: two hidden layers (200 and 100 units), tracked with accuracy.
learn = tabular_learner(
    dls,
    layers=[200, 100],
    metrics=[accuracy],
)
# Five epochs at a learning rate of 1e-2.
learn.fit(n_epoch=5, lr=1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.468357,0.4338,0.784705,00:01
1,0.395662,0.364002,0.830006,00:01
2,0.371696,0.361474,0.829699,00:00
3,0.359759,0.357519,0.834152,00:00
4,0.352586,0.35961,0.836456,00:00


In [16]:
class ReadTabBatchIdentity(ItemTransform):
    "Turn a tabular batch into tensors, emitting the inputs a second time as the targets"
    def __init__(self, to):
        store_attr()

    def encodes(self, to):
        # Inputs and targets are converted separately, so they are distinct
        # tensors holding the same values.
        if to.with_cont:
            inputs = (tensor(to.cats).long(), tensor(to.conts).float())
            targets = (tensor(to.cats).long(), tensor(to.conts).float())
        else:
            # No continuous columns: only the categorical codes are emitted.
            inputs = (tensor(to.cats).long(),)
            targets = (tensor(to.cats).long(),)
        res = inputs + targets
        # Move everything to the dataset's device when one is configured.
        if to.device is not None:
            res = to_device(res, to.device)
        return res
    
class TabularPandasIdentity(TabularPandas):
    "A `TabularPandas` subclass that adds no behaviour; it is a distinct type to attach a custom DataLoader class to"

In [17]:
@delegates()
class TabDataLoaderIdentity(TabDataLoader):
    "A transformed `DataLoader` for AutoEncoder problems with Tabular data"
    # Items are passed through untouched; batches are assembled in `create_batch`.
    do_item = noops

    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # Default batch pipeline: the stock TransformBlock batch transforms
        # followed by the identity reader, which emits inputs as targets too.
        if after_batch is None:
            default_tfms = L(TransformBlock().batch_tfms)
            after_batch = default_tfms + ReadTabBatchIdentity(dataset)
        super().__init__(
            dataset,
            bs=bs,
            shuffle=shuffle,
            after_batch=after_batch,
            num_workers=num_workers,
            **kwargs,
        )

    def create_batch(self, b):
        "Select the rows at positions `b` from the underlying dataset"
        return self.dataset.iloc[b]

In [18]:
# Register the identity DataLoader as the loader class for TabularPandasIdentity.
# NOTE(review): presumably `.dataloaders()` reads `_dl_type` to pick the class -- confirm against fastai.
TabularPandasIdentity._dl_type = TabDataLoaderIdentity

In [19]:
# Rebuild the processed table for the autoencoder: no target column is given,
# and the split uses a fixed seed. This rebinds `to` and `dls` from the
# classifier section above.
to = TabularPandasIdentity(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    splits=RandomSplitter(seed=32)(df),
)
dls = to.dataloaders(bs=1024)

Could not do one pass in your dataloader, there is something wrong in it


In [21]:
# Set the number of model inputs per batch to two.
# NOTE(review): presumably the categorical and continuous tensors, with the remaining batch elements as targets -- confirm.
dls.n_inp = 2

In [22]:
# Number of classes recorded for each categorical column.
total_cats = {
    col_name: len(col_classes)
    for col_name, col_classes in to.classes.items()
}
total_cats

{'workclass': 10,
 'education': 17,
 'marital-status': 8,
 'occupation': 16,
 'relationship': 7,
 'race': 6,
 'education-num_na': 3}

In [23]:
# One-row frames (one column per continuous variable) holding the
# means and stds stored on `to`.
means = pd.DataFrame.from_dict(
    {col_name: [col_mean] for col_name, col_mean in to.means.items()}
)
stds = pd.DataFrame.from_dict(
    {col_name: [col_std] for col_name, col_std in to.stds.items()}
)

In [24]:
# Smallest and largest raw value of each continuous column, standardised with
# the stored means/stds.
low = (
    df[cont_names].min().to_frame().T.values - means.values
) / stds.values
high = (
    df[cont_names].max().to_frame().T.values - means.values
) / stds.values

In [25]:
class RecreatedLoss(Module):
    "Reconstruction loss: summed cross-entropy over the categorical slices plus summed MSE over the continuous outputs"
    def __init__(self, cat_dict):
        # Both component losses use reduction='sum'; normalisation happens in `forward`.
        ce = CrossEntropyLossFlat(reduction='sum')
        mse = MSELossFlat(reduction='sum')
        store_attr('cat_dict,ce,mse')

    def forward(self, preds, cat_targs, cont_targs):
        cat_preds, cont_preds = preds

        # Walk the categorical predictions left to right: each variable owns a
        # contiguous slice whose width is its value in `cat_dict`.
        ce_sum = cat_preds.new([0])
        start = 0
        for col, width in enumerate(self.cat_dict.values()):
            stop = start + width
            ce_sum += self.ce(cat_preds[:, start:stop], cat_targs[:, col])
            start = stop

        # Divide each part by its number of variables ...
        n_cat_vars = cat_preds.new([len(self.cat_dict)])
        n_cont_vars = cont_preds.new([cont_preds.size(1)])
        cat_part = ce_sum / n_cat_vars
        cont_part = self.mse(cont_preds, cont_targs) / n_cont_vars

        # ... and the combined loss by the batch size.
        return (cat_part + cont_part) / cat_preds.size(0)

In [26]:
# Reconstruction loss configured with the per-column class counts.
loss_func = RecreatedLoss(total_cats)

In [27]:
class BatchSwapNoise(Module):
    "Swap noise: during training, replace a fraction `p` of entries with the entry from a random row of the same column"
    def __init__(self, p):
        store_attr()

    def forward(self, x):
        # The noise is applied in training mode only.
        if not self.training:
            return x

        shape, total = x.size(), x.nelement()
        # Entries selected for swapping (each with probability p).
        swap_mask = torch.rand(shape) > (1 - self.p)
        # Random number of rows to jump for every entry.
        row_jump = torch.floor(torch.rand(shape) * x.size(0)).type(torch.LongTensor)
        # In the flattened tensor, one row down is x.size(1) positions ahead;
        # unselected entries get a stride of zero and therefore stay in place.
        stride = swap_mask.type(torch.LongTensor) * x.size(1)
        offsets = (row_jump * stride).view(-1)

        idx = torch.arange(total) + offsets
        # Wrap indices that ran past the end back to the start.
        overflow = idx >= total
        idx[overflow] = idx[overflow] - total
        return x.flatten()[idx].view(shape)

In [28]:
class TabularAE(TabularModel):
    """A simple AutoEncoder model.

    The encoder is the `TabularModel` trunk (layers 1024 -> 512 -> 256) with its
    last layer replaced by a `LinBnDrop` that maps to `hidden_size`. The decoder
    mirrors it (hidden_size -> 256 -> 512 -> 1024) and ends in two heads: one for
    the continuous columns, squashed into `[low, high]`, and one emitting the
    concatenated outputs for every categorical column.

    `cats` maps each categorical column to its number of classes; `bswap` is the
    swap-noise probability (no noise layer is created when it is `None`).
    """
    def __init__(self, emb_szs, n_cont, hidden_size, cats, low, high, ps=0.2, embed_p=0.01, bswap=None):
        super().__init__(emb_szs, n_cont, layers=[1024, 512, 256], out_sz=hidden_size, embed_p=embed_p, act_cls=Mish())

        self.bswap = bswap
        self.cats = cats
        # Total width of the categorical head: one output per class of every column.
        self.activation_cats = sum(cats.values())

        # Drop the parent's final layer and put a Mish-activated bottleneck in its place.
        self.layers = nn.Sequential(*L(self.layers.children())[:-1] + nn.Sequential(LinBnDrop(256, hidden_size, p=ps, act=Mish())))

        if bswap is not None: self.noise = BatchSwapNoise(bswap)
        self.decoder = nn.Sequential(
            LinBnDrop(hidden_size, 256, p=ps, act=Mish()),
            LinBnDrop(256, 512, p=ps, act=Mish()),
            LinBnDrop(512, 1024, p=ps, act=Mish())
        )

        self.decoder_cont = nn.Sequential(
            LinBnDrop(1024, n_cont, p=ps, bn=False, act=None),
            SigmoidRange(low=low, high=high)
        )

        self.decoder_cat = LinBnDrop(1024, self.activation_cats, p=ps, bn=False, act=None)

    def forward(self, x_cat, x_cont=None, encode=False):
        """Return `(decoded_cats, decoded_conts)`, or the encoded representation when `encode=True`."""
        if self.bswap is not None:
            x_cat = self.noise(x_cat)
            # `x_cont` is optional; only add noise when it was actually supplied.
            if x_cont is not None: x_cont = self.noise(x_cont)
        encoded = super().forward(x_cat, x_cont)
        if encode: return encoded # return the representation
        decoded_trunk = self.decoder(encoded)
        decoded_cats = self.decoder_cat(decoded_trunk)
        decoded_conts = self.decoder_cont(decoded_trunk)
        return decoded_cats, decoded_conts

In [29]:
# Embedding sizes for the categorical columns, computed from the training subset.
emb_szs = get_emb_sz(to.train)

In [31]:
# Autoencoder with a 128-dimensional bottleneck and 10% swap noise; the
# continuous head is bounded by the standardised min/max computed earlier.
model = TabularAE(
    emb_szs=emb_szs,
    n_cont=len(cont_names),
    hidden_size=128,
    cats=total_cats,
    low=tensor(low).cpu(),
    high=tensor(high).cpu(),
    ps=0.1,
    embed_p=0.01,
    bswap=.1,
)

In [32]:
# Wrap the autoencoder in a Learner: reconstruction loss, weight decay 0.01,
# and `ranger` as the optimizer factory. This rebinds `learn` from the classifier section.
learn = Learner(dls, model, loss_func=loss_func, wd=0.01, opt_func=ranger)

In [33]:
# Train for up to 100 epochs at lr=4e-3; EarlyStoppingCallback is attached, so
# training can stop before the epoch budget is used up.
learn.fit_flat_cos(100, cbs=[EarlyStoppingCallback()], lr=4e-3)

epoch,train_loss,valid_loss,time
0,2.904803,1.844863,00:10
1,2.020573,1.086404,00:11
2,1.664814,0.957699,00:12
3,1.44985,0.825232,00:10
4,1.292033,0.748009,00:11
5,1.192967,0.70638,00:10
6,1.102586,0.631237,00:11
7,1.011222,0.502837,00:11
8,0.913575,0.37944,00:11
9,0.82444,0.279669,00:11


No improvement since epoch 11: early stopping


In [34]:
# Build a test DataLoader over the full DataFrame from the learner's DataLoaders.
dl = learn.dls.test_dl(df)

Could not do one pass in your dataloader, there is something wrong in it


In [None]:
# Put the model in eval mode on the CPU once, instead of on every batch.
learn.model.eval()
learn.model.cpu()

outs = []
with torch.no_grad():
    for batch in dl:
        # The first two batch elements are the categorical and continuous inputs;
        # move them to the CPU so they live on the same device as the model.
        x_cat, x_cont = (t.cpu() for t in batch[:2])
        # encode=True returns the bottleneck representation instead of the reconstruction.
        outs.append(learn.model(x_cat, x_cont, encode=True).cpu().numpy())
# One array holding the encoded representation of every row.
outs = np.concatenate(outs)