In [1]:
import sys
sys.path.append('..')
from utils.dataset import MoleculeDataset, fpolyv2_collate_fn, to_tensor
from utils.data import stratified_train_test_split, DataScaler, CrossValidation
from utils.trainer import Trainer
from utils.runner import run
from utils.params import Parameters
from model.models import DualHeadModel
import torch, gc, os
from torch.utils.data import DataLoader
from torch.optim import AdamW
import numpy as np
from argparse import ArgumentParser

os.chdir('..')
p = Parameters(fn='concat/mult_add_cg.json', default='defaults_r100_n3c02.json', root='./params_sc')
p.tag = p.tag.replace('mult','dual')

dataset = MoleculeDataset(norm=p.normalize_feature)
dataset.generate_fpolyv2(path=p.data_path, col_target=p.target_column)

data = to_tensor(dataset.data, device=p.device)
stratum = np.array([len(d['F'])+len(d['C']) for d in data])
if hasattr(p, 'train_ratio') and isinstance(p.train_ratio, float):
    train_data_, test_data = stratified_train_test_split(data, stratum,
                                train_ratio=p.train_ratio, seed=p.random_state)    
elif hasattr(p, 'train_monomer'):
    remove_monomer = getattr(p, 'remove_monomer')
    train_data_, test_data = stratified_train_test_split(data, stratum,
                                train_stratum=p.train_monomer, remove_stratum=remove_monomer)

train_stratum = np.array([len(d['F'])+len(d['C']) for d in train_data_])
scaler = DataScaler(device=p.device)


  Overwriting attribute : norm
  Overwriting attribute : data
  Overwriting attribute : atom_feat_name
  Overwriting attribute : bond_feat_name
  Overwriting attribute : mol_feat_name
  Overwriting attribute : targets
  Overwriting attribute : tag
  Overwriting attribute : cache_fn


In [2]:

if p.cross_valid:
    k_fold = CrossValidation(n_fold=p.num_fold, data=train_data_, 
                             stratum=train_stratum, seed=p.random_state)
    p.num_repeat = p.num_fold

p.graph_net_params.update({
    'node_dim':dataset.num_atom_feat,
    'edge_dim':dataset.num_bond_feat,
})

p.mol_net_params.update({
    'input_dim':dataset.num_mol_feat,
})

p.decoder_params['output_dim'] = dataset.num_target

encoder_params = {
    'graph_net_params': p.graph_net_params,
    'mol_net_params': p.mol_net_params,
}


In [3]:

if p.cross_valid:
    train_data, valid_data = k_fold[0]
else:
    if hasattr(p, 'valid_ratio'):
        train_data, valid_data = stratified_train_test_split(train_data_, train_stratum,
                                        test_ratio=p.valid_ratio, seed=p.random_state)
    else:
        valid_data = test_data

scaler.train(train_data, collate_fn=fpolyv2_collate_fn)

scaled_train_data = scaler.scale_data(train_data)
scaled_valid_data = scaler.scale_data(valid_data)
scaled_test_data = scaler.scale_data(test_data)

train_dl = DataLoader(scaled_train_data, batch_size=p.batch_size, shuffle=True, collate_fn=fpolyv2_collate_fn)
valid_dl = DataLoader(scaled_valid_data, batch_size=512, collate_fn=fpolyv2_collate_fn)
test_dl  = DataLoader(scaled_test_data, batch_size=512, collate_fn=fpolyv2_collate_fn)

model = DualHeadModel(encoder_type=p.encoder_type, encoder_params=encoder_params, 
                        decoder_params=p.decoder_params, pooling=p.encoder_readout, 
                        shared=p.encoder_share)

model.to(p.device)

opt = AdamW(model.parameters(), lr=p.learning_rate)

In [4]:
for i in range(500):
    for batch in train_dl:
        feat, target, info = batch
 #       print(feat['feat_f']['mol_feat'].shape)
        pred = model(**feat)

In [7]:
train_smiles_f = [d['F']['smiles'] for d in train_data_]

TypeError: list indices must be integers or slices, not str

In [66]:
import pandas as pd
import numpy as np
df = pd.read_csv('./dataset/fpolymers_221123.csv')
df = df[~df.TG.isna()]
df[[f'FR_{x}' for x in 'ABCDE']] = df[[f'FR_{x}' for x in 'ABCDE']].fillna(0)
ws = df[[f'FR_{x}' for x in 'ABCDE']]
smiles_dict = {}
for i in range(1,6):
    m1 = i == np.sum(ws != 0, axis=1)
    m2 = (ws[m1] != 0).values
    ss = df.loc[m1, [f'SMILES_{x}' for x in 'ABCDE']]
    mf = ss.apply(lambda x: 'F' in x)
    for s, m in zip(ss.values, m2):
        print(i, '\t', np.sum(['F' in _s for _s in s[m]]), np.sum(['F' not in _s for _s in s[m]]), )
#    smiles_dict[i] = {'f':[s for s in ss if 'F' in s], 'c':[s for s in ss if 'F' not in s]}
    print()

1 	 0 1
1 	 0 1
1 	 0 1
1 	 1 0
1 	 1 0
1 	 1 0
1 	 1 0
1 	 0 1
1 	 0 1

2 	 1 1
2 	 1 1
2 	 1 1
2 	 1 1
2 	 0 2
2 	 0 2
2 	 1 1
2 	 1 1
2 	 1 1
2 	 1 1
2 	 1 1
2 	 1 1
2 	 1 1
2 	 1 1
2 	 0 2
2 	 0 2
2 	 1 1

3 	 1 2
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 1 2
3 	 0 3
3 	 0 3
3 	 0 3
3 	 0 3
3 	 1 2
3 	 1 2
3 	 1 2
3 	 0 3
3 	 0 3
3 	 1 2
3 	 1 2

4 	 1 3
4 	 1

In [80]:
f_mask = df[[f'SMILES_{x}' for x in 'ABCDE']].apply(lambda x: ['F' in _x for _x in x])
w_mask = df[[f'FR_{x}' for x in 'ABCDE']].fillna(0) != 0

In [92]:
cf = np.sum(f_mask.values & w_mask.values, axis=1)
cc = np.sum(~f_mask.values & w_mask.values, axis=1)
ca = np.sum(w_mask.values, axis=1)

In [98]:
ca[cf == 0]

array([4, 4, 1, 1, 1, 4, 4, 1, 1, 3, 2, 3, 3, 3, 2, 3, 4, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 2, 3, 3, 4, 2])