# Transformer

Application of transformers for battery properties prediction

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
from collections import defaultdict

import re
import ast

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [8]:
df = pd.read_csv('/content/drive/MyDrive/clean_df.csv')
df['parsed_name'] = df['parsed_name'].apply(ast.literal_eval)
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Extracted_name,Capacity,Conductivity,Coulombic Efficiency,Energy,Voltage,parsed_name,elements_cnt,...,Te_fraction,Ir_fraction,Cr_fraction,Ba_fraction,Cs_fraction,La_fraction,As_fraction,Am_fraction,Fe_fraction,Au_fraction
0,14,( 90PEO:10Zn(CF3SO3)2 ) + 5 ZnO,"[{'C': '2.0', 'F': '6.0', 'S': '2.0', 'O': '6....",,1.8e-05,,,,"[{'C': '2.0', 'F': '6.0', 'S': '2.0', 'O': '6....",1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,36,"(5,5′-bisphenylethynyl-2,2′-bipyridyl)Re(CO)3Cl","[{'C': '3.0', 'O': '3.0', 'Re': '1.0', 'Cl': '...",,,,,1.95,"[{'C': '3.0', 'O': '3.0', 'Re': '1.0', 'Cl': '...",1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39,(BaCa)(ZrTi)O3,"[{'Ba': '1.0', 'Ca': '1.0', 'Zr': '1.0', 'Ti':...",,,,,3.2,"[{'Ba': '1.0', 'Ca': '1.0', 'Zr': '1.0', 'Ti':...",1,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40,(BiO)2CO3,"[{'Bi': '2.0', 'O': '5.0', 'C': '1.0'}]",,,37.5,732.865792,,"[{'Bi': '2.0', 'O': '5.0', 'C': '1.0'}]",1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,42,(BiO)4CO3(OH)2,"[{'Bi': '4.0', 'O': '9.0', 'H': '2.0', 'C': '1...",210.0,,,,1.1,"[{'Bi': '4.0', 'O': '9.0', 'H': '2.0', 'C': '1...",1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Name tokenization

Idea: turn the name string into a list of tokens based on which we will calculate the embedding for the transformer model

Example: "(BiO)4CO3(OH)2formula" → ["(", "Bi", "O", ")", "4", "C", "O", "3", "(", "OH", ")", "2"]


First, lets get all the unique element names from our formulas

In [9]:
# Extracting all unique elements:
unique_elements = set()

for value in df['parsed_name']:
    for composition in value:
        unique_elements.update(composition.keys())

# Show all unique elements found
print(f"Unique elements identified ({len(unique_elements)} total):")
print(unique_elements)

Unique elements identified (88 total):
{'S', 'Bi', 'Al', 'U', 'Cr', 'Rb', 'Hs', 'Re', 'Pa', 'N', 'W', 'Sm', 'Sc', 'Nb', 'In', 'Ru', 'Ni', 'Hg', 'Ir', 'Yb', 'Cs', 'C', 'La', 'Pt', 'Te', 'Sg', 'Pr', 'Na', 'P', 'Nd', 'Be', 'Sr', 'O', 'Zn', 'Ar', 'Po', 'K', 'Gd', 'Br', 'Er', 'Li', 'Y', 'Cm', 'As', 'H', 'Ga', 'Am', 'Ca', 'V', 'Ac', 'At', 'Ce', 'Cd', 'Os', 'I', 'Hf', 'Si', 'Ta', 'Ge', 'Pu', 'Pm', 'Au', 'Ho', 'Ti', 'B', 'Dy', 'Cl', 'Ba', 'Ra', 'Cn', 'Mn', 'Cu', 'Sn', 'Se', 'Mo', 'Es', 'Pb', 'Fe', 'Pd', 'Ag', 'Zr', 'Co', 'Tb', 'F', 'Rf', 'Rh', 'Mg', 'Sb'}


In [10]:
def tokenize_formula(formula):
    # Pattern to match element symbols, numbers, and parentheses
    token_pattern = r'(\()|(\))|([A-Z][a-z]?)|(\d+)'

    # Compile the pattern
    token_re = re.compile(token_pattern)

    tokens = []
    idx = 0

    while idx < len(formula):
        match = token_re.match(formula, idx)

        if match:
            token = match.group()
            # Check if it's a valid element, digit, or parentheses
            if token in ('(', ')') or token.isdigit() or token in unique_elements:
                tokens.append(token)
            # Else it's an invalid chemical token and skipped
            idx = match.end()
        else:
            # Skip unrelated characters
            idx += 1

    return tokens

In [11]:
df['tokenized_formula'] = df['Name'].apply(tokenize_formula)

Gather all unique tokens

In [12]:
# Extracting all unique tokens:
unique_tokens = defaultdict(int)

for value in df['tokenized_formula']:
    for token in value:
        unique_tokens[token] += 1

# Show all unique tokens found
print(f"Unique tokens identified ({len(unique_tokens)} total):")

top_tokens = sorted(unique_tokens.items(), key=lambda x: x[1], reverse=True)
top_tokens[:10]

Unique tokens identified (263 total):


[('O', 1591),
 ('2', 1500),
 ('4', 865),
 ('3', 767),
 ('C', 713),
 ('Li', 687),
 ('P', 605),
 ('(', 594),
 (')', 585),
 ('H', 497)]

In [13]:
token2id = {token: num + 1 for num, token in enumerate(unique_tokens.keys())}
token2id['<pad>'] = 0

id2token = {index: token for token, index in token2id.items()}

In [14]:
def encode_sequence(seq):
    return [token2id[token] for token in seq]

def decode_sequence(seq):
    return [id2token[index] for index in seq]

## Transformer model for Voltage prediction

In [22]:
target = 'Voltage'
filtered_df = df[~df[target].isna()]

X, y = filtered_df['tokenized_formula'], filtered_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model definition

In [23]:
class ChemTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, output_dim, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=num_layers)
        self.regressor = nn.Linear(embed_dim, output_dim)

    def forward(self, x, mask=None):
        x = self.embedding(x)
        x = self.transformer(x, src_key_padding_mask=mask)
        x = x.mean(dim=1)  # Mean pooling over sequence length
        return self.output_layer(x)

# # Example instantiation
# model = ChemTransformer(
#     vocab_size=len(token2id),
#     embed_dim=128,
#     num_heads=4,
#     num_layers=3,
#     output_dim=1  # for Capacity, Conductivity, Efficiency, Voltage, Energy
# )

In [24]:
model = ChemTransformer(vocab_size=264, embed_dim=128, num_heads=4, num_layers=3, output_dim=1);

In [26]:
class ChemicalDataset(Dataset):
    def __init__(self, formulas, targets, vocab):
        self.formulas = formulas
        self.targets = targets
        self.vocab = vocab

    def encode(self, formula):
        return torch.tensor([self.vocab[tok] for tok in formula])

    def __len__(self):
        return len(self.formulas)

    def __getitem__(self, idx):
        x = torch.tensor(self.encode(self.formulas[idx]))
        y = torch.tensor(self.targets[idx]).float()
        return x, y

# Collate function to pad sequences
# def collate_fn(batch):
#     x_batch, y_batch = zip(*batch)
#     x_batch_padded = nn.utils.rnn.pad_sequence(x_batch, batch_first=True, padding_value=0)
#     y_batch_tensor = torch.stack(y_batch)
#     return x_batch_padded, y_batch_tensor

# Usage
dataset = ChemicalDataset(X, y, token2id)
loader = DataLoader(dataset, batch_size=16, shuffle=True)


In [34]:
loader.dataset[2]

  x = torch.tensor(self.encode(self.formulas[idx]))


(tensor([ 1, 16, 17, 11,  1, 18, 19, 11,  4,  9]), tensor(3.2000))

In [27]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()
epochs = 50

model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch_x, batch_y in loader:
        optimizer.zero_grad()
        mask = (batch_x == 0)  # padding mask
        predictions = model(batch_x, mask=mask)
        loss = criterion(predictions, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/len(loader):.4f}")


  x = torch.tensor(self.encode(self.formulas[idx]))


KeyError: 534