In [5]:
import pandas as pd
import numpy as np

# Define special tokens
SPECIAL_TOKENS = {
    "<PAD>": 0,
    "<SOS>": 1,
    "<EOS>": 2
}

# Load the CSV file
df = pd.read_csv('perovskite_database_query.csv')

# Inspect the first few rows
# print("Initial DataFrame:")
# print(df.head())

# Check for missing values in relevant columns
# print("\nMissing values in 'Cell_stack_sequence':", df['Cell_stack_sequence'].isna().sum())
# print("Missing values in 'Perovskite_composition_long_form':", df['Perovskite_composition_long_form'].isna().sum())

# Handle missing 'Cell_stack_sequence'
# Option 1: Remove rows with missing sequences
# df = df.dropna(subset=['Cell_stack_sequence']).reset_index(drop=True)

# Option 2: Fill missing sequences with '<PAD>'
df['Cell_stack_sequence'] = df['Cell_stack_sequence'].fillna('<PAD>')

# Handle missing 'Perovskite_composition_long_form'
# Option 1: Remove rows with missing compositions
# df = df.dropna(subset=['Perovskite_composition_long_form']).reset_index(drop=True)

# Option 2: Fill missing compositions with 'Perovskite' to leave the sequence unchanged
df['Perovskite_composition_long_form'] = df['Perovskite_composition_long_form'].fillna('Perovskite')

# Split the 'Cell_stack_sequence' into lists of layers
df['Cell_stack_sequence'] = df['Cell_stack_sequence'].apply(lambda x: [layer.strip() for layer in x.split('|')])

# Function to build layer_to_id dictionary
def build_layer_to_id(sequences, special_tokens):
    unique_layers = set()
    for seq in sequences:
        unique_layers.update(seq)
    
    # Remove special tokens if present in unique_layers
    unique_layers -= set(special_tokens.keys())
    
    # Sort the layers for consistency
    unique_layers = sorted(list(unique_layers))
    
    # Assign unique IDs starting after special tokens
    layer_to_id = {token: idx for token, idx in special_tokens.items()}
    current_id = len(special_tokens)
    
    for layer in unique_layers:
        if layer not in layer_to_id:
            layer_to_id[layer] = current_id
            current_id += 1
    
    return layer_to_id

# Build the dictionary using the 'Cell_stack_sequence' column
layer_to_id = build_layer_to_id(df['Cell_stack_sequence'].tolist(), SPECIAL_TOKENS)
id_to_layer = {v: k for k, v in layer_to_id.items()}
vocab_size = len(layer_to_id)

print("\nLayer to ID Mapping:")
for layer, idx in layer_to_id.items():
    print(f"{layer}: {idx}")

# Define a function to replace 'Perovskite' with its composition
def replace_perovskite_layer(sequence, composition):
    return [composition if layer == 'Perovskite' else layer for layer in sequence]

# Apply the replacement to create a new column
df['Cell_stack_sequence_enhanced'] = df.apply(
    lambda row: replace_perovskite_layer(row['Cell_stack_sequence'], row['Perovskite_composition_long_form']),
    axis=1
)

# Join the list back into a string with ' | ' separator
df['Cell_stack_sequence_enhanced'] = df['Cell_stack_sequence_enhanced'].apply(lambda x: ' | '.join(x))

# Display a sample of the new column
print("\nSample Enhanced Sequences:")
# print(df[['Cell_stack_sequence_enhanced']].head())
print(df['Cell_stack_sequence_enhanced'])

# Save the enhanced DataFrame to a new CSV
df.to_csv('perovskite_database_query_enhanced.csv', index=False)

print("\nEnhanced CSV has been saved as 'perovskite_database_query_enhanced.csv'.")



  df = pd.read_csv('perovskite_database_query.csv')



Layer to ID Mapping:
<PAD>: 0
<SOS>: 1
<EOS>: 2
(1,6-di{3-[2-(4- methylphenyl)vinyl]carbazol-9-yl}hexane: 3
(10-butyl-3,7-diphenylphenoxazine): 4
(2Z,2'Z)-2,2'-(((2,4-dimethylphenyl) azanediyl) bis([1,1'-biphenyl]-4',4-diyl)) bis(3-(4-(diphenylamino) phenyl) acrylonitrile: 5
(2Z,2'Z)-2,2'-((10-(2-ethylhexyl)-10H-phenothiazine-3,7-diyl) bis(4,1- phenylene)) bis(3-(4-(diphenylamino) phenyl) acrylonitrile: 6
(2Z,2′Z)-3,3′- (5,5′-(2,7-dioctyl-1,3,6,8-tetraoxo-1,2,3,6,7,8-hexahydrobenzo [lmn][3,8]phenanthroline-4,9-diyl)bis (thiophene-5,2-diyl))bis(2-(4-(trifluoromethyl)phenyl) acrylonitrile): 7
(2Z,2′Z)-3,3′-(5,5′-(2,7-dioctyl-1,3,6,8-tetraoxo-1,2,3,6,7,8-hexahydrobenzo[lmn][3,8] phenanthroline-4,9-diyl)bis(thiophene-5,2-diyl))bis(2-(3,5-bis (trifluoroomethyl)phenyl) acrylonitrile): 8
(3-Aminopropyl)trimethoxysilane: 9
(4AMP)I2: 10
(BMPA-EDOT)3-TPA: 11
(CH3)3SPbI3: 12
(DTYM-NDI-DTYA)2: 13
(EMIM)PF6: 14
(N2,N2,N2',N2',N7,N7,N7',N7'-octakis(4-methoxyphenyl)spiro[fluorene-9,9'-xanthene]-2,2'

# The best code:

In [35]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import copy

# ----------------------------
# 1. Define Special Tokens
# ----------------------------
SPECIAL_TOKENS = {
    "<PAD>": 0,
    "<SOS>": 1,
    "<EOS>": 2
}

# ----------------------------
# 2. Load and Inspect Data
# ----------------------------
# Load the CSV file
df = pd.read_csv('perovskite_database_query.csv')

# # Display the first few rows
# print("Initial DataFrame:")
# print(df.head())

# Check for missing values in relevant columns
# print("\nMissing values in 'Cell_stack_sequence':", df['Cell_stack_sequence'].isna().sum())
# print("Missing values in 'Perovskite_composition_long_form':", df['Perovskite_composition_long_form'].isna().sum())

# Handle missing 'Cell_stack_sequence'
# Option 1: Remove rows with missing sequences
# df = df.dropna(subset=['Cell_stack_sequence']).reset_index(drop=True)

# Option 2: Fill missing sequences with '<PAD>'
df['Cell_stack_sequence'] = df['Cell_stack_sequence'].fillna('<PAD>')

# Handle missing 'Perovskite_composition_long_form'
# Option 1: Remove rows with missing compositions
# df = df.dropna(subset=['Perovskite_composition_long_form']).reset_index(drop=True)

# Option 2: Fill missing compositions with 'Perovskite' to leave the sequence unchanged
df['Perovskite_composition_long_form'] = df['Perovskite_composition_long_form'].fillna('Perovskite')

# ----------------------------
# 3. Enhance 'Cell_stack_sequence'
# ----------------------------
def replace_perovskite_layer(sequence, composition):
    """
    Replaces all occurrences of 'Perovskite' in the sequence with the provided composition.

    Args:
        sequence (List[str]): List of layer tokens.
        composition (str): The chemical/structural composition to replace 'Perovskite'.

    Returns:
        List[str]: Modified sequence with 'Perovskite' replaced.
    """
    return [composition if layer == 'Perovskite' else layer for layer in sequence]

# Split the 'Cell_stack_sequence' into lists of layers
df['Cell_stack_sequence'] = df['Cell_stack_sequence'].apply(lambda x: [layer.strip() for layer in x.split('|')])

# Apply the replacement to create a new column
df['Cell_stack_sequence_enhanced'] = df.apply(
    lambda row: replace_perovskite_layer(row['Cell_stack_sequence'], row['Perovskite_composition_long_form']),
    axis=1
)

# Join the list back into a string with ' | ' separator
df['Cell_stack_sequence_enhanced'] = df['Cell_stack_sequence_enhanced'].apply(lambda x: ' | '.join(x))

# # Display a sample of the new column
# print("\nSample Enhanced Sequences:")
# print(df[['Cell_stack_sequence_enhanced']].head())

# ----------------------------
# 4. Build layer_to_id Dictionary from Enhanced Sequences
# ----------------------------
def build_layer_to_id(sequences, special_tokens):
    """
    Builds a dictionary mapping each unique layer to a unique integer ID.

    Args:
        sequences (List[str]): List of layer sequences as strings.
        special_tokens (dict): Dictionary of special tokens and their IDs.

    Returns:
        layer_to_id (dict): Mapping from layer names to unique IDs.
    """
    unique_layers = set()
    for seq in sequences:
        layers = [layer.strip() for layer in seq.split('|')]
        unique_layers.update(layers)
    
    # Remove special tokens if present in unique_layers
    unique_layers -= set(special_tokens.keys())
    
    # Sort the layers for consistency
    unique_layers = sorted(list(unique_layers))
    
    # Assign unique IDs starting after special tokens
    layer_to_id = {token: idx for token, idx in special_tokens.items()}
    current_id = len(special_tokens)
    
    for layer in unique_layers:
        if layer not in layer_to_id:
            layer_to_id[layer] = current_id
            current_id += 1
    
    return layer_to_id

# Build the dictionary using the 'Cell_stack_sequence_enhanced' column
layer_to_id = build_layer_to_id(df['Cell_stack_sequence_enhanced'].tolist(), SPECIAL_TOKENS)
id_to_layer = {v: k for k, v in layer_to_id.items()}
vocab_size = len(layer_to_id)

# print("\nLayer to ID Mapping:")
# for layer, idx in layer_to_id.items():
#     print(f"{layer}: {idx}")

# ----------------------------
# 5. Define Layer Features
# ----------------------------
feature_dim = 5  # Example feature dimensionality

# Assign a feature vector to each layer
np.random.seed(42)  # For reproducibility
layer_features = {}
for layer in layer_to_id.keys():
    if layer in ["<PAD>", "<SOS>", "<EOS>"]:
        layer_features[layer] = [0.0] * feature_dim
    else:
        # Replace with actual descriptors as needed
        layer_features[layer] = np.random.rand(feature_dim).tolist()

# print("\nLayer Features:")
# for layer, features in list(layer_features.items())[:100]:  # Display first 5 for brevity
#     print(f"{layer}: {features}")

# ----------------------------
# 6. Define the Dataset Class
# ----------------------------
class PerovskiteDataset(Dataset):
    def __init__(self, sequences, layer_to_id, layer_features, max_len=None):
        """
        Args:
            sequences (List[str]): List of enhanced layer sequences as strings.
            layer_to_id (dict): Mapping from layer names to integer IDs.
            layer_features (dict): Mapping from layer names to feature vectors.
            max_len (int, optional): Maximum sequence length. If None, uses the longest sequence.
        """
        self.sequences = sequences
        self.layer_to_id = layer_to_id
        self.layer_features = layer_features
        if not max_len:
            self.max_len = max(len(seq.split('|')) + 2 for seq in self.sequences)  # +2 for <SOS> and <EOS>
        else:
            self.max_len = max_len
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        # Get the sequence and split into layers
        seq = self.sequences[idx]
        layers = [layer.strip() for layer in seq.split('|')]
        
        # Convert layers to IDs
        seq_ids = [self.layer_to_id.get(layer, self.layer_to_id["<PAD>"]) for layer in layers]
        
        # Add <SOS> and <EOS> if not already present
        if seq_ids[0] != self.layer_to_id["<SOS>"]:
            seq_ids = [self.layer_to_id["<SOS>"]] + seq_ids
        if seq_ids[-1] != self.layer_to_id["<EOS>"]:
            seq_ids = seq_ids + [self.layer_to_id["<EOS>"]]
        
        # Update max_len if necessary
        if len(seq_ids) > self.max_len:
            self.max_len = len(seq_ids)
        
        # Pad sequences
        padded_seq_ids = seq_ids + [self.layer_to_id["<PAD>"]] * (self.max_len - len(seq_ids))
        
        # Create input and target sequences
        input_ids = padded_seq_ids[:-1]
        target_ids = padded_seq_ids[1:]
        
        # Extract and pad features
        features = [self.layer_features[id_to_layer[id]] for id in input_ids]
        padded_features = features + [[0.0] * feature_dim] * (self.max_len - 1 - len(features))
        
        # Convert to tensors
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        target_ids = torch.tensor(target_ids, dtype=torch.long)
        features = torch.tensor(padded_features, dtype=torch.float)
        
        return {
            "input_ids": input_ids,
            "target_ids": target_ids,
            "features": features
        }

# ----------------------------
# 7. Create DataLoader
# ----------------------------
# Initialize the dataset
dataset = PerovskiteDataset(
    sequences=df['Cell_stack_sequence_enhanced'].tolist(),
    layer_to_id=layer_to_id,
    layer_features=layer_features
)

# Define DataLoader parameters
batch_size = 32
shuffle = True

# Create DataLoader with num_workers=0 to avoid multiprocessing issues
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=0,  # Set to 0 to disable multiprocessing
    pin_memory=True  # If using GPU
)

# ----------------------------
# 8. Define the Transformer-Based Model
# ----------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len, d_model)
        """
        x = x + self.pe[:, :x.size(1), :]
        return x

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, feature_dim, nhead, num_encoder_layers, dim_feedforward, dropout=0.1, pad_idx=0):
        super(TransformerModel, self).__init__()
        
        # Token Embeddings
        self.token_embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        
        # Positional Encoding initialized with embed_dim + 32
        self.pos_encoder = PositionalEncoding(embed_dim + 32)
        
        # Feature MLP
        self.feature_mlp = nn.Sequential(
            nn.Linear(feature_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 32)
        )
        
        # Transformer Encoder with d_model = embed_dim + 32
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embed_dim + 32,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_encoder_layers)
        
        # Output Layer
        self.fc_out = nn.Linear(embed_dim + 32, vocab_size)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input_ids, features, src_key_padding_mask):
        """
        Args:
            input_ids: Tensor of shape (batch_size, seq_len)
            features: Tensor of shape (batch_size, seq_len, feature_dim)
            src_key_padding_mask: Tensor of shape (batch_size, seq_len), True for padding tokens
        Returns:
            logits: Tensor of shape (batch_size, seq_len, vocab_size)
        """
        # Embed tokens
        token_embeds = self.token_embedding(input_ids)  # (batch, seq, embed_dim)
        
        # Process features
        feature_embeds = self.feature_mlp(features)  # (batch, seq, 32)
        
        # Concatenate embeddings
        combined = torch.cat((token_embeds, feature_embeds), dim=2)  # (batch, seq, embed_dim + 32)
        
        # Add positional encoding
        combined = self.pos_encoder(combined)  # (batch, seq, embed_dim + 32)
        
        # Apply dropout
        combined = self.dropout(combined)
        
        # Prepare for Transformer: transpose to (seq, batch, embed_dim + 32)
        combined = combined.transpose(0, 1)  # (seq, batch, embed_dim + 32)
        
        # Pass through Transformer Encoder
        encoded = self.transformer_encoder(combined, src_key_padding_mask=src_key_padding_mask)  # (seq, batch, embed_dim + 32)
        
        # Transpose back to (batch, seq, embed_dim + 32)
        encoded = encoded.transpose(0, 1)  # (batch, seq, embed_dim + 32)
        
        # Output layer
        logits = self.fc_out(encoded)  # (batch, seq, vocab_size)
        
        return logits

# ----------------------------
# 9. Initialize Model, Loss, Optimizer
# ----------------------------
embed_dim = 128
nhead = 8
num_encoder_layers = 4
dim_feedforward = 256
dropout = 0.1
learning_rate = 1e-4
pad_idx = layer_to_id["<PAD>"]
num_epochs = 30

model = TransformerModel(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    feature_dim=feature_dim,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    pad_idx=pad_idx
)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# ----------------------------
# 10. Training Loop with Validation and Early Stopping
# ----------------------------
# Split the dataset into training and validation sets
train_sequences, val_sequences = train_test_split(
    df['Cell_stack_sequence_enhanced'].tolist(),
    test_size=0.2,
    random_state=42
)

# Create separate datasets
train_dataset = PerovskiteDataset(
    sequences=train_sequences,
    layer_to_id=layer_to_id,
    layer_features=layer_features
)

val_dataset = PerovskiteDataset(
    sequences=val_sequences,
    layer_to_id=layer_to_id,
    layer_features=layer_features
)

# Create DataLoaders
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,  # Set to 0 to disable multiprocessing
    pin_memory=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,  # Set to 0 to disable multiprocessing
    pin_memory=True
)

# Define a validation loop
def validate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            target_ids = batch["target_ids"].to(device)
            features = batch["features"].to(device)
            
            # Create src_key_padding_mask
            src_key_padding_mask = (input_ids == pad_idx)
            
            # Forward pass
            logits = model(input_ids, features, src_key_padding_mask=src_key_padding_mask)
            
            # Reshape for loss computation
            logits = logits.view(-1, vocab_size)
            targets = target_ids.view(-1)
            
            # Compute loss
            loss = criterion(logits, targets)
            total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss
## COMMENT OUT SO I DONT HAVE TO RETRAIN THE MODEL
# # Implement Early Stopping    
# best_val_loss = float('inf')
# best_model_wts = copy.deepcopy(model.state_dict())
# patience = 5
# trigger_times = 0

# for epoch in range(num_epochs):
#     model.train()
#     epoch_loss = 0
#     progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
#     for batch in progress_bar:
#         input_ids = batch["input_ids"].to(device)
#         target_ids = batch["target_ids"].to(device)
#         features = batch["features"].to(device)
        
#         # Create src_key_padding_mask
#         src_key_padding_mask = (input_ids == pad_idx)
        
#         # Forward pass
#         logits = model(input_ids, features, src_key_padding_mask=src_key_padding_mask)
        
#         # Reshape for loss computation
#         logits = logits.view(-1, vocab_size)
#         targets = target_ids.view(-1)
        
#         # Compute loss
#         loss = criterion(logits, targets)
        
#         # Backward pass and optimization
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
        
#         # Accumulate loss
#         epoch_loss += loss.item()
        
#         # Update progress bar
#         progress_bar.set_postfix({"Loss": loss.item()})
    
#     # Calculate average training loss
#     avg_train_loss = epoch_loss / len(train_dataloader)
    
#     # Validate the model
#     val_loss = validate(model, val_dataloader, criterion, device)
    
#     print(f"Epoch [{epoch+1}/{num_epochs}] Training Loss: {avg_train_loss:.4f} | Validation Loss: {val_loss:.4f}")
    
#     # Check for improvement
#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         best_model_wts = copy.deepcopy(model.state_dict())
#         trigger_times = 0
#         # Save the best model
#         torch.save(model.state_dict(), 'best_transformer_perovskite_model.pth')
#         print("Validation loss decreased. Saving the best model.")
#     else:
#         trigger_times += 1
#         print(f"No improvement in validation loss for {trigger_times} epochs.")
#         if trigger_times >= patience:
#             print("Early stopping triggered.")
#             break

# # Load the best model weights
# model.load_state_dict(best_model_wts)
# print("\nTraining complete. Best model loaded.")

# ----------------------------
# 11. Generate New Sequences
# ----------------------------
def generate_sequence_greedy(model, layer_to_id, id_to_layer, layer_features, max_len=15, device='cpu'):
    """
    Generates a sequence using greedy decoding.

    Args:
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        layer_features (dict): Mapping from layer names to feature vectors.
        max_len (int): Maximum length of the generated sequence.
        device (torch.device): Device to run the model on.

    Returns:
        List[str]: Generated sequence of layers.
    """
    model.eval()
    generated_ids = [layer_to_id["<SOS>"]]
    generated_features = [layer_features["<SOS>"]]
    
    with torch.no_grad():
        for _ in range(max_len):
            # Prepare input tensors
            input_tensor = torch.tensor([generated_ids], dtype=torch.long).to(device)        # (1, seq)
            feature_tensor = torch.tensor([generated_features], dtype=torch.float).to(device)  # (1, seq, feature_dim)
            
            # Create src_key_padding_mask
            src_key_padding_mask = (input_tensor == layer_to_id["<PAD>"])  # (1, seq)
            
            # Get logits from the model
            logits = model(input_tensor, feature_tensor, src_key_padding_mask=src_key_padding_mask)  # (1, seq, vocab_size)
            
            # Get the last token's logits
            last_logits = logits[0, -1, :]  # (vocab_size)
            
            # Apply softmax to get probabilities
            probabilities = torch.softmax(last_logits, dim=0).cpu().numpy()
            
            # Select the token with the highest probability
            next_token_id = np.argmax(probabilities)
            
            # Append to generated_ids and generated_features
            generated_ids.append(next_token_id)
            generated_features.append(layer_features[id_to_layer[next_token_id]])
            
            # Stop if <EOS> is generated
            if next_token_id == layer_to_id["<EOS>"]:
                break
    
    # Convert generated IDs to layer names
    generated_sequence = [id_to_layer[id] for id in generated_ids]
    return generated_sequence

# Example usage

generated_seq = generate_sequence_greedy(
    model=model,
    layer_to_id=layer_to_id,
    id_to_layer=id_to_layer,
    layer_features=layer_features,
    max_len=15,
    device=device
)



print("\nGenerated Sequence (Greedy):")
print(" | ".join(generated_seq))

# # ----------------------------
# # 12. Optional: Visualize Embeddings with t-SNE
# # ----------------------------
# def visualize_embeddings_tsne(model, layer_to_id, id_to_layer, device='cpu'):
#     """
#     Visualizes the token embeddings using t-SNE.

#     Args:
#         model (nn.Module): Trained Transformer model.
#         layer_to_id (dict): Mapping from layer names to IDs.
#         id_to_layer (dict): Mapping from IDs to layer names.
#         device (torch.device): Device to run the model on.
#     """
#     model.eval()
#     with torch.no_grad():
#         # Get embeddings for all layers
#         layer_ids = torch.arange(vocab_size).to(device)  # (vocab_size)
#         embeddings = model.token_embedding(layer_ids)    # (vocab_size, embed_dim)
#         embeddings = embeddings.cpu().numpy()
    
#     # Apply t-SNE
#     tsne = TSNE(n_components=2, random_state=42)
#     embeds_2d = tsne.fit_transform(embeddings)
    
#     # Plot
#     plt.figure(figsize=(12, 10))
#     for idx, layer in id_to_layer.items():
#         plt.scatter(embeds_2d[idx, 0], embeds_2d[idx, 1], label=layer, marker='o')
#         plt.text(embeds_2d[idx, 0]+0.2, embeds_2d[idx, 1]+0.2, layer, fontsize=9)
#     plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
#     plt.title("Layer Embeddings Visualization with t-SNE")
#     plt.xlabel("t-SNE Dimension 1")
#     plt.ylabel("t-SNE Dimension 2")
#     plt.grid(True)
#     plt.show()

# # Call the visualization function
# visualize_embeddings_tsne(model, layer_to_id, id_to_layer, device=device)


  df = pd.read_csv('perovskite_database_query.csv')



Generated Sequence (Greedy):
<SOS> | MAPb0.99Sr0.01I3 | Carbon-nt | 2-MP | LiCoO2 | FA0.02MA0.98PbI3 | EA2MA2Pb3I10 | S:DIB | Ag, | TiO2-nanoplatelets | FA0.94MA0.6PbBr0.06I2.94 | CsPbBr2I | SnO2-np; TiO2-np; MXene | (PDMA)PbI4 | FA0.02MA0.98PbI3 | PTTI-2




# This generates a sequence but its not great

In [19]:
import torch
import torch.nn as nn
import numpy as np

# ----------------------------
# 12. Load Trained Model and Generate Sequences
# ----------------------------

# Initialize the model architecture with the same hyperparameters used during training
model = TransformerModel(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    feature_dim=feature_dim,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    pad_idx=pad_idx
)

# Load trained weights
model.load_state_dict(torch.load('best_transformer_perovskite_model.pth', map_location=device))

# Set device and move the model to the appropriate device
model.to(device)
model.eval()

# ----------------------------
# 13. Define Generation Functions
# ----------------------------

def generate_sequence_with_min_length(model, layer_to_id, id_to_layer, layer_features, max_len=15, min_len=3, device='cpu', temperature=1.0):
    """
    Generates a sequence using greedy decoding with temperature scaling and minimum length constraint.

    Args:
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        layer_features (dict): Mapping from layer names to feature vectors.
        max_len (int): Maximum length of the generated sequence.
        min_len (int): Minimum number of layers before <EOS> can be generated.
        device (torch.device): Device to run the model on.
        temperature (float): Scaling factor for logits.

    Returns:
        List[str]: Generated sequence of layers.
    """
    model.eval()
    generated_ids = [layer_to_id["<SOS>"]]
    generated_features = [layer_features["<SOS>"]]

    with torch.no_grad():
        for step in range(max_len):
            # Prepare input tensors
            input_tensor = torch.tensor([generated_ids], dtype=torch.long).to(device)        # (1, seq)
            feature_tensor = torch.tensor([generated_features], dtype=torch.float).to(device)  # (1, seq, feature_dim)
            
            # Create src_key_padding_mask
            src_key_padding_mask = (input_tensor == layer_to_id["<PAD>"])  # (1, seq)
            
            # Get logits from the model
            logits = model(input_tensor, feature_tensor, src_key_padding_mask=src_key_padding_mask)  # (1, seq, vocab_size)
            
            # Get the last token's logits
            last_logits = logits[0, -1, :]  # (vocab_size)
            
            # Apply temperature scaling
            scaled_logits = last_logits / temperature
            
            # Apply softmax to get probabilities
            probabilities = torch.softmax(scaled_logits, dim=0).cpu().numpy()
            
            # If minimum length not reached, mask out <EOS>
            if len(generated_ids) < min_len + 1:  # +1 because <SOS> is already included
                probabilities[layer_to_id["<EOS>"]] = 0.0
                probabilities /= probabilities.sum()  # Re-normalize
            
            # Select the token with the highest probability
            next_token_id = np.argmax(probabilities)
            
            # Append to generated_ids and generated_features
            generated_ids.append(next_token_id)
            generated_features.append(layer_features[id_to_layer[next_token_id]])
            
            # Stop if <EOS> is generated and minimum length is met
            if next_token_id == layer_to_id["<EOS>"] and len(generated_ids) >= min_len + 1:
                break

    # Convert generated IDs to layer names
    generated_sequence = [id_to_layer[id] for id in generated_ids]
    return generated_sequence

def generate_multiple_sequences(num_sequences, model, layer_to_id, id_to_layer, layer_features, max_len=15, min_len=3, device='cpu', temperature=1.0):
    """
    Generates multiple sequences and prints them.
    
    Args:
        num_sequences (int): Number of sequences to generate.
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        layer_features (dict): Mapping from layer names to feature vectors.
        max_len (int): Maximum length of the generated sequence.
        min_len (int): Minimum number of layers before <EOS> can be generated.
        device (torch.device): Device to run the model on.
        temperature (float): Scaling factor for logits.
    """
    for i in range(num_sequences):
        seq = generate_sequence_with_min_length(
            model=model,
            layer_to_id=layer_to_id,
            id_to_layer=id_to_layer,
            layer_features=layer_features,
            max_len=max_len,
            min_len=min_len,
            device=device,
            temperature=temperature
        )
        print(f"\nGenerated Sequence {i + 1}:")
        print(" | ".join(seq))

# ----------------------------
# 14. Generate and View Sequences
# ----------------------------
# Generate 5 sequences to see the model's behavior
generate_multiple_sequences(
    num_sequences=5,
    model=model,
    layer_to_id=layer_to_id,
    id_to_layer=id_to_layer,
    layer_features=layer_features,
    max_len=15,
    min_len=3,
    device=device,
    temperature=0.8  # Adjust temperature as desired
)



Generated Sequence 1:
<SOS> | Unknown | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>

Generated Sequence 2:
<SOS> | Unknown | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>

Generated Sequence 3:
<SOS> | Unknown | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>

Generated Sequence 4:
<SOS> | Unknown | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>

Generated Sequence 5:
<SOS> | Unknown | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>


  model.load_state_dict(torch.load('best_transformer_perovskite_model.pth', map_location=device))


# updated generation of sequences

In [25]:
import torch
import torch.nn as nn
import numpy as np
import warnings

# ----------------------------
# 12. Load Trained Model and Generate Sequences
# ----------------------------

# Initialize the model architecture with the same hyperparameters used during training
model = TransformerModel(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    feature_dim=feature_dim,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    pad_idx=pad_idx
)

# Check PyTorch version
torch_version = torch.__version__
print(f"PyTorch Version: {torch_version}")

# Suppress the specific FutureWarning if using a trusted model and PyTorch < 2.0
if torch_version < "2.0":
    warnings.filterwarnings(
        "ignore",
        category=FutureWarning,
        message="You are using `torch.load` with `weights_only=False`"
    )

# Load trained weights
try:
    # For PyTorch versions that support 'weights_only'
    model.load_state_dict(torch.load('best_transformer_perovskite_model.pth', map_location=device, weights_only=True))
except TypeError:
    # If 'weights_only' is not supported, load normally
    model.load_state_dict(torch.load('best_transformer_perovskite_model.pth', map_location=device))
    if torch_version < "2.0":
        print("Warning: 'weights_only' parameter not supported in this PyTorch version. Proceeding without it.")

# Set device and move the model to the appropriate device
model.to(device)
model.eval()

# ----------------------------
# 13. Define Generation Functions
# ----------------------------

def generate_sequence_with_min_length(
    model,
    layer_to_id,
    id_to_layer,
    layer_features,
    max_len=15,
    min_len=3,
    device='cpu',
    temperature=1.0
):
    """
    Generates a sequence using greedy decoding with temperature scaling and minimum length constraint.

    Args:
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        layer_features (dict): Mapping from layer names to feature vectors.
        max_len (int): Maximum length of the generated sequence.
        min_len (int): Minimum number of layers before <EOS> can be generated.
        device (torch.device): Device to run the model on.
        temperature (float): Scaling factor for logits.

    Returns:
        List[str]: Generated sequence of layers.
    """
    model.eval()
    generated_ids = [layer_to_id["<SOS>"]]
    generated_features = [layer_features["<SOS>"]]

    with torch.no_grad():
        for step in range(max_len):
            # Prepare input tensors
            input_tensor = torch.tensor([generated_ids], dtype=torch.long).to(device)        # (1, seq)
            feature_tensor = torch.tensor([generated_features], dtype=torch.float).to(device)  # (1, seq, feature_dim)
            
            # Create src_key_padding_mask
            src_key_padding_mask = (input_tensor == layer_to_id["<PAD>"])  # (1, seq)
            
            # Get logits from the model
            logits = model(input_tensor, feature_tensor, src_key_padding_mask=src_key_padding_mask)  # (1, seq, vocab_size)
            
            # Get the last token's logits
            last_logits = logits[0, -1, :]  # (vocab_size)
            
            # Apply temperature scaling
            scaled_logits = last_logits / temperature
            
            # Apply softmax to get probabilities
            probabilities = torch.softmax(scaled_logits, dim=0).cpu().numpy()
            
            # If minimum length not reached, mask out <EOS>
            if len(generated_ids) < min_len + 1:  # +1 because <SOS> is already included
                probabilities[layer_to_id["<EOS>"]] = 0.0
                probabilities /= probabilities.sum()  # Re-normalize
            
            # Select the token with the highest probability
            next_token_id = np.argmax(probabilities)
            
            # Handle unknown tokens by mapping to <PAD>
            if next_token_id not in id_to_layer:
                next_token_id = layer_to_id["<PAD>"]
            
            # Append to generated_ids and generated_features
            generated_ids.append(next_token_id)
            generated_features.append(layer_features["<PAD>"])
            
            # Stop if <EOS> is generated and minimum length is met
            if next_token_id == layer_to_id["<EOS>"] and len(generated_ids) >= min_len + 1:
                break

    # Convert generated IDs to layer names
    generated_sequence = [id_to_layer[id] for id in generated_ids]
    return generated_sequence

def generate_multiple_sequences(
    num_sequences,
    model,
    layer_to_id,
    id_to_layer,
    layer_features,
    max_len=15,
    min_len=3,
    device='cpu',
    temperature=1.0
):
    """
    Generates multiple sequences and prints them.
    
    Args:
        num_sequences (int): Number of sequences to generate.
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        layer_features (dict): Mapping from layer names to feature vectors.
        max_len (int): Maximum length of the generated sequence.
        min_len (int): Minimum number of layers before <EOS> can be generated.
        device (torch.device): Device to run the model on.
        temperature (float): Scaling factor for logits.
    """
    for i in range(num_sequences):
        seq = generate_sequence_with_min_length(
            model=model,
            layer_to_id=layer_to_id,
            id_to_layer=id_to_layer,
            layer_features=layer_features,
            max_len=max_len,
            min_len=min_len,
            device=device,
            temperature=temperature
        )
        print(f"\nGenerated Sequence {i + 1}:")
        print(" | ".join(seq))

# ----------------------------
# 14. Generate and View Sequences
# ----------------------------
# Generate 5 sequences to see the model's behavior
generate_multiple_sequences(
    num_sequences=5,
    model=model,
    layer_to_id=layer_to_id,
    id_to_layer=id_to_layer,
    layer_features=layer_features,
    max_len=15,
    min_len=3,
    device=device,
    temperature=0.8  # Adjust temperature as desired
)


PyTorch Version: 2.5.1

Generated Sequence 1:
<SOS> | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>

Generated Sequence 2:
<SOS> | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>

Generated Sequence 3:
<SOS> | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>

Generated Sequence 4:
<SOS> | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>

Generated Sequence 5:
<SOS> | Unknown | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | Cs0.05FA0.79MA0.16PbBr0.45I2.55 | <EOS>


In [16]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import math
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
import copy

# ----------------------------
# 1. Define Special Tokens
# ----------------------------
SPECIAL_TOKENS = {
    "<PAD>": 0,
    "<SOS>": 1,
    "<EOS>": 2
}

# ----------------------------
# 2. Load and Inspect Data
# ----------------------------
# Load the CSV file
df = pd.read_csv('perovskite_database_query.csv')

# Display the first few rows
print("Initial DataFrame:")
print(df.head())

# Check for missing values in relevant columns
print("\nMissing values in 'Cell_stack_sequence':", df['Cell_stack_sequence'].isna().sum())
print("Missing values in 'Perovskite_composition_long_form':", df['Perovskite_composition_long_form'].isna().sum())

# Handle missing 'Cell_stack_sequence'
# Option 1: Remove rows with missing sequences
# df = df.dropna(subset=['Cell_stack_sequence']).reset_index(drop=True)

# Option 2: Fill missing sequences with '<PAD>'
df['Cell_stack_sequence'] = df['Cell_stack_sequence'].fillna('<PAD>')

# Handle missing 'Perovskite_composition_long_form'
# Option 1: Remove rows with missing compositions
# df = df.dropna(subset=['Perovskite_composition_long_form']).reset_index(drop=True)

# Option 2: Fill missing compositions with 'Perovskite' to leave the sequence unchanged
df['Perovskite_composition_long_form'] = df['Perovskite_composition_long_form'].fillna('Perovskite')

# ----------------------------
# 3. Enhance 'Cell_stack_sequence'
# ----------------------------
def replace_perovskite_layer(sequence, composition):
    """
    Replaces all occurrences of 'Perovskite' in the sequence with the provided composition.

    Args:
        sequence (List[str]): List of layer tokens.
        composition (str): The chemical/structural composition to replace 'Perovskite'.

    Returns:
        List[str]: Modified sequence with 'Perovskite' replaced.
    """
    return [composition if layer == 'Perovskite' else layer for layer in sequence]

# Split the 'Cell_stack_sequence' into lists of layers
df['Cell_stack_sequence'] = df['Cell_stack_sequence'].apply(lambda x: [layer.strip() for layer in x.split('|')])

# Apply the replacement to create a new column
df['Cell_stack_sequence_enhanced'] = df.apply(
    lambda row: replace_perovskite_layer(row['Cell_stack_sequence'], row['Perovskite_composition_long_form']),
    axis=1
)

# Join the list back into a string with ' | ' separator
df['Cell_stack_sequence_enhanced'] = df['Cell_stack_sequence_enhanced'].apply(lambda x: ' | '.join(x))

# Display a sample of the new column
print("\nSample Enhanced Sequences:")
print(df[['Cell_stack_sequence_enhanced']].head())

# ----------------------------
# 4. Build layer_to_id Dictionary from Enhanced Sequences
# ----------------------------
def build_layer_to_id(sequences, special_tokens):
    """
    Builds a dictionary mapping each unique layer to a unique integer ID.

    Args:
        sequences (List[str]): List of layer sequences as strings.
        special_tokens (dict): Dictionary of special tokens and their IDs.

    Returns:
        layer_to_id (dict): Mapping from layer names to unique IDs.
    """
    unique_layers = set()
    for seq in sequences:
        layers = [layer.strip() for layer in seq.split('|')]
        unique_layers.update(layers)
    
    # Remove special tokens if present in unique_layers
    unique_layers -= set(special_tokens.keys())
    
    # Sort the layers for consistency
    unique_layers = sorted(list(unique_layers))
    
    # Assign unique IDs starting after special tokens
    layer_to_id = {token: idx for token, idx in special_tokens.items()}
    current_id = len(special_tokens)
    
    for layer in unique_layers:
        if layer not in layer_to_id:
            layer_to_id[layer] = current_id
            current_id += 1
    
    return layer_to_id

# Build the dictionary using the 'Cell_stack_sequence_enhanced' column
layer_to_id = build_layer_to_id(df['Cell_stack_sequence_enhanced'].tolist(), SPECIAL_TOKENS)
id_to_layer = {v: k for k, v in layer_to_id.items()}
vocab_size = len(layer_to_id)

print("\nLayer to ID Mapping:")
for layer, idx in layer_to_id.items():
    print(f"{layer}: {idx}")

# ----------------------------
# 5. Define Layer Features
# ----------------------------
feature_dim = 5  # Example feature dimensionality

# Assign a feature vector to each layer
np.random.seed(42)  # For reproducibility
layer_features = {}
for layer in layer_to_id.keys():
    if layer in ["<PAD>", "<SOS>", "<EOS>"]:
        layer_features[layer] = [0.0] * feature_dim
    else:
        # Replace with actual descriptors as needed
        layer_features[layer] = np.random.rand(feature_dim).tolist()

print("\nLayer Features:")
for layer, features in list(layer_features.items())[:5]:  # Display first 5 for brevity
    print(f"{layer}: {features}")

# ----------------------------
# 6. Define the Dataset Class
# ----------------------------
class PerovskiteDataset(Dataset):
    def __init__(self, sequences, layer_to_id, layer_features, max_len=None):
        """
        Args:
            sequences (List[str]): List of enhanced layer sequences as strings.
            layer_to_id (dict): Mapping from layer names to integer IDs.
            layer_features (dict): Mapping from layer names to feature vectors.
            max_len (int, optional): Maximum sequence length. If None, uses the longest sequence.
        """
        self.sequences = sequences
        self.layer_to_id = layer_to_id
        self.layer_features = layer_features
        if not max_len:
            self.max_len = max(len(seq.split('|')) + 2 for seq in self.sequences)  # +2 for <SOS> and <EOS>
        else:
            self.max_len = max_len
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        # Get the sequence and split into layers
        seq = self.sequences[idx]
        layers = [layer.strip() for layer in seq.split('|')]
        
        # Convert layers to IDs
        seq_ids = [self.layer_to_id.get(layer, self.layer_to_id["<PAD>"]) for layer in layers]
        
        # Add <SOS> and <EOS> if not already present
        if seq_ids[0] != self.layer_to_id["<SOS>"]:
            seq_ids = [self.layer_to_id["<SOS>"]] + seq_ids
        if seq_ids[-1] != self.layer_to_id["<EOS>"]:
            seq_ids = seq_ids + [self.layer_to_id["<EOS>"]]
        
        # Update max_len if necessary
        if len(seq_ids) > self.max_len:
            self.max_len = len(seq_ids)
        
        # Pad sequences
        padded_seq_ids = seq_ids + [self.layer_to_id["<PAD>"]] * (self.max_len - len(seq_ids))
        
        # Create input and target sequences
        input_ids = padded_seq_ids[:-1]
        target_ids = padded_seq_ids[1:]
        
        # Extract and pad features
        features = [self.layer_features[id_to_layer[id]] for id in input_ids]
        padded_features = features + [[0.0] * feature_dim] * (self.max_len - 1 - len(features))
        
        # Convert to tensors
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        target_ids = torch.tensor(target_ids, dtype=torch.long)
        features = torch.tensor(padded_features, dtype=torch.float)
        
        return {
            "input_ids": input_ids,
            "target_ids": target_ids,
            "features": features
        }

# ----------------------------
# 7. Create DataLoader
# ----------------------------
# Define DataLoader parameters
batch_size = 32
shuffle = True

# Initialize the dataset
dataset = PerovskiteDataset(
    sequences=df['Cell_stack_sequence_enhanced'].tolist(),
    layer_to_id=layer_to_id,
    layer_features=layer_features
)

# Create DataLoader with num_workers=0 to avoid multiprocessing issues
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=0,  # Set to 0 to disable multiprocessing
    pin_memory=True  # If using GPU
)

# ----------------------------
# 8. Define the Transformer-Based Model
# ----------------------------
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, d_model)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len, d_model)
        """
        x = x + self.pe[:, :x.size(1), :]
        return x

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, feature_dim, nhead, num_encoder_layers, dim_feedforward, dropout=0.1, pad_idx=0):
        super(TransformerModel, self).__init__()
        
        # Token Embeddings
        self.token_embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        
        # Positional Encoding initialized with embed_dim + 32
        self.pos_encoder = PositionalEncoding(embed_dim + 32)
        
        # Feature MLP
        self.feature_mlp = nn.Sequential(
            nn.Linear(feature_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 32)
        )
        
        # Transformer Encoder with d_model = embed_dim + 32
        encoder_layers = nn.TransformerEncoderLayer(
            d_model=embed_dim + 32,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_encoder_layers)
        
        # Output Layer
        self.fc_out = nn.Linear(embed_dim + 32, vocab_size)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input_ids, features, src_key_padding_mask):
        """
        Args:
            input_ids: Tensor of shape (batch_size, seq_len)
            features: Tensor of shape (batch_size, seq_len, feature_dim)
            src_key_padding_mask: Tensor of shape (batch_size, seq_len), True for padding tokens
        Returns:
            logits: Tensor of shape (batch_size, seq_len, vocab_size)
        """
        # Embed tokens
        token_embeds = self.token_embedding(input_ids)  # (batch, seq, embed_dim)
        
        # Process features
        feature_embeds = self.feature_mlp(features)  # (batch, seq, 32)
        
        # Concatenate embeddings
        combined = torch.cat((token_embeds, feature_embeds), dim=2)  # (batch, seq, embed_dim + 32)
        
        # Add positional encoding
        combined = self.pos_encoder(combined)  # (batch, seq, embed_dim + 32)
        
        # Apply dropout
        combined = self.dropout(combined)
        
        # Prepare for Transformer: transpose to (seq, batch, embed_dim + 32)
        combined = combined.transpose(0, 1)  # (seq, batch, embed_dim + 32)
        
        # Pass through Transformer Encoder
        encoded = self.transformer_encoder(combined, src_key_padding_mask=src_key_padding_mask)  # (seq, batch, embed_dim + 32)
        
        # Transpose back to (batch, seq, embed_dim + 32)
        encoded = encoded.transpose(0, 1)  # (batch, seq, embed_dim + 32)
        
        # Output layer
        logits = self.fc_out(encoded)  # (batch, seq, vocab_size)
        
        return logits

# ----------------------------
# 9. Initialize Model, Loss, Optimizer
# ----------------------------
embed_dim = 128
nhead = 8
num_encoder_layers = 4
dim_feedforward = 256
dropout = 0.1
learning_rate = 1e-4
pad_idx = layer_to_id["<PAD>"]
num_epochs = 30

model = TransformerModel(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    feature_dim=feature_dim,
    nhead=nhead,
    num_encoder_layers=num_encoder_layers,
    dim_feedforward=dim_feedforward,
    dropout=dropout,
    pad_idx=pad_idx
)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# ----------------------------
# 10. Training Loop with Validation and Early Stopping
# ----------------------------
# Split the dataset into training and validation sets
train_sequences, val_sequences = train_test_split(
    df['Cell_stack_sequence_enhanced'].tolist(),
    test_size=0.2,
    random_state=42
)

# Create separate datasets
train_dataset = PerovskiteDataset(
    sequences=train_sequences,
    layer_to_id=layer_to_id,
    layer_features=layer_features
)

val_dataset = PerovskiteDataset(
    sequences=val_sequences,
    layer_to_id=layer_to_id,
    layer_features=layer_features
)

# Create DataLoaders
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,  # Set to 0 to disable multiprocessing
    pin_memory=True
)

val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,  # Set to 0 to disable multiprocessing
    pin_memory=True
)

# Define a validation loop
def validate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validation"):
            input_ids = batch["input_ids"].to(device)
            target_ids = batch["target_ids"].to(device)
            features = batch["features"].to(device)
            
            # Create src_key_padding_mask
            src_key_padding_mask = (input_ids == pad_idx)
            
            # Forward pass
            logits = model(input_ids, features, src_key_padding_mask=src_key_padding_mask)
            
            # Reshape for loss computation
            logits = logits.view(-1, vocab_size)
            targets = target_ids.view(-1)
            
            # Compute loss
            loss = criterion(logits, targets)
            total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Implement Early Stopping
best_val_loss = float('inf')
best_model_wts = copy.deepcopy(model.state_dict())
patience = 5
trigger_times = 0

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        target_ids = batch["target_ids"].to(device)
        features = batch["features"].to(device)
        
        # Create src_key_padding_mask
        src_key_padding_mask = (input_ids == pad_idx)
        
        # Forward pass
        logits = model(input_ids, features, src_key_padding_mask=src_key_padding_mask)
        
        # Reshape for loss computation
        logits = logits.view(-1, vocab_size)
        targets = target_ids.view(-1)
        
        # Compute loss
        loss = criterion(logits, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Accumulate loss
        epoch_loss += loss.item()
        
        # Update progress bar
        progress_bar.set_postfix({"Loss": loss.item()})
    
    # Calculate average training loss
    avg_train_loss = epoch_loss / len(train_dataloader)
    
    # Validate the model
    val_loss = validate(model, val_dataloader, criterion, device)
    
    print(f"Epoch [{epoch+1}/{num_epochs}] Training Loss: {avg_train_loss:.4f} | Validation Loss: {val_loss:.4f}")
    
    # Check for improvement
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model_wts = copy.deepcopy(model.state_dict())
        trigger_times = 0
        # Save the best model
        torch.save(model.state_dict(), 'best_transformer_perovskite_model.pth')
        print("Validation loss decreased. Saving the best model.")
    else:
        trigger_times += 1
        print(f"No improvement in validation loss for {trigger_times} epochs.")
        if trigger_times >= patience:
            print("Early stopping triggered.")
            break

# Load the best model weights
model.load_state_dict(best_model_wts)
print("\nTraining complete. Best model loaded.")

# ----------------------------
# 11. Generate New Sequences
# ----------------------------
def generate_sequence_greedy(model, layer_to_id, id_to_layer, layer_features, max_len=15, device='cpu'):
    """
    Generates a sequence using greedy decoding.

    Args:
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        layer_features (dict): Mapping from layer names to feature vectors.
        max_len (int): Maximum length of the generated sequence.
        device (torch.device): Device to run the model on.

    Returns:
        List[str]: Generated sequence of layers.
    """
    model.eval()
    generated_ids = [layer_to_id["<SOS>"]]
    generated_features = [layer_features["<SOS>"]]
    
    with torch.no_grad():
        for _ in range(max_len):
            # Prepare input tensors
            input_tensor = torch.tensor([generated_ids], dtype=torch.long).to(device)        # (1, seq)
            feature_tensor = torch.tensor([generated_features], dtype=torch.float).to(device)  # (1, seq, feature_dim)
            
            # Create src_key_padding_mask
            src_key_padding_mask = (input_tensor == layer_to_id["<PAD>"])  # (1, seq)
            
            # Get logits from the model
            logits = model(input_tensor, feature_tensor, src_key_padding_mask=src_key_padding_mask)  # (1, seq, vocab_size)
            
            # Get the last token's logits
            last_logits = logits[0, -1, :]  # (vocab_size)
            
            # Apply softmax to get probabilities
            probabilities = torch.softmax(last_logits, dim=0).cpu().numpy()
            
            # Select the token with the highest probability
            next_token_id = np.argmax(probabilities)
            
            # Append to generated_ids and generated_features
            generated_ids.append(next_token_id)
            generated_features.append(layer_features[id_to_layer[next_token_id]])
            
            # Stop if <EOS> is generated
            if next_token_id == layer_to_id["<EOS>"]:
                break
    
    # Convert generated IDs to layer names
    generated_sequence = [id_to_layer[id] for id in generated_ids]
    return generated_sequence

# ----------------------------
# 12. Optional: Visualize Embeddings with t-SNE
# ----------------------------
def visualize_embeddings_tsne(model, layer_to_id, id_to_layer, device='cpu'):
    """
    Visualizes the token embeddings using t-SNE.

    Args:
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        device (torch.device): Device to run the model on.
    """
    model.eval()
    with torch.no_grad():
        # Get embeddings for all layers
        layer_ids = torch.arange(vocab_size).to(device)  # (vocab_size)
        embeddings = model.token_embedding(layer_ids)    # (vocab_size, embed_dim)
        embeddings = embeddings.cpu().numpy()
    
    # Apply t-SNE
    tsne = TSNE(n_components=2, random_state=42)
    embeds_2d = tsne.fit_transform(embeddings)
    
    # Plot
    plt.figure(figsize=(12, 10))
    for idx, layer in id_to_layer.items():
        plt.scatter(embeds_2d[idx, 0], embeds_2d[idx, 1], label=layer, marker='o')
        plt.text(embeds_2d[idx, 0]+0.2, embeds_2d[idx, 1]+0.2, layer, fontsize=9)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title("Layer Embeddings Visualization with t-SNE")
    plt.xlabel("t-SNE Dimension 1")
    plt.ylabel("t-SNE Dimension 2")
    plt.grid(True)
    plt.show()

# ----------------------------
# 13. Generate and View Sequences
# ----------------------------
def generate_multiple_sequences(num_sequences, model, layer_to_id, id_to_layer, layer_features, max_len=15, min_len=3, device='cpu', temperature=1.0):
    """
    Generates multiple sequences and prints them.
    
    Args:
        num_sequences (int): Number of sequences to generate.
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        layer_features (dict): Mapping from layer names to feature vectors.
        max_len (int): Maximum length of the generated sequence.
        min_len (int): Minimum number of layers before <EOS> can be generated.
        device (torch.device): Device to run the model on.
        temperature (float): Scaling factor for logits.
    """
    for i in range(num_sequences):
        seq = generate_sequence_with_min_length(
            model=model,
            layer_to_id=layer_to_id,
            id_to_layer=id_to_layer,
            layer_features=layer_features,
            max_len=max_len,
            min_len=min_len,
            device=device,
            temperature=temperature
        )
        print(f"\nGenerated Sequence {i + 1}:")
        print(" | ".join(seq))

# ----------------------------
# 14. Generate 5 Sequences to See the Model's Behavior
# ----------------------------
if __name__ == "__main__":
    # Generate 5 sequences
    generate_multiple_sequences(
        num_sequences=5,
        model=model,
        layer_to_id=layer_to_id,
        id_to_layer=id_to_layer,
        layer_features=layer_features,
        max_len=15,
        min_len=3,
        device=device,
        temperature=0.8  # Adjust temperature as desired
    )
    
    # Optional: Visualize Embeddings with t-SNE
    visualize_embeddings_tsne(model, layer_to_id, id_to_layer, device=device)


  df = pd.read_csv('perovskite_database_query.csv')


Initial DataFrame:
   Ref_ID  Ref_ID_temp Ref_name_of_person_entering_the_data  \
0       1            1                       Adam Hultqvist   
1       2            2                       Adam Hultqvist   
2       3            3                       Adam Hultqvist   
3       4            4                       Adam Hultqvist   
4       5            5                       Adam Hultqvist   

   Ref_data_entered_by_author     Ref_DOI_number Ref_lead_author  \
0                       False  10.1021/jp5126624    Sabba et al.   
1                       False  10.1021/jp5126624    Sabba et al.   
2                       False  10.1021/jp5126624    Sabba et al.   
3                       False  10.1021/jp5126624    Sabba et al.   
4                       False  10.1021/jp5126624    Sabba et al.   

  Ref_publication_date                          Ref_journal  \
0           2015-01-06  The Journal of Physical Chemistry C   
1           2015-01-06  The Journal of Physical Chemistry C   
2   




Sample Enhanced Sequences:
                        Cell_stack_sequence_enhanced
0  SLG | FTO | TiO2-c | TiO2-mp | CsSnI3 | Spiro-...
1  SLG | FTO | TiO2-c | TiO2-mp | CsSnBr0.3I2.7 |...
2  SLG | FTO | TiO2-c | TiO2-mp | CsSnBr1.5I1.5 |...
3  SLG | FTO | TiO2-c | TiO2-mp | CsSnBr2.7I0.3 |...
4  SLG | FTO | TiO2-c | TiO2-mp | CsSnBr3 | Spiro...

Layer to ID Mapping:
<PAD>: 0
<SOS>: 1
<EOS>: 2
((CH3)3S)2SnBr2I4: 3
((CH3)3S)2SnBrI5: 4
((CH3)3S)2SnCl2I4: 5
((CH3)3S)2SnClI5: 6
((CH3)3S)2SnI6: 7
(1,6-di{3-[2-(4- methylphenyl)vinyl]carbazol-9-yl}hexane: 8
(1.3-Pr(NH3)2)0.5Pb1.0I3: 9
(10-butyl-3,7-diphenylphenoxazine): 10
(2Z,2'Z)-2,2'-(((2,4-dimethylphenyl) azanediyl) bis([1,1'-biphenyl]-4',4-diyl)) bis(3-(4-(diphenylamino) phenyl) acrylonitrile: 11
(2Z,2'Z)-2,2'-((10-(2-ethylhexyl)-10H-phenothiazine-3,7-diyl) bis(4,1- phenylene)) bis(3-(4-(diphenylamino) phenyl) acrylonitrile: 12
(2Z,2′Z)-3,3′- (5,5′-(2,7-dioctyl-1,3,6,8-tetraoxo-1,2,3,6,7,8-hexahydrobenzo [lmn][3,8]phenanthroline-4,9-diyl)b

Epoch 1/30: 100%|██████████| 1075/1075 [00:52<00:00, 20.55it/s, Loss=1.02] 
Validation: 100%|██████████| 269/269 [00:01<00:00, 138.49it/s]


Epoch [1/30] Training Loss: 1.7167 | Validation Loss: 0.7684
Validation loss decreased. Saving the best model.


Epoch 2/30: 100%|██████████| 1075/1075 [00:53<00:00, 20.28it/s, Loss=0.452]
Validation: 100%|██████████| 269/269 [00:02<00:00, 123.40it/s]


Epoch [2/30] Training Loss: 0.6692 | Validation Loss: 0.5232
Validation loss decreased. Saving the best model.


Epoch 3/30: 100%|██████████| 1075/1075 [00:55<00:00, 19.27it/s, Loss=0.28] 
Validation: 100%|██████████| 269/269 [00:02<00:00, 100.91it/s]


Epoch [3/30] Training Loss: 0.4944 | Validation Loss: 0.4147
Validation loss decreased. Saving the best model.


Epoch 4/30: 100%|██████████| 1075/1075 [00:52<00:00, 20.28it/s, Loss=0.228]
Validation: 100%|██████████| 269/269 [00:02<00:00, 114.54it/s]


Epoch [4/30] Training Loss: 0.3959 | Validation Loss: 0.3435
Validation loss decreased. Saving the best model.


Epoch 5/30:  20%|█▉        | 213/1075 [00:10<00:44, 19.58it/s, Loss=0.376]


KeyboardInterrupt: 

  df = pd.read_csv('perovskite_database_query.csv')


Initial DataFrame:
   Ref_ID  Ref_ID_temp Ref_name_of_person_entering_the_data  \
0       1            1                       Adam Hultqvist   
1       2            2                       Adam Hultqvist   
2       3            3                       Adam Hultqvist   
3       4            4                       Adam Hultqvist   
4       5            5                       Adam Hultqvist   

   Ref_data_entered_by_author     Ref_DOI_number Ref_lead_author  \
0                       False  10.1021/jp5126624    Sabba et al.   
1                       False  10.1021/jp5126624    Sabba et al.   
2                       False  10.1021/jp5126624    Sabba et al.   
3                       False  10.1021/jp5126624    Sabba et al.   
4                       False  10.1021/jp5126624    Sabba et al.   

  Ref_publication_date                          Ref_journal  \
0           2015-01-06  The Journal of Physical Chemistry C   
1           2015-01-06  The Journal of Physical Chemistry C   
2   

KeyError: 'Cell_stack_sequence_enhanced'

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

# Optional: For visualization
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

############################################################
# 1. Define Special Tokens
############################################################

SPECIAL_TOKENS = {
    "<PAD>": 0,
    "<SOS>": 1,
    "<EOS>": 2
}

############################################################
# 2. Create a Toy Dataset (Replace with Actual Data)
############################################################

# Example data: list of layer sequences as strings
# In practice, load your data from a CSV or other source
# For example:
# df = pd.read_csv('your_data.csv')
# sequences = df['Cell_stack_sequence'].tolist()

# Creating a toy DataFrame for demonstration
data = {
    'Cell_stack_sequence': [
        "SLG,ITO,TiO2-c,Perovskite,Spiro-MeOTAD,Au",
        "ITO,Perovskite,Au",
        "SLG,FTO,TiO2,Perovskite,HTL,Au",
        "SLG,ITO,TiO2-c,Perovskite,Spiro-MeOTAD,Au",
        "ITO,TiO2-c,Perovskite,Au",
        # Add more sequences as needed
    ]
}

df = pd.DataFrame(data)

# Split the sequences into lists
df['Cell_stack_sequence'] = df['Cell_stack_sequence'].apply(lambda x: x.split(','))

############################################################
# 3. Automatically Build layer_to_id Dictionary
############################################################

def build_layer_to_id(sequences, special_tokens):
    """
    Builds a dictionary mapping each unique layer to a unique integer ID.
    
    Args:
        sequences (List[List[str]]): List of layer sequences.
        special_tokens (dict): Dictionary of special tokens and their IDs.
    
    Returns:
        layer_to_id (dict): Mapping from layer names to unique IDs.
    """
    unique_layers = set()
    for seq in sequences:
        unique_layers.update(seq)
    unique_layers = sorted(list(unique_layers))  # Sort for consistency
    
    # Start assigning IDs after the special tokens
    layer_to_id = {token: idx for token, idx in special_tokens.items()}
    current_id = len(special_tokens)
    for layer in unique_layers:
        if layer not in layer_to_id:
            layer_to_id[layer] = current_id
            current_id += 1
    return layer_to_id

layer_to_id = build_layer_to_id(df['Cell_stack_sequence'].tolist(), SPECIAL_TOKENS)
id_to_layer = {v: k for k, v in layer_to_id.items()}
vocab_size = len(layer_to_id)

print("Layer to ID Mapping:")
for layer, idx in layer_to_id.items():
    print(f"{layer}: {idx}")

############################################################
# 4. Define Layer Features (Chemical/Structural Descriptors)
############################################################

# Example: Assigning a feature vector to each layer
# In practice, these should be meaningful chemical/structural descriptors
# For demonstration, we'll use random vectors or predefined ones

# Define a feature dimension
feature_dim = 5  # Example: 5 chemical/structural features

# Assign a feature vector to each layer
np.random.seed(42)  # For reproducibility
layer_features = {}
for layer in layer_to_id.keys():
    if layer in ["<PAD>", "<SOS>", "<EOS>"]:
        layer_features[layer] = [0.0] * feature_dim
    else:
        # Random features for demonstration; replace with actual descriptors
        layer_features[layer] = np.random.rand(feature_dim).tolist()

print("\nLayer Features:")
for layer, features in layer_features.items():
    print(f"{layer}: {features}")

############################################################
# 5. Dataset Class with Padding and Feature Extraction
############################################################

class DeviceStackDataset(Dataset):
    def __init__(self, sequences, layer_to_id, layer_features, max_len=None):
        """
        Args:
            sequences (List[List[str]]): List of layer sequences.
            layer_to_id (dict): Mapping from layer names to integer IDs.
            layer_features (dict): Mapping from layer names to feature vectors.
            max_len (int, optional): Maximum sequence length. If None, uses the longest sequence.
        """
        self.sequences = sequences
        self.layer_to_id = layer_to_id
        self.layer_features = layer_features
        if not max_len:
            self.max_len = max(len(seq) + 2 for seq in sequences)  # +2 for <SOS> and <EOS>
        else:
            self.max_len = max_len
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        seq = self.sequences[idx]
        
        # Add <SOS> and <EOS> tokens
        seq = ["<SOS>"] + seq + ["<EOS>"]
        
        # Convert tokens to IDs
        seq_ids = [self.layer_to_id[token] for token in seq]
        
        # Pad sequence
        padded_seq_ids = seq_ids + [self.layer_to_id["<PAD>"]] * (self.max_len - len(seq_ids))
        
        # Create input and target sequences
        # Input: <SOS> to second last token
        # Target: first token to <EOS>
        input_ids = padded_seq_ids[:-1]
        target_ids = padded_seq_ids[1:]
        
        # Extract and pad features
        features = [self.layer_features[token] for token in seq]
        padded_features = features + [[0.0] * feature_dim] * (self.max_len - len(seq))
        padded_features = padded_features[:-1]  # Align with input_ids
        
        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "target_ids": torch.tensor(target_ids, dtype=torch.long),
            "features": torch.tensor(padded_features, dtype=torch.float)
        }

############################################################
# 6. Create DataLoader for Batching
############################################################

# Initialize the dataset
dataset = DeviceStackDataset(
    sequences=df['Cell_stack_sequence'].tolist(),
    layer_to_id=layer_to_id,
    layer_features=layer_features
)

# Create DataLoader
batch_size = 2
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

############################################################
# 7. Define the Transformer-Based Model
############################################################

class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=5000):
        super(PositionalEncoding, self).__init__()
        
        # Create a long enough P matrix
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-np.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # Shape: (1, max_len, embed_dim)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, seq_len, embed_dim)
        """
        x = x + self.pe[:, :x.size(1), :]
        return x

class TransformerGenerator(nn.Module):
    def __init__(self, vocab_size, embed_dim, feature_dim, hidden_dim, num_layers=2, nhead=8, dropout=0.1, pad_idx=0):
        super(TransformerGenerator, self).__init__()
        
        # Embedding layers
        self.token_embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.pos_encoder = PositionalEncoding(embed_dim)
        
        # MLP for feature enhancement
        self.feature_mlp = nn.Sequential(
            nn.Linear(feature_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 32)
        )
        
        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim + 32, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        
        # Decoder: maps Transformer outputs to vocabulary logits
        self.fc_out = nn.Linear(embed_dim + 32, vocab_size)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input_ids, features, src_key_padding_mask=None):
        """
        Args:
            input_ids: Tensor of shape (batch_size, seq_len)
            features: Tensor of shape (batch_size, seq_len, feature_dim)
            src_key_padding_mask: Tensor of shape (batch_size, seq_len), True for padding tokens
        Returns:
            logits: Tensor of shape (batch_size, seq_len, vocab_size)
        """
        # 1. Token Embedding
        token_embeds = self.token_embedding(input_ids)  # (batch, seq, embed_dim)
        
        # 2. Feature Enhancement
        feature_embeds = self.feature_mlp(features)  # (batch, seq, 32)
        
        # 3. Concatenate Token Embeddings and Feature Embeddings
        combined = torch.cat((token_embeds, feature_embeds), dim=2)  # (batch, seq, embed_dim + 32)
        
        # 4. Positional Encoding
        combined = self.pos_encoder(combined)  # (batch, seq, embed_dim + 32)
        
        # 5. Prepare for Transformer: Transformer expects (seq, batch, feature)
        combined = combined.transpose(0, 1)  # (seq, batch, embed_dim + 32)
        
        # 6. Transformer Encoding
        encoded = self.transformer_encoder(combined, src_key_padding_mask=src_key_padding_mask)  # (seq, batch, embed_dim + 32)
        
        # 7. Map to Vocabulary
        encoded = encoded.transpose(0, 1)  # (batch, seq, embed_dim + 32)
        logits = self.fc_out(encoded)  # (batch, seq, vocab_size)
        
        return logits

############################################################
# 8. Initialize the Model, Loss Function, and Optimizer
############################################################

# Hyperparameters
embed_dim = 128
hidden_dim = 256  # Not used directly in Transformer, but kept for possible extensions
num_layers = 2
nhead = 8
dropout = 0.1
learning_rate = 1e-4
pad_idx = layer_to_id["<PAD>"]

# Initialize the model
model = TransformerGenerator(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    feature_dim=feature_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    nhead=nhead,
    dropout=dropout,
    pad_idx=pad_idx
)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

############################################################
# 9. Training Loop with Padding Mask
############################################################

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)      # (batch, seq)
        target_ids = batch["target_ids"].to(device)    # (batch, seq)
        features = batch["features"].to(device)        # (batch, seq, feature_dim)
        
        # Create src_key_padding_mask: True for padded elements
        src_key_padding_mask = (input_ids == pad_idx)  # (batch, seq)
        
        # Forward pass
        logits = model(input_ids, features, src_key_padding_mask=src_key_padding_mask)  # (batch, seq, vocab_size)
        
        # Compute loss
        # Reshape logits and targets for CrossEntropyLoss
        logits = logits.view(-1, vocab_size)         # (batch * seq, vocab_size)
        targets = target_ids.view(-1)                # (batch * seq)
        loss = criterion(logits, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

############################################################
# 10. Sequence Generation Function
############################################################

def generate_sequence(model, layer_to_id, id_to_layer, layer_features, max_len=10, device='cpu'):
    """
    Generates a sequence starting from <SOS> until <EOS> or max_len.
    
    Args:
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        layer_features (dict): Mapping from layer names to feature vectors.
        max_len (int): Maximum length of the generated sequence.
        device (torch.device): Device to run the model on.
    
    Returns:
        List[str]: Generated sequence of layers.
    """
    model.eval()
    generated_ids = [layer_to_id["<SOS>"]]
    generated_features = [layer_features["<SOS>"]]
    
    with torch.no_grad():
        for _ in range(max_len):
            input_tensor = torch.tensor([generated_ids], dtype=torch.long).to(device)        # (1, seq)
            feature_tensor = torch.tensor([generated_features], dtype=torch.float).to(device)  # (1, seq, feature_dim)
            
            # Create src_key_padding_mask
            src_key_padding_mask = (input_tensor == pad_idx)  # (1, seq)
            
            # Get logits from the model
            logits = model(input_tensor, feature_tensor, src_key_padding_mask=src_key_padding_mask)  # (1, seq, vocab_size)
            
            # Get the last token's logits
            last_logits = logits[0, -1, :]  # (vocab_size)
            
            # Apply softmax to get probabilities
            probabilities = torch.softmax(last_logits, dim=0).cpu().numpy()
            
            # Sample the next token (greedy)
            next_token_id = np.argmax(probabilities)
            
            # Append to generated_ids and features
            generated_ids.append(next_token_id)
            generated_features.append(layer_features[id_to_layer[next_token_id]])
            
            # Check for <EOS>
            if next_token_id == layer_to_id["<EOS>"]:
                break
    
    # Convert generated IDs to layer names
    generated_sequence = [id_to_layer[id] for id in generated_ids]
    return generated_sequence

############################################################
# 11. Example of Generating a Sequence
############################################################

# Generate a sequence
generated_seq = generate_sequence(
    model=model,
    layer_to_id=layer_to_id,
    id_to_layer=id_to_layer,
    layer_features=layer_features,
    max_len=10,
    device=device
)

print("\nGenerated Sequence:")
print(generated_seq)

############################################################
# 12. Optional: Visualizing Embeddings with t-SNE
############################################################

def visualize_embeddings(model, layer_to_id, id_to_layer, device='cpu'):
    """
    Visualizes the token embeddings using t-SNE.
    
    Args:
        model (nn.Module): Trained Transformer model.
        layer_to_id (dict): Mapping from layer names to IDs.
        id_to_layer (dict): Mapping from IDs to layer names.
        device (torch.device): Device to run the model on.
    """
    model.eval()
    with torch.no_grad():
        # Get embeddings for all layers
        layer_ids = torch.arange(vocab_size).to(device)  # (vocab_size)
        embeddings = model.token_embedding(layer_ids)    # (vocab_size, embed_dim)
        embeddings = embeddings.cpu().numpy()
    
    # Apply t-SNE
    tsne = TSNE(n_components=2, random_state=42)
    embeds_2d = tsne.fit_transform(embeddings)
    
    # Plot
    plt.figure(figsize=(12, 10))
    for idx, layer in id_to_layer.items():
        plt.scatter(embeds_2d[idx, 0], embeds_2d[idx, 1], label=layer, marker='o')
        plt.text(embeds_2d[idx, 0]+0.2, embeds_2d[idx, 1]+0.2, layer, fontsize=9)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title("Layer Embeddings Visualization with t-SNE")
    plt.xlabel("t-SNE Dimension 1")
    plt.ylabel("t-SNE Dimension 2")
    plt.grid(True)
    plt.show()

# Uncomment the following line to visualize embeddings
# visualize_embeddings(model, layer_to_id, id_to_layer, device=device)


Layer to ID Mapping:
<PAD>: 0
<SOS>: 1
<EOS>: 2
Au: 3
FTO: 4
HTL: 5
ITO: 6
Perovskite: 7
SLG: 8
Spiro-MeOTAD: 9
TiO2: 10
TiO2-c: 11

Layer Features:
<PAD>: [0.0, 0.0, 0.0, 0.0, 0.0]
<SOS>: [0.0, 0.0, 0.0, 0.0, 0.0]
<EOS>: [0.0, 0.0, 0.0, 0.0, 0.0]
Au: [0.3745401188473625, 0.9507143064099162, 0.7319939418114051, 0.5986584841970366, 0.15601864044243652]
FTO: [0.15599452033620265, 0.05808361216819946, 0.8661761457749352, 0.6011150117432088, 0.7080725777960455]
HTL: [0.020584494295802447, 0.9699098521619943, 0.8324426408004217, 0.21233911067827616, 0.18182496720710062]
ITO: [0.18340450985343382, 0.3042422429595377, 0.5247564316322378, 0.43194501864211576, 0.2912291401980419]
Perovskite: [0.6118528947223795, 0.13949386065204183, 0.29214464853521815, 0.3663618432936917, 0.45606998421703593]
SLG: [0.7851759613930136, 0.19967378215835974, 0.5142344384136116, 0.5924145688620425, 0.046450412719997725]
Spiro-MeOTAD: [0.6075448519014384, 0.17052412368729153, 0.06505159298527952, 0.9488855372533332



RuntimeError: The size of tensor a (160) must match the size of tensor b (128) at non-singleton dimension 2