This notebook provides a comprehensive guide to using the trained AtomGPT model for making predictions on new atomic structure descriptions. The key steps include:

Setup and Imports: Import necessary libraries and set the computation device.

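Load Configuration: Load the configuration saved by the training run so that the model name, latent dimension, maximum sequence length, and batch size used here match training.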

Initialize Tokenizer and Model: Set up the tokenizer and model architecture, then load the trained weights.

Define the Prediction Dataset Class: Create a dataset class to handle input descriptions.

Prepare Input Data for Prediction: Provide the descriptions for which you want to predict properties.

Create DataLoader for Predictions: Batch the input data for efficient processing.

Make Predictions: Pass the input through the model to obtain predictions.

Process and Save Predictions: Organize the predictions and save them for further analysis.

(Optional) Visualize Predictions: Visualize the results to gain insights.

Note: Ensure that all file paths (e.g., the `config_path` variable and, if you load descriptions from a file, `input_descriptions.csv`) are set correctly for your project structure. Also replace the placeholder entries in the `input_descriptions` list with the actual descriptions you want predictions for.

In [None]:
# Import necessary libraries
import os
import json
import torch
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, T5ForConditionalGeneration
from jarvis.db.jsonutils import loadjson, dumpjson
from jarvis.core.atoms import Atoms
from tqdm import tqdm

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


In [None]:
# Location of the configuration file written by the training run
config_path = "out_temp/config.json"  # Update this path as needed

# Read the configuration and echo it (pretty-printed) so the settings that
# the following cells rely on can be checked by eye
config = loadjson(config_path)
print("Loaded Configuration:", json.dumps(config, indent=4), sep="\n")


In [None]:
# Settings that decide which model to rebuild and where its weights live
model_name = config['model_name']
pretrained_path = os.path.join(config['output_dir'], "best_model.pt")
desc_type = config['desc_type']

# T5 checkpoints use their dedicated tokenizer/model classes; every other
# model name goes through the generic causal-LM auto classes.
is_t5 = "t5" in model_name
tokenizer_cls = T5Tokenizer if is_t5 else AutoTokenizer
model_cls = T5ForConditionalGeneration if is_t5 else AutoModelForCausalLM
tokenizer = tokenizer_cls.from_pretrained(model_name)
model = model_cls.from_pretrained(model_name)

# When the tokenizer ships without a padding token, register the special
# tokens and grow the embedding matrix to the new vocabulary size.
# NOTE(review): "unk_token" is registered three times with different symbols;
# presumably this mirrors the training script so the vocabulary matches the
# checkpoint -- confirm before changing the sequence.
if tokenizer.pad_token is None:
    for token_spec in (
        {"pad_token": "[PAD]"},
        {"unk_token": "#"},
        {"unk_token": "&"},
        {"unk_token": "@"},
    ):
        tokenizer.add_special_tokens(token_spec)
    model.resize_token_embeddings(len(tokenizer))

# Swap the language-modeling head for a two-layer linear regression head
# (hidden_size -> latent_dim -> 1). This must happen before the state dict is
# loaded so the checkpoint's head weights have matching parameters to fill.
latent_dim = config['latent_dim']
hidden_size = model.config.hidden_size
model.lm_head = torch.nn.Sequential(
    torch.nn.Linear(hidden_size, latent_dim),
    torch.nn.Linear(latent_dim, 1),
)

# Restore the trained weights, then move the model to the target device and
# switch it to evaluation mode
trained_state = torch.load(pretrained_path, map_location=device)
model.load_state_dict(trained_state)
model.to(device)
model.eval()

print("Model and tokenizer loaded successfully.")


In [None]:
class PredictionDataset(Dataset):
    """Dataset of raw text descriptions that are tokenized on access.

    Args:
        texts: Indexable collection of description strings.
        tokenizer: Callable invoked as ``tokenizer(text, **options)``.
        max_length: Length every sample is padded or truncated to.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        """Store the descriptions, the tokenizer and the target length."""
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        """Return how many descriptions the dataset holds."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize and return the description stored at position ``idx``."""
        description = self.texts[idx]
        # Pad/truncate to a fixed length so samples can be stacked into a batch
        tokenizer_options = {
            "return_tensors": "pt",
            "max_length": self.max_length,
            "padding": "max_length",
            "truncation": True,
        }
        return self.tokenizer(description, **tokenizer_options)


In [None]:
# Example input descriptions. These strings are placeholders: replace them
# with the text descriptions you want the model to score.
input_descriptions = [
    "Description of material 1 with atomic structure details...",
    "Description of material 2 with atomic structure details...",
    # Add more descriptions as needed
]

# Alternatively, load descriptions from a JSON or CSV file
# For example (this snippet expects a CSV file with a 'desc' column):
# descriptions_df = pd.read_csv("input_descriptions.csv")
# input_descriptions = descriptions_df['desc'].tolist()

# Report how many descriptions will be passed on to the following cells
print(f"Number of input descriptions: {len(input_descriptions)}")


In [None]:
# Wrap the descriptions in a dataset that tokenizes them to the sequence
# length stored in the training configuration
prediction_dataset = PredictionDataset(
    input_descriptions,
    tokenizer,
    max_length=config['max_length'],
)

# Reuse the batch size stored in the training configuration
batch_size = config['batch_size']

# Keep the input order (no shuffling) so each prediction can later be paired
# with the description at the same position in input_descriptions
prediction_dataloader = DataLoader(
    dataset=prediction_dataset,
    batch_size=batch_size,
    shuffle=False,
)


In [None]:
# Collect one scalar prediction per input description, in dataloader order
predictions = []

# T5 is called with decoder inputs in addition to the encoder inputs; decide
# this once instead of re-testing the model name for every batch.
uses_t5 = "t5" in model_name

# Disable gradient computation for inference
with torch.no_grad():
    for batch in tqdm(prediction_dataloader, desc="Making Predictions"):
        # Each sample is tokenized on its own with return_tensors="pt", so the
        # collated batch presumably has shape (batch, 1, seq_len); drop the
        # singleton axis before moving the ids to the device.
        input_ids = batch['input_ids'].squeeze(1).to(device)

        model_inputs = {"input_ids": input_ids}
        if uses_t5:
            # The same token ids are fed to the decoder
            model_inputs["decoder_input_ids"] = input_ids
        # NOTE(review): no attention_mask is passed, so padded positions take
        # part in the token average below; presumably this matches how the
        # model was trained -- confirm before changing it.

        outputs = model(**model_inputs)

        # model.lm_head was replaced by the regression head when the model was
        # built, so outputs.logits already holds the head's output with a
        # trailing dimension of size 1. Drop that dimension and average over
        # the token axis to get one value per sample. The head must NOT be
        # applied a second time: its first Linear layer expects hidden_size
        # input features, not a vector of per-sample scalars.
        batch_predictions = outputs.logits.squeeze(-1).mean(dim=-1)

        # Move to CPU and store as plain Python floats
        predictions.extend(batch_predictions.cpu().tolist())

print("Predictions completed.")


In [None]:
# Pair every input description with the prediction at the same position
results_df = pd.DataFrame(
    data={
        'Description': input_descriptions,
        'Predicted_Property': predictions,
    }
)

# Show the table in the notebook output
print(results_df)

# Write the table (without the index column) into the configured output
# directory as predictions.csv
output_predictions_path = os.path.join(config['output_dir'], "predictions.csv")
results_df.to_csv(output_predictions_path, index=False)
print(f"Predictions saved to {output_predictions_path}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Example: histogram (with a KDE curve) of the predicted property values,
# drawn on an explicitly created 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(results_df['Predicted_Property'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Predicted Properties')
ax.set_xlabel('Predicted Property')
ax.set_ylabel('Frequency')
plt.show()

# If you have actual properties for comparison:
# results_df['Actual_Property'] = actual_properties_list
# sns.scatterplot(data=results_df, x='Actual_Property', y='Predicted_Property')
# plt.title('Actual vs Predicted Properties')
# plt.xlabel('Actual Property')
# plt.ylabel('Predicted Property')
# plt.show()
