### ChemBERT embeddings of PSC stacks

Using pre-trained ChemBERT models to capture chemical similarities/meaning between different materials in each layer of the PSC. Focusing exclusively on single-layered perovskites (>99% of the database).

1. The Perovskite layer

In [1]:
# embeddings for PSC cell stack: substrate, ETL, perovskite, HTL and backcontact 

import transformers
print(transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


4.48.0


1.1 Extract individual ions (including additives) from perovskite compositions

In [11]:
from transformers import AutoTokenizer, AutoModel

# single layer perovskites
# convert perovskite composition into SMILE strings

import pandas as pd
import re
import requests

df = pd.read_csv(r"C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Vector embedding\filtered_original_data.csv")

# Step 1: Filter Data to Keep Only Single Layered Rows
df_filtered = df[df["Layer Type"] == "Single-layered Perovskite"]

# Column of interest
column_name = "Perovskite_composition_short_form"

# Find unique entries and count them
unique_compositions = df_filtered[column_name].unique()
num_unique_entries = len(unique_compositions)

# Print results
print(f"Number of unique entries in '{column_name}': {num_unique_entries}")
# print("Unique entries:", unique_compositions)

import re
from pubchempy import get_compounds

# Placeholder for ion-to-SMILES dictionary
ion_to_smiles = {}

# Function to query PubChem for SMILES
def fetch_smiles(ion):
    try:
        compound = get_compounds(ion, 'name')
        if compound:
            return compound[0].canonical_smiles
        else:
            return None
    except Exception as e:
        print(f"Error fetching SMILES for ion {ion}: {e}")
        return None

# Function to extract ions from a composition
def extract_ions(composition):
    # Extract additive in parentheses
    additive_match = re.search(r'\((.*?)\)', composition)
    additive = additive_match.group(1) if additive_match else None
    # Remove the additive part
    base_comp = re.sub(r'\(.*?\)', '', composition)
    # Split base composition into individual ions
    ions = re.findall(r'[A-Z][a-z]?\d*|[A-Za-z]+', base_comp)  # Matches elements and ion groups
    if additive:
        ions.append(additive)  # Include additive
    return ions

# Process each composition
composition_to_smiles = {}
for comp in unique_compositions:
    # Step 1: Extract individual ions
    ions = extract_ions(comp)
    print(f"Extracted ions for {comp}: {ions}")
    
    # Step 2: Convert ions to SMILES
    smiles_list = []
    for ion in ions:
        if ion not in ion_to_smiles:
            # Fetch SMILES if not already in the dictionary
            smiles = fetch_smiles(ion)
            if smiles:
                ion_to_smiles[ion] = smiles
            else:
                ion_to_smiles[ion] = "UNKNOWN"  # Placeholder for unresolved ions
        smiles_list.append(ion_to_smiles[ion])
    
    # Step 3: Combine individual SMILES
    combined_smiles = ".".join(smiles_list)
    composition_to_smiles[comp] = combined_smiles

# Output results
print("\nIon-to-SMILES Mapping:")
for ion, smiles in ion_to_smiles.items():
    print(f"{ion}: {smiles}")

print("\nComposition-to-SMILES Mapping:")
for comp, smiles in composition_to_smiles.items():
    print(f"{comp}: {smiles}")

Number of unique entries in 'Perovskite_composition_short_form': 315
Extracted ions for CsSnI: ['Cs', 'Sn', 'I']
Extracted ions for CsSnBrI: ['Cs', 'Sn', 'Br', 'I']
Extracted ions for CsSnBr: ['Cs', 'Sn', 'Br']
Extracted ions for MAPbI: ['M', 'A', 'Pb', 'I']
Extracted ions for MAPbBr: ['M', 'A', 'Pb', 'Br']
Extracted ions for CsFAMAPbBrI: ['Cs', 'F', 'A', 'M', 'A', 'Pb', 'Br', 'I']
Extracted ions for CsFAGUMAPbBrI: ['Cs', 'F', 'A', 'G', 'U', 'M', 'A', 'Pb', 'Br', 'I']
Extracted ions for CsFAGUPbBrI: ['Cs', 'F', 'A', 'G', 'U', 'Pb', 'Br', 'I']
Extracted ions for MAPbBrI: ['M', 'A', 'Pb', 'Br', 'I']
Extracted ions for FAMAPbBrI: ['F', 'A', 'M', 'A', 'Pb', 'Br', 'I']
Extracted ions for CsPbBrI: ['Cs', 'Pb', 'Br', 'I']
Extracted ions for CsFAPbBrI: ['Cs', 'F', 'A', 'Pb', 'Br', 'I']
Extracted ions for FAPbI: ['F', 'A', 'Pb', 'I']
Extracted ions for CsMAPbBrI: ['Cs', 'M', 'A', 'Pb', 'Br', 'I']
Extracted ions for CsSbI: ['Cs', 'Sb', 'I']
Extracted ions for FASnI: ['F', 'A', 'Sn', 'I']
Extract

1.2 Automate SMILES mapping for each individual ions (using online database PubChem)

In [None]:
# Step 3: Automatically Populate the Ion-to-SMILES Dictionary
def fetch_smiles_from_pubchem(ion_name):
    """Fetch the SMILES string for a given ion from PubChem."""
    try:
        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{ion_name}/property/CanonicalSMILES/JSON"
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        smiles = data['PropertyTable']['Properties'][0]['CanonicalSMILES']
        return smiles
    except Exception as e:
        print(f"Warning: SMILES not found for ion {ion_name}. Error: {e}")
        return None

# Create initial dictionary with None values
ion_to_smiles = {ion: None for ion in unique_ions}

# Fetch SMILES for each ion
for ion in ion_to_smiles:
    ion_to_smiles[ion] = fetch_smiles_from_pubchem(ion)

# Check the completed dictionary
print("Ion-to-SMILES Dictionary:", ion_to_smiles)

# Step 4: Parse Compositions and Convert to SMILES
def parse_composition_to_smiles(composition, ion_to_smiles):
    """Convert a perovskite composition into a SMILES string."""
    # Split the composition into ions and coefficients
    ion_pattern = r'[A-Za-z]+'  # Matches ion names like MA, FA, Cs, etc.
    coeff_pattern = r'[0-9.]+'
    ions = re.findall(ion_pattern, composition)
    coefficients = re.findall(coeff_pattern, composition)

    if not coefficients:  # Handle cases like "MAPbI3" with implicit coefficients
        coefficients = [1] * len(ions)
    else:
        coefficients = list(map(float, coefficients))

    # Combine ions and coefficients into a single SMILES string
    smiles_parts = []
    for ion, coeff in zip(ions, coefficients):
        if ion in ion_to_smiles and ion_to_smiles[ion]:
            smiles = ion_to_smiles[ion]
            # Add the coefficient next to the SMILES string
            smiles_parts.append(f"{smiles}({coeff})")
        else:
            print(f"Warning: SMILES not found for ion {ion}")
            return None

    return ".".join(smiles_parts)

# Apply the conversion to the filtered DataFrame
df_filtered["Perovskite_SMILES"] = df_filtered["Perovskite_composition_short_form"].apply(
    lambda x: parse_composition_to_smiles(x, ion_to_smiles)
)

# Output the updated DataFrame
print(df_filtered)

1.3 Combine SMILES for each material (concatenate SMILES of the individual ions)

### Method 2 - Using individual ion columns to create perovskite SMILES
1. Create a dictionary to convert ion names to their respective SMILES
2. Weight the SMILES by coefficients in perovskite material
3. Concatenate SMILES in each perovskite 

1. Take data_with_layer_type_and_combined.csv, replacing the ion columns with SMILE columns (for the perovskite layer) and removing the 'combined' columns. Updated dataframe is saved as bert_test_data.csv.

In [37]:
import pandas as pd
import pubchempy as pcp

# Load the dataset
df = pd.read_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\data_with_layer_type_and_combined.csv')

# Specify the range of ion columns
start_column = "NMA"  # Replace with the name of your start column
end_column = "CH3ND3"  # Replace with the name of your end column

# Extract the ion columns
columns = list(df.columns)
start_idx = columns.index(start_column)
end_idx = columns.index(end_column)
ion_columns = columns[start_idx:end_idx + 1]  # Include both start and end columns

# Function to fetch SMILES from PubChem
def fetch_smiles(ion):
    try:
        # Search PubChem for the ion and get the first result's SMILES
        compound = pcp.get_compounds(ion, 'name')
        if compound:
            return compound[0].canonical_smiles
        else:
            return None
    except Exception as e:
        print(f"Error fetching SMILES for {ion}: {e}")
        return None

# Populate ion-to-SMILES dictionary dynamically for the ion_columns
missing_smiles_count = 0
ion_to_smiles = {}
ions_with_missing_smiles = set()

for ion in ion_columns:
    smiles = fetch_smiles(ion)
    if smiles:
        ion_to_smiles[ion] = smiles
    else:
        ions_with_missing_smiles.add(ion)
        missing_smiles_count += 1

# Print the missing SMILES warning
print(f"\nTotal warnings for missing SMILES: {missing_smiles_count}")
print(f"Ions with missing SMILES: {ions_with_missing_smiles}")

# Filter out rows containing ions with missing SMILES
rows_with_missing_ions = df[ion_columns].apply(
    lambda row: any(ion in ions_with_missing_smiles for ion in row.index if row[ion] > 0), axis=1
)
cleaned_df = df[~rows_with_missing_ions].copy()  # Keep only rows without missing ions

# Remove ion columns with missing SMILES
valid_ion_columns = [col for col in ion_columns if col not in ions_with_missing_smiles]
cleaned_df = cleaned_df[valid_ion_columns + [col for col in df.columns if col not in ion_columns]]

# Rename ion column names to their corresponding SMILES
cleaned_df.rename(columns=ion_to_smiles, inplace=True)

# List the column names to be dropped
columns_to_drop = ['combined_ions', 'combined_coefficients', 'combined_sites']  # Replace with the actual column names

# Drop the specified columns from the DataFrame
cleaned_df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Save the updated DataFrame to CSV
cleaned_df.to_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Vector embedding\bert_test_data.csv', index=False)

print(f"\nColumns {columns_to_drop} have been removed from the DataFrame.")


# Save the cleaned DataFrame with renamed columns to a new CSV
cleaned_df.to_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Vector embedding\bert_test_data.csv', index=False)

print(f"\nRows containing ions with missing SMILES have been removed.")
print(f"Ion columns with missing SMILES have been removed.")
print(f"Ion column names have been converted to SMILES.")
print(f"Cleaned DataFrame saved as 'bert_test_data.csv'.")


  df = pd.read_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\data_with_layer_type_and_combined.csv')



Ions with missing SMILES: {'ALA', 'DAT', 'C4H9N2H6', 'ODA', 'PyEA', 'Aa', 'BzDA', 'TMA', '4AMP', 'BZA', 'FA', 'C6H13NH3', 'Br-PEA', 'TFEA', 'n-C3H7NH3', '3AMP', 'pF1PEA', 'Anyl', '4FPEA', 'ImEA', 'iso-BA', 'F5PEA', 'C8H17NH3', 'DI', 'mFPEA', '3AMPY', 'APMim', 'Cl-PEA', 'CH3ND3', 'HAD', 'N-EtPy', 'H-PEA', 'FPEAI', 'TEA', 'iPA', 'DA', 'oF1PEA', 'EU-pyP', 'EDA', 'HdA', 'TN', 'F-PEA', 'C4H9NH3', 'PEA', '4AMPY', 'HTAB', 'C6H4NH2', 'NMA', 'CIEA', 'oFPEA', 'OA', 'F3EA', 'BdA', 'MIC3', 'BU', 'CH33S', 'mF1PEA', 'OdA', 'MIC1', 'CHMA', 'A43', '3-PrNH32', 'PyrEA', 'Ace', 'BIM', 'ThMA', 'f-PEA', 'DAP', 'PF6', 'TBA', 'IM', '4ApyH', 'NMABr', 'DPA', 'PPEA', 'PDMA', '5-AVA', 'MTEA', 'PPA', 'HA', 'HDA', 'EA', 'BDA'}

Columns ['combined_ions', 'combined_coefficients', 'combined_sites'] have been removed from the DataFrame.

Rows containing ions with missing SMILES have been removed.
Ion columns with missing SMILES have been removed.
Ion column names have been converted to SMILES.
Cleaned DataFrame saved

2. Combine the perovskite SMILES to obtain a single, weighted SMILES for each row/perovskite layer. Saved as combined_smiles_bert_test_data.csv.

In [47]:
df = pd.read_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Vector embedding\bert_test_data.csv')

import pandas as pd

start_column = "C1=CC=C(C=C1)C(=O)NCCC(=O)O"  # Replace with the name of your start column
end_column = "C(#N)[S-]"  # Replace with the name of your end column

# Extract the ion columns
columns = list(df.columns)
start_idx = columns.index(start_column)
end_idx = columns.index(end_column)
smiles_columns = columns[start_idx:end_idx + 1]  # Include both start and end columns

# # Ensure all coefficients in the SMILES columns are numeric
# df[smiles_columns] = df[smiles_columns].apply(pd.to_numeric, errors='coerce').fillna(0)

# Function to generate weighted SMILES for each row, considering only SMILES columns
def generate_weighted_smiles(row):
    smiles_list = []
    for smiles in smiles_columns:  # Iterate only over SMILES columns
        coefficient = row[smiles]
        if coefficient > 0:  # Only include non-zero coefficients
            weighted_smiles = f"{coefficient}*{smiles}"  # Combine coefficient and SMILES
            smiles_list.append(weighted_smiles)
    return ".".join(smiles_list)  # Concatenate weighted SMILES with '.' as separator

# Apply the function to generate the weighted SMILES column, considering only the SMILES columns
df['weighted_perovskite_smiles'] = df[smiles_columns].apply(generate_weighted_smiles, axis=1)

# Print the updated DataFrame to verify
print(df[['weighted_perovskite_smiles']].head())

# Save the cleaned DataFrame with renamed columns to a new CSV
df.to_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Vector embedding\combined_smiles_bert_data.csv', index=False)

  df = pd.read_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Vector embedding\bert_test_data.csv')


                         weighted_perovskite_smiles
0                          1.0*II.1.0*[Sn].1.0*[Cs]
1  0.9*II.1.0*[Sn].1.0*[Cs].0.0999999999999999*BrBr
2                 0.5*II.1.0*[Sn].1.0*[Cs].0.5*BrBr
3  0.0999999999999999*II.1.0*[Sn].1.0*[Cs].0.9*BrBr
4                        1.0*[Sn].1.0*[Cs].1.0*BrBr


3. Handle the ETL, HTL and backcontact layers - convert the ions/elements in these to SMILES.

In [None]:
## ETL

## HTL

## Backcontact

Drop columns with > 95% missing values.

In [None]:
# drop columns with > 95% missing values

columns_to_drop = ['Substrate_thickness', 'ETL_additives_concentrations', 'HTL_additives_concentrations', 'Backcontact_additives_concentrations'
'Backcontact_additives_compounds', 'Add_lay_front_function', 'Add_lay_front_thickness', 'Add_lay_front_additives_compounds'
'Add_lay_front_additives_concentrations', 'Add_lay_back_funuction', 'Add_lay_back_thickness_list', 'Add_lay_back_additives_concentrations'
'Add_lay_back_additives_compounds']

# Drop the columns from the DataFrame
df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

# Optionally, print to verify the columns were dropped
print(df.head())
