### ChemBERT
1. Initial data cleaning and preprocessing of Perovskite database query.csv
- Filtering for relevant columns - features and key performance metrics

In [10]:
import pandas as pd

df = pd.read_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Data\Perovsite database query.csv')

# relevant columns listed below, any with >85% missing values have been removed

columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture',
    'Substrate_stack_sequence',
    'ETL_stack_sequence',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_composition_short_form', 'Perovskite_composition_long_form', 'Perovskite_composition_leadfree', 'Perovskite_composition_inorganic',
    'Perovskite_additives_compounds', 'Perovskite_thickness', 'Perovskite_band_gap',
    'HTL_stack_sequence', 'HTL_additives_compounds',
    'Backcontact_stack_sequence',
    'Encapsulation', 'Encapsulation_stack_sequence', 'JV_default_PCE',
    'JV_default_Voc', 'JV_default_Jsc', 'JV_default_FF', 'JV_hysteresis_index', 'Stabilised_performance_measured', 'Stability_measured',
    'Stability_average_over_n_number_of_cells', 'Stability_light_source_type', 'Stability_light_intensity', 'Stability_atmosphere', 
    'Stability_time_total_exposure', 'Stability_PCE_end_of_experiment'
]

df = df[columns_to_keep]

df.columns = df.columns.str.strip()

ion_columns = ['Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients']

columns_to_check = ['Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_composition_short_form', 'Perovskite_composition_long_form']

df = df.dropna(subset=columns_to_check)

# Filter for single-layered perovskites only (>98%)

df['Layer Type'] = df.apply(
    lambda row: 'Multi-layered Perovskite' if any('|' in str(row[col]) for col in ion_columns) else 'Single-layered Perovskite',
    axis=1
)
df = df[df['Layer Type'] == 'Single-layered Perovskite']

# Filter out any HTL-free devices

df = df[df['HTL_stack_sequence'] != 'none']

output_file_path = 'initial_cleaned_data.csv'

df.to_csv(output_file_path, index=False)

  df = pd.read_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Data\Perovsite database query.csv')


### SMILES - Data Preprocessing
Feature engineering/Preparing our data for the ChemBERTa model - creating SMILES string representations for each layer of the PSC device. These SMILES strings will be converted into numerical, vector representations/embeddings (by ChemBERTa) - which we will subsequently use to train both regression (for prediction) and generative models.

1. Converting the substrate, perovskite, ETL, HTL and backcontact layers/compositions into SMILES strings.

In [12]:
## version 1

import pandas as pd
import pubchempy as pcp
import re
from collections import defaultdict

# Define the columns of interest
psc_stack_columns = [
    'Perovskite_composition_a_ions', 'Perovskite_composition_b_ions', 
    'Perovskite_composition_c_ions', 'Substrate_stack_sequence', 
    'ETL_stack_sequence', 'HTL_stack_sequence', 'Backcontact_stack_sequence'
]

# Initialize a local cache for SMILES strings
smiles_cache = defaultdict(lambda: None)

# # Function to fetch SMILES strings for a list of unique ions
# def fetch_smiles(ions):
#     for ion in ions:
#         if ion not in smiles_cache:  # Avoid redundant API calls
#             try:
#                 compounds = pcp.get_compounds(ion, 'name')  # Query PubChem
#                 if compounds:
#                     smiles_cache[ion] = compounds[0].canonical_smiles
#                 else:
#                     print(f"No SMILES found for ion: {ion}")
#                     smiles_cache[ion] = None
#             except Exception as e:
#                 print(f"Error fetching SMILES for ion '{ion}': {e}")
#                 smiles_cache[ion] = None

# Function to convert ion strings to combined SMILES
def convert_to_smiles(ion_str):
    if pd.isna(ion_str) or str(ion_str).strip().lower() in ['none', '']:
        return None
    # Split ions by ; or |, and strip whitespace
    ions = [ion.strip() for ion in re.split(r'[;|]', ion_str)]
    # Fetch SMILES for each ion, skipping any that aren't found
    smiles_list = [smiles_cache[ion] for ion in ions if smiles_cache[ion] is not None]
    # Combine SMILES strings with a "." separator
    return '.'.join(smiles_list) if smiles_list else None

# Load your dataset (replace 'your_dataset.csv' with your actual file path)
df = pd.read_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Vector embedding\initial_cleaned_data.csv')

# Step 1: Extract all unique ions from the specified columns
all_ions = set()
for col in psc_stack_columns:
    if col in df.columns:  # Ensure the column exists in the dataset
        df[col].dropna().apply(lambda x: all_ions.update([ion.strip() for ion in re.split(r'[;|]', x)]))

# # Step 2: Fetch SMILES strings for all unique ions
# fetch_smiles(all_ions)

# # Step 3: Create new SMILES columns for each of the psc_stack_columns
# for col in psc_stack_columns:
#     if col in df.columns:  # Ensure the column exists
#         smiles_col = col + '_SMILES'
#         df[smiles_col] = df[col].apply(convert_to_smiles)

# # Save the updated dataframe to a new CSV file
# output_file_path = 'cleaned_data_with_smiles.csv'
# df.to_csv(output_file_path, index=False)

# # Print a preview of the dataframe
# print(df.head())

# print(f"Number of distinct ions: {len(all_ions)}")



# Initialize a list to store ions with no SMILES found
error_ions = []

# Function to fetch SMILES strings for a list of unique ions
def fetch_smiles(ions):
    for ion in ions:
        if ion not in smiles_cache:  # Avoid redundant API calls
            try:
                compounds = pcp.get_compounds(ion, 'name')  # Query PubChem
                if compounds and compounds[0].canonical_smiles:
                    smiles_cache[ion] = compounds[0].canonical_smiles
                else:
                    print(f"No SMILES found for ion: {ion}")
                    smiles_cache[ion] = None
                    error_ions.append(ion)  # Collect ion with no SMILES
            except Exception as e:
                print(f"Error fetching SMILES for ion '{ion}': {e}")
                smiles_cache[ion] = None
                error_ions.append(ion)  # Collect ion with fetch error

# After fetching SMILES, check error ions against the dataset
def check_error_ions_against_dataset(df, error_ions):
    ions_not_found_in_dataset = []
    for ion in error_ions:
        # Check if the ion is present in any column of the dataset
        columns_with_ion = df.columns[df.isin([ion]).any()].tolist()
        if not columns_with_ion:  # If the ion is not found in any column
            ions_not_found_in_dataset.append(ion)

    # Print the results
    print(f"Number of ions in the error message that are NOT present in the dataset: {len(ions_not_found_in_dataset)}")
    print(f"Ions not found in the dataset: {ions_not_found_in_dataset}")

# Example usage:
# unique_ions = df[psc_stack_columns].unique().tolist()  # Replace 'IonColumn' with the column containing ions# unique_ions = pd.unique(df[['IonColumn1', 'IonColumn2']].values.ravel()).tolist()
fetch_smiles(all_ions)  # Fetch SMILES and collect error ions
check_error_ions_against_dataset(df, error_ions)  # Check error ions against the dataset



  df = pd.read_csv(r'C:\Users\c\OneDrive\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Vector embedding\initial_cleaned_data.csv')


No SMILES found for ion: 2PDI-4S
No SMILES found for ion: AZO-np
No SMILES found for ion: CuGaO2-np
No SMILES found for ion: IDF-SFXPh
No SMILES found for ion: PDI
No SMILES found for ion: diPDI
No SMILES found for ion: WPF‐6‐oxy‐F
No SMILES found for ion: Poly(2-ethyl-2-oxazoline)
No SMILES found for ion: C202
No SMILES found for ion: TiO2-nanoplatelets
No SMILES found for ion: (TEA)
No SMILES found for ion: SGT-409
No SMILES found for ion: CSCNT
No SMILES found for ion: RE-ZnBu4Pc
No SMILES found for ion: HBZ-70
No SMILES found for ion: VB-DAAF
No SMILES found for ion: pi-PFE4
No SMILES found for ion: Poly(ethylene oxide)
No SMILES found for ion: 1d @ triphenylamine modified azobenzene dyes
No SMILES found for ion: TP1
No SMILES found for ion: BL51
No SMILES found for ion: PFN-OX
No SMILES found for ion: H2-Chl
No SMILES found for ion: Ta2O5
No SMILES found for ion: 2FBTA-2
No SMILES found for ion: Willow glas
No SMILES found for ion: MgF2
No SMILES found for ion: s-PANI:PSS
No SMILE

In [13]:
target_ion = 'VB'
columns_with_ion = df.columns[df.isin([target_ion]).any()].tolist()
print(f"The ion '{target_ion}' is found in: {columns_with_ion}")



The ion 'VB' is found in: []


In [2]:
## version 2

import pandas as pd
import pubchempy as pcp
import json
import time
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

# Define columns of interest
psc_stack_columns = [
    'Perovskite_composition_a_ions', 'Perovskite_composition_b_ions', 
    'Perovskite_composition_c_ions', 'Substrate_stack_sequence', 
    'ETL_stack_sequence', 'HTL_stack_sequence', 'Backcontact_stack_sequence'
]

# Initialize cache
smiles_cache = {}

# Try to load the cache from a file if it exists
cache_file = 'smiles_cache.json'
try:
    with open(cache_file, 'r') as f:
        smiles_cache = json.load(f)
        print("Loaded smiles cache from file.")
except FileNotFoundError:
    print("Cache file not found. Fetching SMILES from PubChem.")

# Function to fetch SMILES from PubChem with retries
def fetch_smiles_from_pubchem(ion, retries=3, delay=2):
    """
    Fetch SMILES for a given ion from PubChem, with retries in case of server issues.
    """
    try:
        compound = pcp.get_compounds(ion, 'name')
        if compound and compound[0].isomeric_smiles:
            return compound[0].isomeric_smiles
        else:
            return None  # No SMILES found
    except Exception as e:
        if retries > 0:
            print(f"Error fetching SMILES for ion '{ion}': {e}. Retrying...")
            time.sleep(delay)  # Wait before retrying
            return fetch_smiles_from_pubchem(ion, retries-1, delay)
        else:
            print(f"Error fetching SMILES for ion '{ion}': {e}. Skipping.")
            return None

# Function to fetch SMILES in batch for a list of ions
def batch_fetch_smiles(ions):
    """
    Populate the smiles_cache with SMILES for each ion using multiple threads.
    """
    with ThreadPoolExecutor(max_workers=8) as executor:
        results = list(executor.map(fetch_smiles_from_pubchem, ions))
        for ion, smiles in zip(ions, results):
            if smiles:
                smiles_cache[ion] = smiles

# Function to convert Ion column to SMILES using the cache
def convert_to_smiles(ion_str):
    """
    Converts a string of ions into their SMILES representation.
    Handles multiple ions separated by ':' or '|'.
    """
    if pd.isna(ion_str):  # Handle missing values
        return None
    
    # Normalize delimiters to ':' and split the ions
    normalized_str = ion_str.replace('|', ':')
    ions = normalized_str.split(':')
    
    # Use the cache to fetch SMILES
    smiles_list = [smiles_cache.get(ion.strip()) for ion in ions if smiles_cache.get(ion.strip()) is not None]
    return '.'.join(smiles_list) if smiles_list else None

# Load dataset (replace with your actual file path)
df = pd.read_csv('initial_cleaned_data.csv')

# Step 1: Collect all unique ions from the dataset columns
all_ions = set()
for col in psc_stack_columns:
    if col in df.columns:  # Ensure the column exists in the dataset
        df[col].dropna().apply(lambda x: all_ions.update([ion.strip() for ion in x.replace('|', ':').split(':')]))

# Step 2: Fetch SMILES for all unique ions
batch_fetch_smiles(all_ions)

# Step 3: Apply the conversion to each specified column
for col in psc_stack_columns:
    if col in df.columns:  # Ensure the column exists
        smiles_col = col + '_SMILES'
        df[smiles_col] = df[col].apply(convert_to_smiles)

# Save the updated dataframe to a new CSV file
output_file_path = 'cleaned_data_with_smiles.csv'
df.to_csv(output_file_path, index=False)

# # Save the smiles cache to a file for future use
# with open(cache_file, 'w') as f:
#     json.dump(smiles_cache, f)
#     print("Smiles cache saved to file.")


Loaded smiles cache from file.


  df = pd.read_csv('initial_cleaned_data.csv')


Error fetching SMILES for ion 'PDI': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion 'Ti': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion 'WPF‐6‐oxy‐F': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion 'BA; FA': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion 'Nb2O5': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion '2FBTA-2': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion 'Ta2O5': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion 'H2-Chl': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion '3,8,13-tris[2,2-bis(4-methoxyphenyl)ethenyl]-5,10,15-triethyl-10,15-dihydro-5H-indolo-[3,2-a': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion 'M109': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion 'PDMS': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for ion 'Fullerene @ F3': 'PUGREST.ServerBusy'. Retrying...
Error fetching SMILES for

KeyboardInterrupt: 

In [1]:
## check 

import pubchempy as pcp

# Define the ions you want to check
ions = ['Na+', 'Cl-']

# Function to fetch SMILES for a given ion
def fetch_smiles_for_ions(ions):
    for ion in ions:
        try:
            # Query PubChem for the ion by name
            compound = pcp.get_compounds(ion, 'name')
            if compound:
                print(f"SMILES for {ion}: {compound[0].canonical_smiles}")
            else:
                print(f"No SMILES found for ion: {ion}")
        except Exception as e:
            print(f"Error fetching SMILES for {ion}: {e}")

# Test the function with two known ions
fetch_smiles_for_ions(ions)


SMILES for Na+: [Na+]
SMILES for Cl-: [Cl-]


2. Combining the 3 perovskite ion SMILES columns into one 

In [22]:
import pandas as pd
import pubchempy as pcp
import json
from concurrent.futures import ThreadPoolExecutor
import time
import pandas as pd
import pubchempy as pcp
import json
from concurrent.futures import ThreadPoolExecutor

df = pd.read_csv(r'cleaned_data_with_smiles.csv')

# Initialize an empty cache dictionary or load from a cache file
smiles_cache = {}

# Try to load the cache from a file if it exists
cache_file = 'smiles_cache.json'
try:
    with open(cache_file, 'r') as f:
        smiles_cache = json.load(f)
        print("Loaded smiles cache from file.")
except FileNotFoundError:
    print("Cache file not found. Fetching SMILES from PubChem.")

# Function to fetch SMILES from PubChem with retries
def fetch_smiles_from_pubchem(ion, retries=3, delay=2):
    """
    Fetch SMILES for a given ion from PubChem, with retries in case of server issues.
    """
    try:
        compound = pcp.get_compounds(ion, 'name')
        if compound and compound[0].isomeric_smiles:
            return compound[0].isomeric_smiles
        else:
            return None  # No SMILES found
    except Exception as e:
        if retries > 0:
            print(f"Error fetching SMILES for ion '{ion}': {e}. Retrying...")
            time.sleep(delay)  # Wait before retrying
            return fetch_smiles_from_pubchem(ion, retries-1, delay)
        else:
            print(f"Error fetching SMILES for ion '{ion}': {e}. Skipping.")
            return None

# Function to fetch SMILES in batch for a list of ions
def batch_fetch_smiles(ions):
    """
    Populate the smiles_cache with SMILES for each ion using multiple threads.
    """
    with ThreadPoolExecutor(max_workers=8) as executor:
        results = list(executor.map(fetch_smiles_from_pubchem, ions))
        for ion, smiles in zip(ions, results):
            if smiles:
                smiles_cache[ion] = smiles

# Function to convert Ion column to SMILES using the cache
def convert_to_smiles(ion_str):
    """
    Converts a string of ions into their SMILES representation.
    Handles multiple ions separated by ':' or '|'.
    """
    if pd.isna(ion_str):  # Handle missing values
        return None
    
    # Normalize delimiters to ':' and split the ions
    normalized_str = ion_str.replace('|', ':')
    ions = normalized_str.split(':')
    
    # Use the cache to fetch SMILES
    smiles_list = [smiles_cache.get(ion.strip()) for ion in ions if smiles_cache.get(ion.strip()) is not None]
    return '.'.join(smiles_list)

# Specify the columns to process (e.g., ETL, HTL, etc.)
other_psc_layers = ['Substrate_stack_sequence', 'Backcontact_stack_sequence'] # 'ETL_stack_sequence', 'HTL_stack_sequence']

# Collect all unique ions from the dataset
all_ions = []
for col in other_psc_layers:
    df[col].dropna().apply(lambda x: all_ions.extend(x.replace('|', ':').split(':')))  # Normalize and split

# Remove duplicates
unique_ions = list(set(all_ions))

# Fetch SMILES for all unique ions and populate the cache
batch_fetch_smiles(unique_ions)

# Apply the conversion to each column
for col in other_psc_layers:
    df[col + '_SMILES'] = df[col].apply(convert_to_smiles)

# Save the updated DataFrame to a CSV file
output_file_path = 'cleaned_data_with_smiles.csv'
df.to_csv(output_file_path, index=False)

# Save the smiles_cache to a file for future use
with open(cache_file, 'w') as f:
    json.dump(smiles_cache, f)

print(f"Saved the cleaned data with SMILES to {output_file_path}.")

  df = pd.read_csv(r'cleaned_data_with_smiles.csv')


Loaded smiles cache from file.
Error fetching SMILES for ion 'PDMS ': <urlopen error [Errno 11001] getaddrinfo failed>. Retrying...
Error fetching SMILES for ion ' CuNW ': <urlopen error [Errno 11001] getaddrinfo failed>. Retrying...
Error fetching SMILES for ion 'Nanopaper ': <urlopen error [Errno 11001] getaddrinfo failed>. Retrying...
Error fetching SMILES for ion ' Ag-mesh': <urlopen error [Errno 11001] getaddrinfo failed>. Retrying...
Error fetching SMILES for ion 'PSSl ': <urlopen error [Errno 11001] getaddrinfo failed>. Retrying...
Error fetching SMILES for ion 'SnO2-c ': <urlopen error [Errno 11001] getaddrinfo failed>. Retrying...
Error fetching SMILES for ion ' TiO2': <urlopen error [Errno 11001] getaddrinfo failed>. Retrying...
Error fetching SMILES for ion ' IZO ': <urlopen error [Errno 11001] getaddrinfo failed>. Retrying...
Error fetching SMILES for ion ' Ag-mesh': <urlopen error [Errno 11001] getaddrinfo failed>. Retrying...Error fetching SMILES for ion 'PDMS ': <urlopen

In [5]:
import pandas as pd
import pubchempy as pcp
import json
import time
import random
from concurrent.futures import ThreadPoolExecutor

# Initialize an empty cache dictionary or load from a cache file
smiles_cache = {}
cache_file = 'smiles_cache.json'

# Load the cache from a file if it exists
try:
    with open(cache_file, 'r') as f:
        smiles_cache = json.load(f)
        print("Loaded SMILES cache from file.")
except FileNotFoundError:
    print("Cache file not found. Starting with an empty cache.")

# Function to fetch SMILES from PubChem with retries and exponential backoff
def fetch_smiles_from_pubchem(ion, retries=5, delay=2):
    """
    Fetch SMILES for a given ion from PubChem, with retries and exponential backoff.
    """
    for attempt in range(retries):
        try:
            compound = pcp.get_compounds(ion, 'name')
            if compound and compound[0].isomeric_smiles:
                return compound[0].isomeric_smiles
            else:
                return None  # No SMILES found
        except Exception as e:
            if attempt < retries - 1:  # Retry if attempts remain
                wait_time = delay * (2 ** attempt) + random.uniform(0, 1)  # Add jitter
                print(f"Error fetching SMILES for ion '{ion}': {e}. Retrying in {wait_time:.2f} seconds...")
                time.sleep(wait_time)
            else:
                print(f"Error fetching SMILES for ion '{ion}': {e}. No more retries.")
                return None

# Function to fetch SMILES in batch
def fetch_smiles_batch(ions, retries=5):
    """
    Fetch SMILES for a batch of ions.
    """
    results = {}
    for ion in ions:
        if ion not in smiles_cache:
            smiles = fetch_smiles_from_pubchem(ion, retries=retries)
            if smiles:
                results[ion] = smiles
    return results

# Batch processing with limited concurrency
def batch_fetch_smiles(ions, batch_size=10):
    """
    Fetch SMILES for ions in batches, limiting the number of requests.
    """
    for i in range(0, len(ions), batch_size):
        batch = ions[i:i + batch_size]
        batch_results = fetch_smiles_batch(batch)
        smiles_cache.update(batch_results)

# Function to convert Ion column to SMILES using the cache
def convert_to_smiles(ion_str):
    """
    Converts a string of ions into their SMILES representation.
    Handles multiple ions separated by ':' or '|'.
    """
    if pd.isna(ion_str):  # Handle missing values
        return None
    
    # Normalize delimiters to ':' and split the ions
    normalized_str = ion_str.replace('|', ':')
    ions = normalized_str.split(':')
    
    # Use the cache to fetch SMILES
    smiles_list = [smiles_cache.get(ion.strip()) for ion in ions if smiles_cache.get(ion.strip()) is not None]
    return '.'.join(smiles_list)

# Specify the columns to process
other_psc_layers = ['Substrate_stack_sequence', 'ETL_stack_sequence', 'HTL_stack_sequence', 'Backcontact_stack_sequence']  # Update column names if needed

# Example DataFrame for demonstration (replace this with your actual DataFrame)
# Ensure the column names in your DataFrame match `other_psc_layers`
df = pd.read_csv(r'cleaned_data_with_smiles.csv')

# Collect all unique ions from the dataset
all_ions = []
for col in other_psc_layers:
    if col in df.columns:  # Check if the column exists in the DataFrame
        df[col].dropna().apply(lambda x: all_ions.extend(x.replace('|', ':').split(':')))  # Normalize and split

# Remove duplicates
unique_ions = list(set(all_ions))

# Fetch SMILES for all unique ions in batches
batch_fetch_smiles(unique_ions)

# Apply the conversion to each column
for col in other_psc_layers:
    if col in df.columns:  # Check if the column exists
        df[col + '_SMILES'] = df[col].apply(convert_to_smiles)

# Save the updated DataFrame to a CSV file
output_file_path = 'cleaned_data_with_smiles.csv'
df.to_csv(output_file_path, index=False)

# Save the smiles_cache to a file for future use
with open(cache_file, 'w') as f:
    json.dump(smiles_cache, f)

print(f"Saved the cleaned data with SMILES to {output_file_path}.")

Loaded SMILES cache from file.


  df = pd.read_csv(r'cleaned_data_with_smiles.csv')


KeyboardInterrupt: 

### Generating vector embeddings from SMILES strings

In [24]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Load ChemBERTa model and tokenizer from Hugging Face
model_name = "seyonec/ChemBERTa-zinc-base-v1"  # Change model if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to combine individual SMILES in a column into a single string
def combine_column_smiles(smiles_str):
    if pd.isna(smiles_str):  # Handle missing values
        return None
    # Split by '.' if there are multiple SMILES in the same cell
    smiles_list = smiles_str.split('.')
    # Combine the SMILES in the cell (you can join them by '.' or other delimiters)
    return '.'.join([smiles for smiles in smiles_list if smiles.strip()])

def combine_row_smiles(row):
    # Access the SMILES for each composition by column name
    combined_smiles_1 = combine_column_smiles(row['Perovskite_composition_a_ions_SMILES'])
    combined_smiles_2 = combine_column_smiles(row['Perovskite_composition_b_ions_SMILES'])
    combined_smiles_3 = combine_column_smiles(row['Perovskite_composition_c_ions_SMILES'])
    
    # Combine the SMILES from all columns into one single SMILES string
    return '.'.join(filter(None, [combined_smiles_1, combined_smiles_2, combined_smiles_3]))

# Assuming your DataFrame has the proper column names
df['Combined_SMILES'] = df.apply(combine_row_smiles, axis=1)

# Check the resulting DataFrame
print(df[['Combined_SMILES']])

# # Function to convert a SMILES string to a vector embedding
# def smiles_to_embedding(smiles, model, tokenizer):
#     # Tokenize the SMILES string
#     inputs = tokenizer(smiles, return_tensors="pt", padding=True, truncation=True)
    
#     # Pass the tokens through the model
#     with torch.no_grad():
#         outputs = model(**inputs)
    
#     # Get the embeddings (last hidden state)
#     embeddings = outputs.last_hidden_state
    
#     # You can either use the embedding of the [CLS] token or average all token embeddings
#     cls_embedding = embeddings[:, 0, :]  # [CLS] token embedding
#     avg_embedding = embeddings.mean(dim=1)  # Average embedding of all tokens
    
#     return cls_embedding, avg_embedding

# # Generate embeddings for the combined SMILES strings
# embeddings_cls = []
# embeddings_avg = []

# for smiles in df['Combined_SMILES']:
#     cls_embedding, avg_embedding = smiles_to_embedding(smiles, model, tokenizer)
#     embeddings_cls.append(cls_embedding)
#     embeddings_avg.append(avg_embedding)

# # Convert embeddings into numpy arrays for easier manipulation
# embeddings_cls = torch.cat(embeddings_cls, dim=0).numpy()
# embeddings_avg = torch.cat(embeddings_avg, dim=0).numpy()

# print("CLS Embeddings:", embeddings_cls)
# print("Average Embeddings:", embeddings_avg)

# # Optional: You can store the embeddings back in the DataFrame if you want to keep them for future use
# df['Embedding_CLS'] = embeddings_cls.tolist()
# df['Embedding_Avg'] = embeddings_avg.tolist()

# # Save to a new CSV file (optional)
# df.to_csv('materials_with_embeddings.csv', index=False)


      Combined_SMILES
0                    
1                    
2                    
3                    
4                    
...               ...
39965                
39966                
39967                
39968                
39969                

[39970 rows x 1 columns]
