Perovskite modelling program - returning PSC stack from an input row (from perovskite database)

### Data Preprocessing 
1. Initital cleaning of ions and coefficients
2. Include 'combined' ions, coefficients and sites columns for vector embedding

In [None]:

import pandas as pd
import re

# Load the CSV file
file_path = r"C:\Users\c\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Data\Perovsite database query.csv"
data = pd.read_csv(file_path)

# Define the columns to keep
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture',
    'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Filter columns to keep only those that exist in the dataset
existing_columns = [col for col in columns_to_keep if col in data.columns]
data = data[existing_columns]

data.columns = data.columns.str.strip()

# # Add an index column
# data.reset_index(inplace=True)
# data.rename(columns={'index': 'Index'}, inplace=True)

# # Save the filtered dataset to a new CSV file
# output_path = 'filtered_DatabaseMaterials_with_index.csv'
# data.to_csv(output_path, index=False)
# print("Filtered dataset with index saved as", output_path)

# Create a dataframe for ions and their coefficients
ion_columns = [
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients'
]

ion_data = data[ion_columns]

# # Save the unchanged ion data
# output_path = 'ion_data_unchanged.csv'
# ion_data.to_csv(output_path, index=False)
# print("Unchanged ion data saved as", output_path)

# Function to clean molecule names
def clean_molecule_name(name):
    name = re.sub(r'[^a-zA-Z0-9\s\-()]+', ' ', name.strip())
    name = re.sub(r'\s+', ' ', name).strip()
    elements = [element for element in name.split() if element and not element.replace('.', '', 1).isdigit()]
    return elements

# Function to clean and convert coefficients to floats
def clean_and_convert_coefficient(coefficient):
    try:
        cleaned_coefficient = re.sub(r'[^0-9.eE-]', '', coefficient.replace(',', '').strip())
        return float(cleaned_coefficient) if cleaned_coefficient else 0.0
    except ValueError:
        return 0.0

# Function to normalize coefficients
def normalize_coefficients(cell):
    if pd.notna(cell):
        try:
            coefficients = [float(x.strip()) for x in re.split(r'[;|]', cell) if x.strip()]
            total_sum = sum(coefficients)
            return ';'.join(f"{val / total_sum:.3f}" for val in coefficients) if total_sum > 0 else cell
        except ValueError:
            return cell
    return cell

# Normalize coefficients in each column
coefficient_columns = [
    'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions_coefficients', 
    'Perovskite_composition_c_ions_coefficients'
]

for col in coefficient_columns:
    data[col] = data[col].apply(normalize_coefficients)

# Create a set of unique molecules and add new columns
unique_molecules = set()
for column_group in ['a', 'b', 'c']:
    ions_column = f'Perovskite_composition_{column_group}_ions'
    coefficients_column = f'Perovskite_composition_{column_group}_ions_coefficients'
    for _, row in data.iterrows():
        ions, _ = clean_molecule_name(str(row[ions_column])), [clean_and_convert_coefficient(c) for c in str(row[coefficients_column]).split(';')]
        unique_molecules.update(ions)

# Create columns for each unique molecule and calculate proportions
for molecule in unique_molecules:
    data[molecule] = 0.0

for index, row in data[coefficient_columns].iterrows():
    for column_group in ['a', 'b', 'c']:
        ions_column = f'Perovskite_composition_{column_group}_ions'
        coefficients_column = f'Perovskite_composition_{column_group}_ions_coefficients'
        ions = clean_molecule_name(str(row.get(ions_column, "")))
        coefficients = [clean_and_convert_coefficient(c) for c in str(row[coefficients_column]).split(';')]
        total_coeff = sum(coefficients) if sum(coefficients) != 0 else 1
        
        for ion, coeff in zip(ions, coefficients):
            data.at[index, ion] += coeff / total_coeff


# Create a new column 'Layer_Type' to indicate if the row is multilayered or single-layered

data['Layer Type'] = data.apply(
    lambda row: 'Multi-layered Perovskite' if any('|' in str(row[col]) for col in ion_columns) else 'Single-layered Perovskite',
    axis=1
)

# Add/append columns for 'combined ions' and 'combined coefficients' - vector embedding

data['combined_ions'] = data.apply(
    lambda row: f"{row.get('Perovskite_composition_a_ions', '')},{row.get('Perovskite_composition_b_ions', '')},{row.get('Perovskite_composition_c_ions', '')}", 
    axis=1
)

data['combined_coefficients'] = data.apply(
    lambda row: f"{row.get('Perovskite_composition_a_ions_coefficients', '')},{row.get('Perovskite_composition_b_ions_coefficients', '')},{row.get('Perovskite_composition_c_ions_coefficients', '')}", 
    axis=1
)

# ### make sure all combined ions and combined coefficients are lists

import re

# Function to convert a string with mixed delimiters to a list
def convert_to_list(entry):
    if isinstance(entry, str):
        # Replace semicolons and pipes with commas for uniformity
        entry = re.sub(r'[;|]', ',', entry)
        # Split the string by commas and strip spaces around each item
        return [item.strip() for item in entry.split(',') if item.strip()]
    elif isinstance(entry, list):
        return entry  # Already a list, no action needed
    else:
        return []  # Handle missing or invalid entries

# Function to convert string entries to float and handle non-numeric values
def safe_convert_to_float(entry):
    try:
        return float(entry)  # Attempt to convert to float
    except ValueError:
        return None  # If conversion fails, return None (or handle as needed)


### add an indication of perovskite site

# Add the 'combined_sites' column
def generate_combined_sites(row):
    # Split combined ions and coefficients into lists
    ions = row['combined_ions']
    coefficients = row['combined_coefficients']
    
    # Assign sites ('a', 'b', 'c') based on the origin of each ion/coefficient
    sites = []
    site_labels = ['a', 'b', 'c']
    for site, ions_col, coeff_col in zip(site_labels, 
                                         ['Perovskite_composition_a_ions', 'Perovskite_composition_b_ions', 'Perovskite_composition_c_ions'], 
                                         ['Perovskite_composition_a_ions_coefficients', 'Perovskite_composition_b_ions_coefficients', 'Perovskite_composition_c_ions_coefficients']):
        # Count the number of ions and coefficients from this site
        num_ions = len(clean_molecule_name(str(row.get(ions_col, ""))))
        num_coefficients = len(str(row.get(coeff_col, "")).split(';'))
        
        # Append the site label for each ion/coefficient from this site
        sites.extend([site] * max(num_ions, num_coefficients))
    
    return sites

# Apply the function to generate the 'combined_sites' column
data['combined_sites'] = data.apply(generate_combined_sites, axis=1)

def clean_coefficients(coefficients):
    """
    Cleans the coefficients by ensuring all values are numeric.
    Invalid or non-numeric values are replaced with 0.0.
    If the value is already a float, it is left unchanged.
    """
    cleaned = []
    for c in coefficients:
        if isinstance(c, float):  # If already a float, keep it as is
            cleaned.append(c)
        elif isinstance(c, str) and c.replace('.', '', 1).isdigit():  # If a valid string representation of a number
            cleaned.append(float(c))
        else:  # For invalid entries
            cleaned.append(0.0)
    return cleaned

def normalize_coefficients_within_cell(row):
    """
    Normalizes the coefficients for each site ('a', 'b', 'c') within a cell.
    Ensures that the sum of coefficients for each site equals 1.
    """
    # Extract ions, coefficients, and sites for the row
    ions = row['combined_ions']
    coefficients = row['combined_coefficients']
    sites = row['combined_sites']
    
    # Initialize lists for each site
    site_a_coeffs = []
    site_b_coeffs = []
    site_c_coeffs = []
    
    # Separate the coefficients by their sites
    for coeff, site in zip(coefficients, sites):
        try:
            coeff = float(coeff)  # Ensure coefficients are numeric
        except ValueError:
            coeff = 0.0  # Default to 0.0 if invalid
        if site == 'a':
            site_a_coeffs.append(coeff)
        elif site == 'b':
            site_b_coeffs.append(coeff)
        elif site == 'c':
            site_c_coeffs.append(coeff)
    
    # Normalize the coefficients for each site if their sum is not zero
    def normalize(site_coeffs):
        total = sum(site_coeffs)
        return [coeff / total for coeff in site_coeffs] if total > 0 else site_coeffs
    
    site_a_coeffs = normalize(site_a_coeffs)
    site_b_coeffs = normalize(site_b_coeffs)
    site_c_coeffs = normalize(site_c_coeffs)
    
    # Combine all coefficients back into a single list
    normalized_coeffs = site_a_coeffs + site_b_coeffs + site_c_coeffs

    return normalized_coeffs



# Apply the function to both columns
data['combined_ions'] = data['combined_ions'].apply(convert_to_list)
data['combined_coefficients'] = data['combined_coefficients'].apply(
    lambda x: [safe_convert_to_float(item) for item in convert_to_list(x)]  # Convert to float for coefficients, handle errors
)

# Step 1: Clean the coefficients column
data['combined_coefficients'] = data['combined_coefficients'].apply(clean_coefficients)


# Step 2: Normalize coefficients within each cell
data['combined_coefficients'] = data.apply(normalize_coefficients_within_cell, axis=1)
data['combined_coefficients'] = data['combined_coefficients'].apply(
    lambda x: [float(coeff) if str(coeff).replace('.', '', 1).isdigit() else 0.0 for coeff in x] if isinstance(x, list) else []
)

# Verify the transformation
print(data[['combined_ions', 'combined_coefficients', 'combined_sites']].head())


# Drop the original ion columns
data = data.drop(columns=ion_columns, errors='ignore')

# Save the modified DataFrame with the 'Layer Type' and combined columns
output_file_path = 'data_with_layer_type_and_combined.csv'
data.to_csv(output_file_path, index=False)
print("CSV file with layer type information modified and saved as:", output_file_path)


Check 'combined' columns:

In [None]:
def check_site_coefficients(data):
    """
    Checks if the coefficients for a, b, and c sites in each cell add up to 1.
    Prints the result for each cell and returns a DataFrame indicating validity.
    """
    # Initialize a results list
    results = []

    for idx, row in data.iterrows():
        # Get sites and coefficients for the current row
        sites = row['combined_sites']
        coefficients = row['combined_coefficients']

        # Group coefficients by site
        site_a_coeffs = [coeff for coeff, site in zip(coefficients, sites) if site == 'a']
        site_b_coeffs = [coeff for coeff, site in zip(coefficients, sites) if site == 'b']
        site_c_coeffs = [coeff for coeff, site in zip(coefficients, sites) if site == 'c']

        # Calculate sums
        sum_a = sum(site_a_coeffs)
        sum_b = sum(site_b_coeffs)
        sum_c = sum(site_c_coeffs)

        # Check if sums are close to 1
        valid_a = abs(sum_a - 1) < 1e-6  # Allow for floating-point tolerance
        valid_b = abs(sum_b - 1) < 1e-6
        valid_c = abs(sum_c - 1) < 1e-6

        # Append result for this row
        results.append({
            'row_index': idx,
            'valid_a': valid_a,
            'valid_b': valid_b,
            'valid_c': valid_c,
            'sum_a': sum_a,
            'sum_b': sum_b,
            'sum_c': sum_c,
        })

    # Convert results to a DataFrame for easy inspection
    results_df = pd.DataFrame(results)
    return results_df

# Run the check
site_check_results = check_site_coefficients(data)

# Inspect invalid rows
invalid_rows = site_check_results[~(site_check_results['valid_a'] & site_check_results['valid_b'] & site_check_results['valid_c'])]
print("Invalid rows where site coefficients do not add to 1:")
print(invalid_rows)


Return all information on a specific perovskite

In [13]:
# for a given row / perovskite, return all non-zero element and coefficient entries

# Function to get non-zero cells for a specific row
def get_non_zero_cells(row_number):
    # Check if the row_number is valid
    if row_number < 0 or row_number >= len(data):
        return "Invalid row number"
    
    # Get the specified row
    row = data.iloc[row_number]

    # Find non-zero (non-empty) entries and their column names
    non_zero_cells = {col: value for col, value in row.items() if value != 0 and value != ''}

    return non_zero_cells

# Example usage
row_number = 1234 # Replace with the row number you want to check
result = get_non_zero_cells(row_number)
print(f"Non-zero entries in row {row_number}: {result}")

Non-zero entries in row 1234: {'Cell_stack_sequence': 'SLG | FTO | TiO2-c | TiO2-mp | Perovskite | Spiro-MeOTAD | Au', 'Cell_architecture': 'nip', 'Substrate_stack_sequence': 'SLG | FTO', 'Substrate_thickness': nan, 'ETL_stack_sequence': 'TiO2-c | TiO2-mp', 'ETL_thickness': '20.0 | 150.0', 'ETL_additives_compounds': 'Li-TFSI; Mg(TFSI)2', 'ETL_additives_concentrations': nan, 'Perovskite_additives_compounds': 'KI', 'Perovskite_additives_concentrations': nan, 'Perovskite_thickness': 500.0, 'HTL_stack_sequence': 'Spiro-MeOTAD', 'HTL_thickness_list': nan, 'HTL_additives_compounds': 'Co; Li-TFSI; TBP', 'HTL_additives_concentrations': nan, 'Backcontact_stack_sequence': 'Au', 'Backcontact_additives_compounds': nan, 'Backcontact_additives_concentrations': nan, 'Add_lay_front_function': nan, 'Add_lay_front_stack_sequence': 'Unknown', 'Add_lay_front_thickness_list': nan, 'Add_lay_front_additives_compounds': nan, 'Add_lay_front_additives_concentrations': nan, 'Add_lay_back_function': nan, 'Add_lay

### Feature Importance

Vector embeddings - ions, coefficients and sites

In [None]:
# combined ions, coefficients and site column entries must all be lists

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Map ions and sites to indices
ion_vocab = {ion: i for i, ion in enumerate(set(','.join(data['combined_ions']).split(',')))}
site_vocab = {'a': 0, 'b': 1, 'c': 2}

# Prepare data tensors
class PerovskiteDataset(Dataset):
    def __init__(self, data):
        self.ions = [torch.tensor([ion_vocab[ion] for ion in row.split(',')]) for row in data['combined_ions']]
        self.coefficients = [torch.tensor(list(map(float, row.split(',')))) for row in data['combined_coefficients']]
        self.sites = [torch.tensor([site_vocab[site] for site in row]) for row in data['combined_sites']]
    
    def __len__(self):
        return len(self.ions)

    def __getitem__(self, idx):
        return self.ions[idx], self.coefficients[idx], self.sites[idx]

dataset = PerovskiteDataset(data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define embeddings and a simple network
ion_embedding = nn.Embedding(len(ion_vocab), 50)
site_embedding = nn.Embedding(len(site_vocab), 10)

class PerovskiteModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.ion_embedding = ion_embedding
        self.site_embedding = site_embedding
        self.fc = nn.Sequential(
            nn.Linear(50 + 10 + 1, 64),
            nn.ReLU(),
            nn.Linear(64, 3)  # Example output size
        )
    
    def forward(self, ions, coefficients, sites):
        ion_embeds = self.ion_embedding(ions).mean(dim=1)  # Mean embedding
        site_embeds = self.site_embedding(sites).mean(dim=1)
        coeff_tensor = coefficients.mean(dim=1).unsqueeze(1)  # Aggregate coefficients
        x = torch.cat((ion_embeds, site_embeds, coeff_tensor), dim=1)
        return self.fc(x)

model = PerovskiteModel()

data['row_vector'] = data.apply(
    lambda row: np.concatenate([row['ions_embedding'], row['coefficients_vector'], row['sites_embedding']]),
    axis=1
)

# Example: Dimensionality reduction for visualization
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Stack all row vectors into a matrix
embedding_matrix = np.vstack(data['row_vector'])

# Apply PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(embedding_matrix)

# Scatter plot
plt.scatter(reduced_vectors[:, 0], reduced_vectors[:, 1])
plt.title("Row Vectors Visualization")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()


ValueError: could not convert string to float: '0.100;0.900'

Hussain paper replication - formatting

In [7]:
# Filter the unique molecules set to keep only the specified molecules
molecules_to_keep = {'I', 'Br', 'Pb', 'Sn', 'Cl', 'FA', 'MA', 'Cs'}
filtered_unique_molecules = unique_molecules.intersection(molecules_to_keep)

# Update the output to show the filtered unique molecules
print("Filtered Unique molecules identified:", filtered_unique_molecules)

# Drop all columns in data_cleaned that are not in molecules_to_keep
columns_to_drop = [col for col in ion_data.columns if col not in molecules_to_keep and col not in columns_to_keep]
ion_data.drop(columns=columns_to_drop, inplace=True)

# Update the output to show the remaining columns
print("Remaining columns after dropping:", ion_data.columns)

# Save the modified dataframe to a new CSV
output_file_path = 'hussain_molecules_file.csv'
ion_data.to_csv(output_file_path, index=False)

print("CSV file modified and saved as:", output_file_path)

Filtered Unique molecules identified: {'Cl', 'Pb', 'MA', 'Sn', 'FA', 'I', 'Cs', 'Br'}
Remaining columns after dropping: Index(['Sn', 'Cs', 'MA', 'I', 'Cl', 'Br', 'Pb', 'FA'], dtype='object')
CSV file modified and saved as: hussain_molecules_file.csv


Re-format data into a dataframe we can easily vectorise to enable embedding:

In [4]:
## for each row - the code should be able to return values split into 3 different columns: a list of elements in the crystal, a list of their respective coefficients and sites
# eg. for row 123: [MA, I, Pb, Br], [1,1,0.5,0.5], [a,b,c,c]

# use original dataset

df = pd.read_csv(r"C:\Users\c\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Data\Perovsite database query.csv")

# replace all ; and | in ion columns data with ,
# Replace all occurrences of ';' and '|' with ',' in the ion columns, keeping original if not found

df[ion_columns] = df[ion_columns].map(
    lambda x: str(x).replace(';', ',').replace('|', ',') if pd.notna(x) else x
)

# Continue with other processing or save the DataFrame as needed

# print(df[ion_columns].head())


### create sites column

# Function to extract the site from the column name
def get_site(column_name):
    # Extract the site part from the column name
    site = column_name.split('_')[3]  # 'a', 'b', or 'c' will be at index 3
    return site

# # Create a new 'site' column for ions and coefficients
# # Loop through ion columns and assign the site based on the column name
# ion_columns = [col for col in df.columns if 'Perovskite_composition' in col and 'ions' in col]
# coefficient_columns = [col for col in df.columns if 'Perovskite_composition' in col and 'coefficients' in col]


# # For each ion column, create a new site column
# site_mapping = {}

# for ion_col, coeff_col in zip(ion_columns, coefficient_columns):
#     site = get_site(ion_col)  # Extract site from ion column
#     site_mapping[ion_col] = site
#     site_mapping[coeff_col] = site

# # Now apply the new site to a new 'site' column
# df['site'] = [site_mapping[col] for col in df[ion_columns].columns if 'ions' in col]

# print(df['site'])

# combine a b and c ions and coefficients in 1 column 

df['combined_ions'] = df.apply(lambda row: f"{row['Perovskite_composition_a_ions']},{row['Perovskite_composition_b_ions']},{row['Perovskite_composition_c_ions']}", axis=1)
df['combined_coefficients'] = df.apply(lambda row: f"{row['Perovskite_composition_a_ions_coefficients']},{row['Perovskite_composition_b_ions_coefficients']},{row['Perovskite_composition_c_ions_coefficients']}", axis=1)

print(df[['combined_ions', 'combined_coefficients']].head())

# Save the updated DataFrame to a CSV file
output_file = "combined_ions_dataset.csv"
df.to_csv(output_file, index=False)

print(f"DataFrame with combined columns saved to {output_file}")


  df = pd.read_csv(r"C:\Users\c\Documents\PEROVSKITE PROJECT\PerovskiteML_project\Data\Perovsite database query.csv")


  combined_ions combined_coefficients
0       Cs,Sn,I                 1,1,3
1   Cs,Sn,Br, I          1,1,0.3, 2.7
2   Cs,Sn,Br, I          1,1,1.5, 1.5
3   Cs,Sn,Br, I          1,1,2.7, 0.3
4      Cs,Sn,Br                 1,1,3
DataFrame with combined columns saved to combined_ions_dataset.csv


In [160]:

# Define the site mapping based on original column names
site_mapping = {
    'Perovskite_composition_a_ions': 'a',
    'Perovskite_composition_b_ions': 'b',
    'Perovskite_composition_c_ions': 'c',
    'Perovskite_composition_a_ions_coefficients': 'a',
    'Perovskite_composition_b_ions_coefficients': 'b',
    'Perovskite_composition_c_ions_coefficients': 'c'
}

# Function to assign site based on the original columns
def assign_site(row):
    # Split the ions and coefficients into lists
    ions = row['combined_ions'].split(',')
    coefficients = row['combined_coefficients'].split(',')
    
    # Create a list for sites to store the corresponding site for each ion/coeff
    sites = []
    
    # Iterate through the ions and coefficients
    for ion, coeff in zip(ions, coefficients):
        # Determine which site this ion and coefficient belong to
        if ion in df['Perovskite_composition_a_ions'].values:
            sites.append('a')
        elif ion in df['Perovskite_composition_b_ions'].values:
            sites.append('b')
        elif ion in df['Perovskite_composition_c_ions'].values:
            sites.append('c')
        else:
            sites.append('Unknown')  # If not found, mark as unknown

    # Return the sites as a comma-separated string
    return ','.join(sites)

# Apply this function to the dataframe
df['site'] = df.apply(assign_site, axis=1)

# Print the updated dataframe
print(df[['combined_ions', 'combined_coefficients', 'site']].head())


  combined_ions combined_coefficients           site
0       Cs,Sn,I                 1,1,3          a,b,c
1   Cs,Sn,Br, I          1,1,0.3, 2.7  a,b,c,Unknown
2   Cs,Sn,Br, I          1,1,1.5, 1.5  a,b,c,Unknown
3   Cs,Sn,Br, I          1,1,2.7, 0.3  a,b,c,Unknown
4      Cs,Sn,Br                 1,1,3          a,b,c
