In [None]:
import pandas as pd

# Define the path to your CSV file
file_path = 'perovskite_database_query.csv'  # Replace with the path to your original dataset

# Define the columns to keep
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture',
    'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Read the original CSV file
data = pd.read_csv(file_path)

# Filter columns to keep only those that exist in the dataset
existing_columns = [col for col in columns_to_keep if col in data.columns]
filtered_data = data[existing_columns]

# Keep only rows where 'Cell_architecture' is exactly 'nip'
filtered_data = filtered_data[filtered_data['Cell_architecture'].str.strip().str.lower() == 'nip']

# Add an index column
filtered_data.reset_index(inplace=True, drop=True)
filtered_data.index.name = 'Index'
filtered_data.reset_index(inplace=True)

# Save the filtered dataset to a new CSV file
output_path = 'filtered_DatabaseMaterials_with_index.csv'  # Specify your desired output path
filtered_data.to_csv(output_path, index=False)

# Get the list of columns and number of columns
columns = filtered_data.columns.tolist()
num_columns = len(columns)

print("Filtered dataset with index saved as", output_path)
print("Number of columns:", num_columns)
print("Columns:", columns)



  data = pd.read_csv(file_path)


Filtered dataset with index saved as filtered_DatabaseMaterials_with_index.csv
Number of columns: 39
Columns: ['Index', 'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness', 'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations', 'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients', 'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness', 'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations', 'Backcontact_stack_sequence', 'Backcontact_additives_compounds', 'Backcontact_additives_concentrations', 'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 'Add_lay_front_additives_compounds', '

In [19]:
import pandas as pd

class CellStackSequenceRetriever:
    def __init__(self, file_path, columns_to_keep):
        # Load the dataset and filter columns
        data = pd.read_csv(file_path)
        existing_columns = [col for col in columns_to_keep if col in data.columns]
        self.filtered_data = data[existing_columns]
        
        # Keep only rows where 'Cell_architecture' is exactly 'nip'
        self.filtered_data = self.filtered_data[self.filtered_data['Cell_architecture'].str.strip().str.lower() == 'nip']
        
        # Add an index column
        self.filtered_data.reset_index(inplace=True, drop=True)
        self.filtered_data.index.name = 'Index'
        self.filtered_data.reset_index(inplace=True)

    def get_materials_sequence(self, row_name):
        """
        Given a row name (index), retrieve the cell stack sequence and return 
        a list of materials in the specified order.
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the cell stack sequence for the specified row
        cell_stack_sequence = self.filtered_data.loc[row_name, 'Cell_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get materials in each layer
        materials_sequence = [layer.strip() for layer in cell_stack_sequence.split(' | ')]
        
        return materials_sequence
    
    def save_filtered_data(self, output_path):
        """
        Save the filtered dataset with index to a CSV file.
        """
        self.filtered_data.to_csv(output_path, index=False)
        print("Filtered dataset with index saved as", output_path)
        
        # Display column information
        columns = self.filtered_data.columns.tolist()
        num_columns = len(columns)
        print("Number of columns:", num_columns)
        print("Columns:", columns)

# Define the path to your CSV file and columns to keep
file_path = 'perovskite_database_query.csv'
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture',
    'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Instantiate the retriever and filter data
retriever = CellStackSequenceRetriever(file_path, columns_to_keep)

# Save the filtered dataset
output_path = 'filtered_DatabaseMaterials_with_index.csv'
retriever.save_filtered_data(output_path)

# Example: Retrieve materials sequence for a specific row
row_name = 0  # Example row index
materials_list = retriever.get_materials_sequence(row_name)
print("Materials in row", row_name, ":", materials_list)


  data = pd.read_csv(file_path)


Filtered dataset with index saved as filtered_DatabaseMaterials_with_index.csv
Number of columns: 39
Columns: ['Index', 'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness', 'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations', 'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients', 'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness', 'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations', 'Backcontact_stack_sequence', 'Backcontact_additives_compounds', 'Backcontact_additives_concentrations', 'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 'Add_lay_front_additives_compounds', '

In [18]:
import pandas as pd

class CellStackSequenceRetriever:
    def __init__(self, file_path, columns_to_keep):
        # Load the dataset and filter columns
        data = pd.read_csv(file_path)
        existing_columns = [col for col in columns_to_keep if col in data.columns]
        self.filtered_data = data[existing_columns]
        
        # Keep only rows where 'Cell_architecture' is exactly 'nip'
        self.filtered_data = self.filtered_data[self.filtered_data['Cell_architecture'].str.strip().str.lower() == 'nip']
        
        # Add an index column
        self.filtered_data.reset_index(inplace=True, drop=True)
        self.filtered_data.index.name = 'Index'
        self.filtered_data.reset_index(inplace=True)

    def get_materials_sequence_with_layers(self, row_name):
        """
        Given a row name (index), retrieve the cell stack sequence and return 
        a dictionary where each layer is labeled with its role (e.g., 'Substrate', 'ETL').
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the cell stack sequence for the specified row
        cell_stack_sequence = self.filtered_data.loc[row_name, 'Cell_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get materials in each layer
        materials_sequence = [layer.strip() for layer in cell_stack_sequence.split(' | ')]

        # Define layer roles based on assumed positions
        layer_labels = ['Substrate', 'ETL', 'Perovskite', 'HTL', 'Backcontact']  # Adjust as needed
        labeled_sequence = {layer_labels[i]: materials_sequence[i] for i in range(len(materials_sequence))}

        return labeled_sequence
    
    def save_filtered_data(self, output_path):
        """
        Save the filtered dataset with index to a CSV file.
        """
        self.filtered_data.to_csv(output_path, index=False)
        print("Filtered dataset with index saved as", output_path)
        
        # Display column information
        columns = self.filtered_data.columns.tolist()
        num_columns = len(columns)
        print("Number of columns:", num_columns)
        print("Columns:", columns)

# Define the path to your CSV file and columns to keep
file_path = 'perovskite_database_query.csv'
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture',
    'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Instantiate the retriever and filter data
retriever = CellStackSequenceRetriever(file_path, columns_to_keep)

# Save the filtered dataset
output_path = 'filtered_DatabaseMaterials_with_index.csv'
retriever.save_filtered_data(output_path)

# Example: Retrieve materials sequence for a specific row with labeled layers
row_name = 0  # Example row index
materials_dict = retriever.get_materials_sequence_with_layers(row_name)
print("Labeled materials in row", row_name, ":", materials_dict)


  data = pd.read_csv(file_path)


Filtered dataset with index saved as filtered_DatabaseMaterials_with_index.csv
Number of columns: 39
Columns: ['Index', 'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness', 'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations', 'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients', 'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness', 'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations', 'Backcontact_stack_sequence', 'Backcontact_additives_compounds', 'Backcontact_additives_concentrations', 'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 'Add_lay_front_additives_compounds', '

IndexError: list index out of range

In [20]:
import pandas as pd

class CellStackSequenceRetriever:
    def __init__(self, file_path, columns_to_keep):
        # Load the dataset and filter columns
        data = pd.read_csv(file_path)
        existing_columns = [col for col in columns_to_keep if col in data.columns]
        self.filtered_data = data[existing_columns]
        
        # Keep only rows where 'Cell_architecture' is exactly 'nip'
        self.filtered_data = self.filtered_data[self.filtered_data['Cell_architecture'].str.strip().str.lower() == 'nip']
        
        # Add an index column
        self.filtered_data.reset_index(inplace=True, drop=True)
        self.filtered_data.index.name = 'Index'
        self.filtered_data.reset_index(inplace=True)

    def get_materials_sequence_with_layers(self, row_name):
        """
        Given a row name (index), retrieve the cell stack sequence and return 
        a 2D array where each cell represents a different layer, and each sub-array 
        contains the materials within that layer.
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the cell stack sequence for the specified row
        cell_stack_sequence = self.filtered_data.loc[row_name, 'Cell_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get each layer
        layers = cell_stack_sequence.split(' | ')
        
        # Split each layer by ';' to get individual materials in a 2D array
        materials_sequence_2d = [layer.split('; ') for layer in layers]
        
        return materials_sequence_2d
    
    def save_filtered_data(self, output_path):
        """
        Save the filtered dataset with index to a CSV file.
        """
        self.filtered_data.to_csv(output_path, index=False)
        print("Filtered dataset with index saved as", output_path)
        
        # Display column information
        columns = self.filtered_data.columns.tolist()
        num_columns = len(columns)
        print("Number of columns:", num_columns)
        print("Columns:", columns)

# Define the path to your CSV file and columns to keep
file_path = 'perovskite_database_query.csv'
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture',
    'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Instantiate the retriever and filter data
retriever = CellStackSequenceRetriever(file_path, columns_to_keep)

# Save the filtered dataset
output_path = 'filtered_DatabaseMaterials_with_index.csv'
retriever.save_filtered_data(output_path)

# Example: Retrieve materials sequence for a specific row with labeled layers
row_name = 0  # Example row index
materials_2d_list = retriever.get_materials_sequence_with_layers(row_name)
print("Materials in row", row_name, "by layer:", materials_2d_list)


  data = pd.read_csv(file_path)


Filtered dataset with index saved as filtered_DatabaseMaterials_with_index.csv
Number of columns: 39
Columns: ['Index', 'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness', 'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations', 'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients', 'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness', 'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations', 'Backcontact_stack_sequence', 'Backcontact_additives_compounds', 'Backcontact_additives_concentrations', 'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 'Add_lay_front_additives_compounds', '

In [None]:
import pandas as pd

class CellStackSequenceRetriever:
    def __init__(self, file_path, columns_to_keep):
        # Load the dataset and filter columns
        data = pd.read_csv(file_path)
        existing_columns = [col for col in columns_to_keep if col in data.columns]
        self.filtered_data = data[existing_columns]
        
        # Keep only rows where 'Cell_architecture' is exactly 'nip'
        self.filtered_data = self.filtered_data[self.filtered_data['Cell_architecture'].str.strip().str.lower() == 'nip']
        
        # Add an index column
        self.filtered_data.reset_index(inplace=True, drop=True)
        self.filtered_data.index.name = 'Index'
        self.filtered_data.reset_index(inplace=True)

    def get_materials_sequence_with_layers(self, row_name):
        """
        Given a row name (index), retrieve the cell stack sequence and return 
        a 2D array where each cell represents a different layer, and each sub-array 
        contains the materials within that layer.
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the cell stack sequence for the specified row
        cell_stack_sequence = self.filtered_data.loc[row_name, 'Cell_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get each layer
        layers = cell_stack_sequence.split(' | ')
        
        # Split each layer by ';' to get individual materials in a 2D array
        materials_sequence_2d = [layer.split('; ') for layer in layers]
        
        return materials_sequence_2d

    def get_substrate_materials_sequence(self, row_name):
        """
        Given a row name (index), retrieve the substrate stack sequence and return 
        a 2D array where each cell represents a different layer, and each sub-array 
        contains the materials within that layer.
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the substrate stack sequence for the specified row
        substrate_stack_sequence = self.filtered_data.loc[row_name, 'Substrate_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get each layer
        layers = substrate_stack_sequence.split(' | ')
        
        # Split each layer by ';' to get individual materials in a 2D array
        substrate_sequence_2d = [layer.split('; ') for layer in layers]
        
        return substrate_sequence_2d

    def save_filtered_data(self, output_path):
        """
        Save the filtered dataset with index to a CSV file.
        """
        self.filtered_data.to_csv(output_path, index=False)
        print("Filtered dataset with index saved as", output_path)
        
        # Display column information
        columns = self.filtered_data.columns.tolist()
        num_columns = len(columns)
        print("Number of columns:", num_columns)
        print("Columns:", columns)

# Define the path to your CSV file and columns to keep
file_path = 'perovskite_database_query.csv'
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Instantiate the retriever and filter data
retriever = CellStackSequenceRetriever(file_path, columns_to_keep)

# Save the filtered dataset
output_path = 'filtered_DatabaseMaterials_with_index.csv'
retriever.save_filtered_data(output_path)

# Example: Retrieve materials sequence for a specific row in both Cell_stack_sequence and Substrate_stack_sequence
row_name = 0  # Example row index
materials_2d_list = retriever.get_materials_sequence_with_layers(row_name)
substrate_2d_list = retriever.get_substrate_materials_sequence(row_name)
print("Materials in row", row_name, "by layer in Cell_stack_sequence:", materials_2d_list)
print("Materials in row", row_name, "by layer in Substrate_stack_sequence:", substrate_2d_list)


  data = pd.read_csv(file_path)


Filtered dataset with index saved as filtered_DatabaseMaterials_with_index.csv
Number of columns: 39
Columns: ['Index', 'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness', 'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations', 'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients', 'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness', 'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations', 'Backcontact_stack_sequence', 'Backcontact_additives_compounds', 'Backcontact_additives_concentrations', 'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 'Add_lay_front_additives_compounds', '

In [None]:
import pandas as pd

class CellStackSequenceRetriever:
    def __init__(self, file_path, columns_to_keep):
        # Load the dataset and filter columns
        data = pd.read_csv(file_path)
        existing_columns = [col for col in columns_to_keep if col in data.columns]
        self.filtered_data = data[existing_columns]
        
        # Keep only rows where 'Cell_architecture' is exactly 'nip'
        self.filtered_data = self.filtered_data[self.filtered_data['Cell_architecture'].str.strip().str.lower() == 'nip']
        
        # Add an index column
        self.filtered_data.reset_index(inplace=True, drop=True)
        self.filtered_data.index.name = 'Index'
        self.filtered_data.reset_index(inplace=True)

    def get_materials_sequence_with_layers(self, row_name):
        """
        Given a row name (index), retrieve the cell stack sequence and return 
        a 2D array where each cell represents a different layer, and each sub-array 
        contains the materials within that layer.
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the cell stack sequence for the specified row
        cell_stack_sequence = self.filtered_data.loc[row_name, 'Cell_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get each layer
        layers = cell_stack_sequence.split(' | ')
        
        # Split each layer by ';' to get individual materials in a 2D array
        materials_sequence_2d = [layer.split('; ') for layer in layers]
        
        return materials_sequence_2d

    def get_substrate_materials_sequence(self, row_name):
        """
        Given a row name (index), retrieve the substrate stack sequence and return 
        a 2D array where each cell represents a different layer, and each sub-array 
        contains the materials within that layer.
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the substrate stack sequence for the specified row
        substrate_stack_sequence = self.filtered_data.loc[row_name, 'Substrate_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get each layer
        layers = substrate_stack_sequence.split(' | ')
        
        # Split each layer by ';' to get individual materials in a 2D array
        substrate_sequence_2d = [layer.split('; ') for layer in layers]
        
        return substrate_sequence_2d

    def save_filtered_data(self, output_path):
        """
        Save the filtered dataset with index to a CSV file.
        """
        self.filtered_data.to_csv(output_path, index=False)
        print("Filtered dataset with index saved as", output_path)
        
        # Display column information
        columns = self.filtered_data.columns.tolist()
        num_columns = len(columns)
        print("Number of columns:", num_columns)
        print("Columns:", columns)

# Define the path to your CSV file and columns to keep
file_path = 'perovskite_database_query.csv'
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Instantiate the retriever and filter data
retriever = CellStackSequenceRetriever(file_path, columns_to_keep)

# Save the filtered dataset
output_path = 'filtered_DatabaseMaterials_with_index.csv'
retriever.save_filtered_data(output_path)

# Example: Retrieve materials sequence for a specific row in both Cell_stack_sequence and Substrate_stack_sequence
row_name = 0  # Example row index
materials_2d_list = retriever.get_materials_sequence_with_layers(row_name)
substrate_2d_list = retriever.get_substrate_materials_sequence(row_name)
print("Materials in row", row_name, "by layer in Cell_stack_sequence:", materials_2d_list)
print("Materials in row", row_name, "by layer in Substrate_stack_sequence:", substrate_2d_list)


In [1]:
import pandas as pd

class CellStackSequenceRetriever:
    def __init__(self, file_path, columns_to_keep):
        # Load the dataset and filter columns
        data = pd.read_csv(file_path)
        existing_columns = [col for col in columns_to_keep if col in data.columns]
        self.filtered_data = data[existing_columns]
        
        # Keep only rows where 'Cell_architecture' is exactly 'nip'
        self.filtered_data = self.filtered_data[self.filtered_data['Cell_architecture'].str.strip().str.lower() == 'nip']
        
        # Add an index column
        self.filtered_data.reset_index(inplace=True, drop=True)
        self.filtered_data.index.name = 'Index'
        self.filtered_data.reset_index(inplace=True)

    def get_materials_sequence_with_layers(self, row_name):
        """
        Given a row name (index), retrieve the cell stack sequence and return 
        a 2D array where each cell represents a different layer, and each sub-array 
        contains the materials within that layer.
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the cell stack sequence for the specified row
        cell_stack_sequence = self.filtered_data.loc[row_name, 'Cell_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get each layer
        layers = cell_stack_sequence.split(' | ')
        
        # Split each layer by ';' to get individual materials in a 2D array
        materials_sequence_2d = [layer.split('; ') for layer in layers]
        
        return materials_sequence_2d

    def get_substrate_materials_sequence(self, row_name):
        """
        Given a row name (index), retrieve the substrate stack sequence and return 
        a 2D array where each cell represents a different layer, and each sub-array 
        contains the materials within that layer.
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the substrate stack sequence for the specified row
        substrate_stack_sequence = self.filtered_data.loc[row_name, 'Substrate_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get each layer
        layers = substrate_stack_sequence.split(' | ')
        
        # Split each layer by ';' to get individual materials in a 2D array
        substrate_sequence_2d = [layer.split('; ') for layer in layers]
        
        return substrate_sequence_2d

    def get_etl_materials_sequence(self, row_name):
        """
        Given a row name (index), retrieve the ETL stack sequence and return 
        a 2D array where each cell represents a different layer, and each sub-array 
        contains the materials within that layer.
        """
        # Ensure the row_name exists in the dataset's index
        if row_name not in self.filtered_data.index:
            return f"Row '{row_name}' not found in the dataset."
        
        # Retrieve the ETL stack sequence for the specified row
        etl_stack_sequence = self.filtered_data.loc[row_name, 'ETL_stack_sequence']
        
        # Split the sequence by the ' | ' separator to get each layer
        layers = etl_stack_sequence.split(' | ')
        
        # Split each layer by ';' to get individual materials in a 2D array
        etl_sequence_2d = [layer.split('; ') for layer in layers]
        
        return etl_sequence_2d

    def save_filtered_data(self, output_path):
        """
        Save the filtered dataset with index to a CSV file.
        """
        self.filtered_data.to_csv(output_path, index=False)
        print("Filtered dataset with index saved as", output_path)
        
        # Display column information
        columns = self.filtered_data.columns.tolist()
        num_columns = len(columns)
        print("Number of columns:", num_columns)
        print("Columns:", columns)

# Define the path to your CSV file and columns to keep
file_path = 'perovskite_database_query.csv'
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Instantiate the retriever and filter data
retriever = CellStackSequenceRetriever(file_path, columns_to_keep)

# Save the filtered dataset
output_path = 'filtered_DatabaseMaterials_with_index.csv'
retriever.save_filtered_data(output_path)

# Example: Retrieve materials sequence for a specific row in Cell_stack_sequence, Substrate_stack_sequence, and ETL_stack_sequence
row_name = 0  # Example row index
materials_2d_list = retriever.get_materials_sequence_with_layers(row_name)
substrate_2d_list = retriever.get_substrate_materials_sequence(row_name)
etl_2d_list = retriever.get_etl_materials_sequence(row_name)

print("Materials in row", row_name, "by layer in Cell_stack_sequence:", materials_2d_list)
print("Materials in row", row_name, "by layer in Substrate_stack_sequence:", substrate_2d_list)
print("Materials in row", row_name, "by layer in ETL_stack_sequence:", etl_2d_list)


  data = pd.read_csv(file_path)


Filtered dataset with index saved as filtered_DatabaseMaterials_with_index.csv
Number of columns: 39
Columns: ['Index', 'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness', 'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations', 'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients', 'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness', 'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations', 'Backcontact_stack_sequence', 'Backcontact_additives_compounds', 'Backcontact_additives_concentrations', 'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 'Add_lay_front_additives_compounds', '

In [2]:
import pandas as pd

class CellStackSequenceRetriever:
    def __init__(self, file_path, columns_to_keep):
        # Load the dataset and filter columns
        data = pd.read_csv(file_path)
        existing_columns = [col for col in columns_to_keep if col in data.columns]
        self.filtered_data = data[existing_columns]
        
        # Keep only rows where 'Cell_architecture' is exactly 'nip'
        self.filtered_data = self.filtered_data[self.filtered_data['Cell_architecture'].str.strip().str.lower() == 'nip']
        
        # Add an index column
        self.filtered_data.reset_index(inplace=True, drop=True)
        self.filtered_data.index.name = 'Index'
        self.filtered_data.reset_index(inplace=True)

    def get_materials_sequence_with_layers(self, row_name):
        cell_stack_sequence = self.filtered_data.loc[row_name, 'Cell_stack_sequence']
        return [layer.split('; ') for layer in cell_stack_sequence.split(' | ')]
    
    def get_substrate_materials_sequence(self, row_name):
        substrate_stack_sequence = self.filtered_data.loc[row_name, 'Substrate_stack_sequence']
        return [layer.split('; ') for layer in substrate_stack_sequence.split(' | ')]

    def get_etl_materials_sequence(self, row_name):
        etl_stack_sequence = self.filtered_data.loc[row_name, 'ETL_stack_sequence']
        return [layer.split('; ') for layer in etl_stack_sequence.split(' | ')]

    def get_htl_materials_sequence(self, row_name):
        htl_stack_sequence = self.filtered_data.loc[row_name, 'HTL_stack_sequence']
        return [layer.split('; ') for layer in htl_stack_sequence.split(' | ')]

    def get_backcontact_materials_sequence(self, row_name):
        backcontact_stack_sequence = self.filtered_data.loc[row_name, 'Backcontact_stack_sequence']
        return [layer.split('; ') for layer in backcontact_stack_sequence.split(' | ')]

    def get_add_lay_back_materials_sequence(self, row_name):
        add_lay_back_stack_sequence = self.filtered_data.loc[row_name, 'Add_lay_back_stack_sequence']
        return [layer.split('; ') for layer in add_lay_back_stack_sequence.split(' | ')]

    def get_encapsulation_materials_sequence(self, row_name):
        encapsulation_stack_sequence = self.filtered_data.loc[row_name, 'Encapsulation_stack_sequence']
        return [layer.split('; ') for layer in encapsulation_stack_sequence.split(' | ')]

    def save_filtered_data(self, output_path):
        self.filtered_data.to_csv(output_path, index=False)
        print("Filtered dataset with index saved as", output_path)

# Define the path to your CSV file and columns to keep
file_path = 'perovskite_database_query.csv'
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture', 'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Instantiate the retriever and filter data
retriever = CellStackSequenceRetriever(file_path, columns_to_keep)

# Save the filtered dataset
output_path = 'filtered_DatabaseMaterials_with_index.csv'
retriever.save_filtered_data(output_path)

# Example: Retrieve materials sequence for a specific row across multiple stack sequences
row_name = 0  # Example row index
materials_2d_list = retriever.get_materials_sequence_with_layers(row_name)
substrate_2d_list = retriever.get_substrate_materials_sequence(row_name)
etl_2d_list = retriever.get_etl_materials_sequence(row_name)
htl_2d_list = retriever.get_htl_materials_sequence(row_name)
backcontact_2d_list = retriever.get_backcontact_materials_sequence(row_name)
add_lay_back_2d_list = retriever.get_add_lay_back_materials_sequence(row_name)
encapsulation_2d_list = retriever.get_encapsulation_materials_sequence(row_name)

print("Materials in row", row_name, "by layer in Cell_stack_sequence:", materials_2d_list)
print("Materials in row", row_name, "by layer in Substrate_stack_sequence:", substrate_2d_list)
print("Materials in row", row_name, "by layer in ETL_stack_sequence:", etl_2d_list)
print("Materials in row", row_name, "by layer in HTL_stack_sequence:", htl_2d_list)
print("Materials in row", row_name, "by layer in Backcontact_stack_sequence:", backcontact_2d_list)
print("Materials in row", row_name, "by layer in Add_lay_back_stack_sequence:", add_lay_back_2d_list)
print("Materials in row", row_name, "by layer in Encapsulation_stack_sequence:", encapsulation_2d_list)


  data = pd.read_csv(file_path)


Filtered dataset with index saved as filtered_DatabaseMaterials_with_index.csv
Materials in row 0 by layer in Cell_stack_sequence: [['SLG'], ['FTO'], ['TiO2-c'], ['TiO2-mp'], ['Perovskite'], ['Spiro-MeOTAD'], ['Au']]
Materials in row 0 by layer in Substrate_stack_sequence: [['SLG'], ['FTO']]
Materials in row 0 by layer in ETL_stack_sequence: [['TiO2-c'], ['TiO2-mp']]
Materials in row 0 by layer in HTL_stack_sequence: [['Spiro-MeOTAD']]
Materials in row 0 by layer in Backcontact_stack_sequence: [['Au']]
Materials in row 0 by layer in Add_lay_back_stack_sequence: [['Unknown']]
Materials in row 0 by layer in Encapsulation_stack_sequence: [['Unknown']]
