In [4]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import matplotlib.patches as mpatches
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from itertools import product
import warnings
import csv

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# 1. Load and Filter Data
def load_and_filter_data(file_path):
    """
    Loads the dataset from the specified CSV file and filters rows where 'Cell_architecture' is 'nip'.
    
    Parameters:
        file_path (str): Path to the CSV file.
    
    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    # Load the dataset
    data = pd.read_csv(file_path)
    
    # Filter rows where 'Cell_architecture' is exactly 'nip' (case-insensitive)
    data = data[data['Cell_architecture'].str.strip().str.lower() == 'nip']
    
    # Reset index after filtering
    data = data.reset_index(drop=True)
    
    return data

# 2. Define Layer Columns
def define_layer_columns():
    """
    Defines the mapping between stack sequence columns and their corresponding layer names.
    
    Returns:
        dict: Dictionary mapping column names to layer names.
    """
    layer_columns = {
        'Cell_stack_sequence': 'Cell',
        'Substrate_stack_sequence': 'Substrate',
        'ETL_stack_sequence': 'ETL',
        'HTL_stack_sequence': 'HTL',
        'Backcontact_stack_sequence': 'Backcontact',
        'Add_lay_back_stack_sequence': 'Add_Lay_Back',
        'Encapsulation_stack_sequence': 'Encapsulation'
    }
    return layer_columns

# 3. Parse Sequences from Multiple Columns
def parse_sequences_from_columns(dataframe, layer_columns):
    """
    Parses material sequences from multiple layer-specific columns and maps materials to their layers.
    
    Parameters:
        dataframe (pd.DataFrame): The filtered DataFrame.
        layer_columns (dict): Dictionary mapping column names to layer names.
    
    Returns:
        list: Tokenized sequences (list of materials).
        dict: Mapping of materials to layers with occurrence counts.
        list: List of unique layer names.
    """
    sequences = []
    material_layer_map = {}  # Material to layers mapping with counts
    layer_names = list(layer_columns.values())
    
    for idx, row in dataframe.iterrows():
        sequence = []
        for col, layer_name in layer_columns.items():
            seq_str = row.get(col, "")
            if pd.isna(seq_str) or not seq_str.strip():
                continue
            # Split the sequence into sub-layers if applicable
            sub_layers = seq_str.split(' | ')
            for sub_layer in sub_layers:
                # Split sub-layers into materials
                materials = [material.strip() for material in sub_layer.split('; ') if material.strip()]
                sequence.extend(materials)
                for material in materials:
                    if material not in material_layer_map:
                        material_layer_map[material] = {}
                    if layer_name not in material_layer_map[material]:
                        material_layer_map[material][layer_name] = 0
                    material_layer_map[material][layer_name] += 1
        sequences.append(sequence)
    
    return sequences, material_layer_map, layer_names

# 4. Train Word2Vec Model
def train_word2vec(sequences, vector_size=50, window=5, min_count=1, workers=4, sg=1):
    """
    Trains a Word2Vec model on the provided material sequences.
    
    Parameters:
        sequences (list): List of tokenized material sequences.
        vector_size (int): Dimensionality of the embeddings.
        window (int): Context window size.
        min_count (int): Minimum frequency count of materials.
        workers (int): Number of worker threads.
        sg (int): Training algorithm (1 for skip-gram; otherwise CBOW).
    
    Returns:
        Word2Vec: Trained Word2Vec model.
    """
    model = Word2Vec(
        sentences=sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg
    )
    return model

# 5. Assign Primary Layers to Materials
def assign_primary_layers(material_layer_map):
    """
    Assigns each material to its primary layer based on the highest occurrence.
    
    Parameters:
        material_layer_map (dict): Mapping of materials to layers with occurrence counts.
    
    Returns:
        dict: Mapping of materials to their primary layer.
    """
    material_primary_layer = {}
    for material, layers in material_layer_map.items():
        # Assign the material to the layer where it occurs most frequently
        primary_layer = max(layers, key=layers.get)
        material_primary_layer[material] = primary_layer
    return material_primary_layer

# 6. Assign Colors to Layers
def assign_colors_to_layers(layer_names):
    """
    Assigns distinct colors to each layer using a colormap.
    
    Parameters:
        layer_names (list): List of unique layer names.
    
    Returns:
        dict: Mapping of layer names to colors.
    """
    num_layers = len(layer_names)
    cmap = cm.get_cmap('tab10', num_layers) if num_layers <= 10 else cm.get_cmap('tab20', num_layers)
    
    layer_colors = {}
    for idx, layer_name in enumerate(layer_names):
        layer_colors[layer_name] = cmap(idx)
    return layer_colors

# 7. Extract Embeddings
def extract_embeddings(model, materials):
    """
    Extracts embeddings for each material from the Word2Vec model.
    
    Parameters:
        model (Word2Vec): Trained Word2Vec model.
        materials (list): List of materials.
    
    Returns:
        np.ndarray: Array of embeddings.
    """
    embeddings = np.array([model.wv[material] for material in materials])
    return embeddings

# 8. Aggregate Embeddings for Each Sample
def aggregate_embeddings(sequences, model, vector_size=50):
    """
    Aggregates material embeddings for each sample by averaging.
    
    Parameters:
        sequences (list): List of tokenized material sequences for each sample.
        model (Word2Vec): Trained Word2Vec model.
        vector_size (int): Dimensionality of the embeddings.
    
    Returns:
        np.ndarray: Aggregated feature matrix.
    """
    aggregated_features = []
    for seq in sequences:
        if len(seq) == 0:
            aggregated_features.append(np.zeros(vector_size))
            continue
        vectors = [model.wv[material] for material in seq if material in model.wv]
        if vectors:
            aggregated = np.mean(vectors, axis=0)
        else:
            aggregated = np.zeros(vector_size)
        aggregated_features.append(aggregated)
    return np.array(aggregated_features)

# 9. Plot Embeddings with Color Coding and Labels
def plot_embeddings_colored(embeddings_2d, materials, material_primary_layer, layer_colors, title, annotate=True):
    """
    Plots the 2D embeddings with colors based on their primary layers and labels each vector.
    
    Parameters:
        embeddings_2d (np.ndarray): 2D embeddings.
        materials (list): List of materials.
        material_primary_layer (dict): Mapping of materials to their primary layers.
        layer_colors (dict): Mapping of layer names to colors.
        title (str): Title of the plot.
        annotate (bool): Whether to annotate material names on the plot.
    """
    plt.figure(figsize=(14, 12))
    
    # Assign colors to each material based on its primary layer
    colors_list = [layer_colors[material_primary_layer[material]] for material in materials]
    
    # Create scatter plot
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=colors_list, alpha=0.7, edgecolors='w', linewidth=0.5)
    
    # Optionally annotate materials
    if annotate:
        for i, material in enumerate(materials):
            plt.annotate(material, xy=(embeddings_2d[i, 0], embeddings_2d[i, 1]),
                         fontsize=8, alpha=0.75, ha='right', va='bottom')
    
    # Create legend
    legend_handles = []
    for layer_name, color in layer_colors.items():
        patch = mpatches.Patch(color=color, label=layer_name)
        legend_handles.append(patch)
    plt.legend(handles=legend_handles, title='Layers', bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.title(title, fontsize=16)
    plt.xlabel('Dimension 1', fontsize=14)
    plt.ylabel('Dimension 2', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

# 10. Prepare Feature Matrix and Target Vector
def prepare_features_targets(aggregated_features, dataframe, target_column='JV_default_PCE'):
    """
    Prepares the feature matrix and target vector for model training.
    
    Parameters:
        aggregated_features (np.ndarray): Aggregated feature matrix.
        dataframe (pd.DataFrame): Original DataFrame.
        target_column (str): Name of the target column.
    
    Returns:
        np.ndarray: Feature matrix.
        np.ndarray: Target vector.
    """
    X = aggregated_features
    y = dataframe[target_column].values
    return X, y

# 11. Define Models and Hyperparameters
def define_models_hyperparameters():
    """
    Defines the models and their corresponding hyperparameters for optimization.
    
    Returns:
        dict: Dictionary mapping model names to their scikit-learn estimator and hyperparameter grid.
    """
    models = {
        'RandomForest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        },
        'GradientBoosting': {
            'model': GradientBoostingRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5],
                'min_samples_split': [2, 5]
            }
        },
        'SVR': {
            'model': SVR(),
            'params': {
                'C': [1, 10],
                'epsilon': [0.1, 0.2],
                'kernel': ['rbf', 'linear']
            }
        },
        'LinearRegression': {
            'model': LinearRegression(),
            'params': {
                # Linear Regression has no hyperparameters to tune in scikit-learn
            }
        }
    }
    return models

# 12. Train and Evaluate Models
def train_evaluate_models(X, y, models, cv=5):
    """
    Trains and evaluates models with different hyperparameters using cross-validation.
    
    Parameters:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Target vector.
        models (dict): Dictionary of models and their hyperparameters.
        cv (int): Number of cross-validation folds.
    
    Returns:
        list: List of dictionaries containing model details and performance metrics.
    """
    results = []
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    
    for model_name, config in models.items():
        estimator = config['model']
        param_grid = config['params']
        
        # If there are no hyperparameters to tune (e.g., LinearRegression)
        if not param_grid:
            print(f"Training {model_name} with default parameters.")
            mae_list = []
            mse_list = []
            r2_list = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                estimator.fit(X_train, y_train)
                y_pred = estimator.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)
                mae_list.append(mae)
                mse_list.append(mse)
                r2_list.append(r2)
            results.append({
                'Model': model_name,
                'Parameters': 'Default',
                'MAE': np.mean(mae_list),
                'MSE': np.mean(mse_list),
                'R2': np.mean(r2_list)
            })
            continue
        
        # Generate all combinations of hyperparameters
        keys = list(param_grid.keys())
        values = list(param_grid.values())
        for combo in product(*values):
            params = dict(zip(keys, combo))
            estimator.set_params(**params)
            mae_list = []
            mse_list = []
            r2_list = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                estimator.fit(X_train, y_train)
                y_pred = estimator.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)
                mae_list.append(mae)
                mse_list.append(mse)
                r2_list.append(r2)
            results.append({
                'Model': model_name,
                'Parameters': params,
                'MAE': np.mean(mae_list),
                'MSE': np.mean(mse_list),
                'R2': np.mean(r2_list)
            })
            print(f"Trained {model_name} with parameters {params} -> MAE: {np.mean(mae_list):.4f}, MSE: {np.mean(mse_list):.4f}, R2: {np.mean(r2_list):.4f}")
    
    return results

# 13. Save Results to CSV
def save_results_to_csv(results, filename='model_results.csv'):
    """
    Saves the model training results to a CSV file.
    
    Parameters:
        results (list): List of dictionaries containing model details and performance metrics.
        filename (str): Name of the CSV file to save the results.
    """
    keys = results[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(results)
    print(f"Results saved to {filename}")

# 14. Main Execution Function
def main():
    # File path to the CSV dataset
    file_path = 'perovskite_database_query.csv'
    
    # 1. Load and filter data
    data = load_and_filter_data(file_path)
    print(f"Loaded data with {data.shape[0]} samples.")
    
    # 2. Define layer columns and their corresponding layer names
    layer_columns = define_layer_columns()
    
    # 3. Parse sequences from the specified columns
    tokenized_sequences, material_layer_map, layer_names = parse_sequences_from_columns(data, layer_columns)
    print(f"Parsed sequences from columns: {list(layer_columns.keys())}")
    
    # 4. Train Word2Vec model
    model = train_word2vec(tokenized_sequences)
    print("Trained Word2Vec model.")
    
    # 5. Get list of unique materials
    materials = list(model.wv.index_to_key)
    print(f"Number of unique materials in ETL_stack_sequence: {len(materials)}")
    
    # 6. Assign primary layers to materials
    material_primary_layer = assign_primary_layers(material_layer_map)
    
    # 7. Assign colors to layers
    layer_colors = assign_colors_to_layers(layer_names)
    
    # 8. Extract embeddings
    embeddings = extract_embeddings(model, materials)
    
    # 9. Aggregate embeddings for each sample
    aggregated_features = aggregate_embeddings(tokenized_sequences, model, vector_size=model.vector_size)
    print("Aggregated embeddings for each sample.")
    
    # 10. Prepare feature matrix and target vector
    X, y = prepare_features_targets(aggregated_features, data, target_column='JV_default_PCE')
    print(f"Prepared feature matrix with shape {X.shape} and target vector with shape {y.shape}.")
    
    # 11. Define models and hyperparameters
    models = define_models_hyperparameters()
    
    # 12. Train and evaluate models
    print("Starting model training and evaluation...")
    results = train_evaluate_models(X, y, models, cv=5)
    
    # 13. Save results to CSV
    save_results_to_csv(results, filename='model_results.csv')
    
    # Optional: Save Word2Vec model for future use
    model.save("word2vec_model_ETL.stack.sequence.model")
    print("Word2Vec model saved as 'word2vec_model_ETL.stack.sequence.model'.")
    
    # Optional: Save aggregated features and target for future use
    # np.save('aggregated_features.npy', X)
    # np.save('target_vector.npy', y)
    # print("Aggregated features and target vector saved.")
    
    # 14. Visualize Embeddings (Optional)
    # PCA Visualization
    pca = PCA(n_components=2, random_state=42)
    reduced_embeddings_pca = pca.fit_transform(embeddings)
    plot_embeddings_colored(
        embeddings_2d=reduced_embeddings_pca,
        materials=materials,
        material_primary_layer=material_primary_layer,
        layer_colors=layer_colors,
        title='ETL Material Embeddings with PCA (Colored by Layers)',
        annotate=True  # Set to True to display annotations
    )
    
    # t-SNE Visualization
    tsne = TSNE(n_components=2, perplexity=5, random_state=42, init='random', learning_rate='auto')
    reduced_embeddings_tsne = tsne.fit_transform(embeddings)
    plot_embeddings_colored(
        embeddings_2d=reduced_embeddings_tsne,
        materials=materials,
        material_primary_layer=material_primary_layer,
        layer_colors=layer_colors,
        title='ETL Material Embeddings with t-SNE (Colored by Layers)',
        annotate=True  # Set to True to display annotations
    )

if __name__ == "__main__":
    main()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/herbrowan/Library/Python/3.10/lib/python/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Users/herbrowan/Library/Python/3.10/lib/python/site-packages/traitlets/con

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [3]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import matplotlib.patches as mpatches
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from itertools import product
import warnings
import csv

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# 1. Load and Filter Data
def load_and_filter_data(file_path):
    """
    Loads the dataset from the specified CSV file and filters rows where 'Cell_architecture' is 'nip'.
    
    Parameters:
        file_path (str): Path to the CSV file.
    
    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    # Load the dataset
    data = pd.read_csv(file_path)
    
    # Filter rows where 'Cell_architecture' is exactly 'nip' (case-insensitive)
    data = data[data['Cell_architecture'].str.strip().str.lower() == 'nip']
    
    # Reset index after filtering
    data = data.reset_index(drop=True)
    
    return data

# 2. Define Layer Columns
def define_layer_columns():
    """
    Defines the mapping between stack sequence columns and their corresponding layer names.
    
    Returns:
        dict: Dictionary mapping column names to layer names.
    """
    layer_columns = {
        'Cell_stack_sequence': 'Cell',
        'Substrate_stack_sequence': 'Substrate',
        'ETL_stack_sequence': 'ETL',
        'HTL_stack_sequence': 'HTL',
        'Backcontact_stack_sequence': 'Backcontact',
        'Add_lay_back_stack_sequence': 'Add_Lay_Back',
        'Encapsulation_stack_sequence': 'Encapsulation'
    }
    return layer_columns

# 3. Parse Sequences from Multiple Columns
def parse_sequences_from_columns(dataframe, layer_columns):
    """
    Parses material sequences from multiple layer-specific columns and maps materials to their layers.
    
    Parameters:
        dataframe (pd.DataFrame): The filtered DataFrame.
        layer_columns (dict): Dictionary mapping column names to layer names.
    
    Returns:
        list: Tokenized sequences (list of materials).
        dict: Mapping of materials to layers with occurrence counts.
        list: List of unique layer names.
    """
    sequences = []
    material_layer_map = {}  # Material to layers mapping with counts
    layer_names = list(layer_columns.values())
    
    for idx, row in dataframe.iterrows():
        sequence = []
        for col, layer_name in layer_columns.items():
            seq_str = row.get(col, "")
            if pd.isna(seq_str) or not seq_str.strip():
                continue
            # Split the sequence into sub-layers if applicable
            sub_layers = seq_str.split(' | ')
            for sub_layer in sub_layers:
                # Split sub-layers into materials
                materials = [material.strip() for material in sub_layer.split('; ') if material.strip()]
                sequence.extend(materials)
                for material in materials:
                    if material not in material_layer_map:
                        material_layer_map[material] = {}
                    if layer_name not in material_layer_map[material]:
                        material_layer_map[material][layer_name] = 0
                    material_layer_map[material][layer_name] += 1
        sequences.append(sequence)
    
    return sequences, material_layer_map, layer_names

# 4. Train Word2Vec Model
def train_word2vec(sequences, vector_size=50, window=5, min_count=1, workers=4, sg=1):
    """
    Trains a Word2Vec model on the provided material sequences.
    
    Parameters:
        sequences (list): List of tokenized material sequences.
        vector_size (int): Dimensionality of the embeddings.
        window (int): Context window size.
        min_count (int): Minimum frequency count of materials.
        workers (int): Number of worker threads.
        sg (int): Training algorithm (1 for skip-gram; otherwise CBOW).
    
    Returns:
        Word2Vec: Trained Word2Vec model.
    """
    model = Word2Vec(
        sentences=sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg
    )
    return model

# 5. Assign Primary Layers to Materials
def assign_primary_layers(material_layer_map):
    """
    Assigns each material to its primary layer based on the highest occurrence.
    
    Parameters:
        material_layer_map (dict): Mapping of materials to layers with occurrence counts.
    
    Returns:
        dict: Mapping of materials to their primary layer.
    """
    material_primary_layer = {}
    for material, layers in material_layer_map.items():
        # Assign the material to the layer where it occurs most frequently
        primary_layer = max(layers, key=layers.get)
        material_primary_layer[material] = primary_layer
    return material_primary_layer

# 6. Assign Colors to Layers
def assign_colors_to_layers(layer_names):
    """
    Assigns distinct colors to each layer using a colormap.
    
    Parameters:
        layer_names (list): List of unique layer names.
    
    Returns:
        dict: Mapping of layer names to colors.
    """
    num_layers = len(layer_names)
    cmap = cm.get_cmap('tab10', num_layers) if num_layers <= 10 else cm.get_cmap('tab20', num_layers)
    
    layer_colors = {}
    for idx, layer_name in enumerate(layer_names):
        layer_colors[layer_name] = cmap(idx)
    return layer_colors

# 7. Extract Embeddings
def extract_embeddings(model, materials):
    """
    Extracts embeddings for each material from the Word2Vec model.
    
    Parameters:
        model (Word2Vec): Trained Word2Vec model.
        materials (list): List of materials.
    
    Returns:
        np.ndarray: Array of embeddings.
    """
    embeddings = np.array([model.wv[material] for material in materials])
    return embeddings

# 8. Aggregate Embeddings for Each Sample
def aggregate_embeddings(sequences, model, vector_size=50):
    """
    Aggregates material embeddings for each sample by averaging.
    
    Parameters:
        sequences (list): List of tokenized material sequences for each sample.
        model (Word2Vec): Trained Word2Vec model.
        vector_size (int): Dimensionality of the embeddings.
    
    Returns:
        np.ndarray: Aggregated feature matrix.
    """
    aggregated_features = []
    for seq in sequences:
        if len(seq) == 0:
            aggregated_features.append(np.zeros(vector_size))
            continue
        vectors = [model.wv[material] for material in seq if material in model.wv]
        if vectors:
            aggregated = np.mean(vectors, axis=0)
        else:
            aggregated = np.zeros(vector_size)
        aggregated_features.append(aggregated)
    return np.array(aggregated_features)

# 9. Plot Embeddings with Color Coding and Labels
def plot_embeddings_colored(embeddings_2d, materials, material_primary_layer, layer_colors, title, annotate=True):
    """
    Plots the 2D embeddings with colors based on their primary layers and labels each vector.
    
    Parameters:
        embeddings_2d (np.ndarray): 2D embeddings.
        materials (list): List of materials.
        material_primary_layer (dict): Mapping of materials to their primary layers.
        layer_colors (dict): Mapping of layer names to colors.
        title (str): Title of the plot.
        annotate (bool): Whether to annotate material names on the plot.
    """
    plt.figure(figsize=(14, 12))
    
    # Assign colors to each material based on its primary layer
    colors_list = [layer_colors[material_primary_layer[material]] for material in materials]
    
    # Create scatter plot
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=colors_list, alpha=0.7, edgecolors='w', linewidth=0.5)
    
    # Optionally annotate materials
    if annotate:
        for i, material in enumerate(materials):
            plt.annotate(material, xy=(embeddings_2d[i, 0], embeddings_2d[i, 1]),
                         fontsize=8, alpha=0.75, ha='right', va='bottom')
    
    # Create legend
    legend_handles = []
    for layer_name, color in layer_colors.items():
        patch = mpatches.Patch(color=color, label=layer_name)
        legend_handles.append(patch)
    plt.legend(handles=legend_handles, title='Layers', bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.title(title, fontsize=16)
    plt.xlabel('Dimension 1', fontsize=14)
    plt.ylabel('Dimension 2', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

# 10. Prepare Feature Matrix and Target Vector
def prepare_features_targets(aggregated_features, dataframe, target_column='JV_default_PCE'):
    """
    Prepares the feature matrix and target vector for model training.
    Handles missing values in the target by removing corresponding samples.
    
    Parameters:
        aggregated_features (np.ndarray): Aggregated feature matrix.
        dataframe (pd.DataFrame): Original DataFrame.
        target_column (str): Name of the target column.
    
    Returns:
        np.ndarray: Feature matrix after handling missing targets.
        np.ndarray: Target vector after handling missing targets.
    """
    # Combine features and target into a DataFrame for easier handling
    feature_df = pd.DataFrame(aggregated_features)
    target_series = dataframe[target_column]
    
    # Concatenate features and target
    combined_df = pd.concat([feature_df, target_series], axis=1)
    
    # Drop rows where target is NaN
    initial_count = combined_df.shape[0]
    combined_df = combined_df.dropna(subset=[target_column])
    final_count = combined_df.shape[0]
    dropped = initial_count - final_count
    if dropped > 0:
        print(f"Dropped {dropped} samples due to NaN in target '{target_column}'.")
    
    # Separate features and target
    X = combined_df.drop(columns=[target_column]).values
    y = combined_df[target_column].values
    
    return X, y

# 11. Define Models and Hyperparameters
def define_models_hyperparameters():
    """
    Defines the models and their corresponding hyperparameters for optimization.
    
    Returns:
        dict: Dictionary mapping model names to their scikit-learn estimator and hyperparameter grid.
    """
    models = {
        'RandomForest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5],
                'min_samples_leaf': [1, 2]
            }
        },
        'GradientBoosting': {
            'model': GradientBoostingRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 200],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5],
                'min_samples_split': [2, 5]
            }
        },
        'SVR': {
            'model': SVR(),
            'params': {
                'C': [1, 10],
                'epsilon': [0.1, 0.2],
                'kernel': ['rbf', 'linear']
            }
        },
        'LinearRegression': {
            'model': LinearRegression(),
            'params': {
                # Linear Regression has no hyperparameters to tune in scikit-learn
            }
        }
    }
    return models

# 12. Train and Evaluate Models
def train_evaluate_models(X, y, models, cv=5):
    """
    Trains and evaluates models with different hyperparameters using cross-validation.
    
    Parameters:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Target vector.
        models (dict): Dictionary of models and their hyperparameters.
        cv (int): Number of cross-validation folds.
    
    Returns:
        list: List of dictionaries containing model details and performance metrics.
    """
    results = []
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    
    for model_name, config in models.items():
        estimator = config['model']
        param_grid = config['params']
        
        # If there are no hyperparameters to tune (e.g., LinearRegression)
        if not param_grid:
            print(f"Training {model_name} with default parameters.")
            mae_list = []
            mse_list = []
            r2_list = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                estimator.fit(X_train, y_train)
                y_pred = estimator.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)
                mae_list.append(mae)
                mse_list.append(mse)
                r2_list.append(r2)
            results.append({
                'Model': model_name,
                'Parameters': 'Default',
                'MAE': np.mean(mae_list),
                'MSE': np.mean(mse_list),
                'R2': np.mean(r2_list)
            })
            print(f"{model_name} -> MAE: {np.mean(mae_list):.4f}, MSE: {np.mean(mse_list):.4f}, R2: {np.mean(r2_list):.4f}")
            continue
        
        # Generate all combinations of hyperparameters
        keys = list(param_grid.keys())
        values = list(param_grid.values())
        for combo in product(*values):
            params = dict(zip(keys, combo))
            estimator.set_params(**params)
            mae_list = []
            mse_list = []
            r2_list = []
            for train_index, test_index in kf.split(X):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]
                estimator.fit(X_train, y_train)
                y_pred = estimator.predict(X_test)
                mae = mean_absolute_error(y_test, y_pred)
                mse = mean_squared_error(y_test, y_pred)
                r2 = r2_score(y_test, y_pred)
                mae_list.append(mae)
                mse_list.append(mse)
                r2_list.append(r2)
            results.append({
                'Model': model_name,
                'Parameters': params,
                'MAE': np.mean(mae_list),
                'MSE': np.mean(mse_list),
                'R2': np.mean(r2_list)
            })
            print(f"Trained {model_name} with parameters {params} -> MAE: {np.mean(mae_list):.4f}, MSE: {np.mean(mse_list):.4f}, R2: {np.mean(r2_list):.4f}")
    
    return results

# 13. Save Results to CSV
def save_results_to_csv(results, filename='model_results.csv'):
    """
    Saves the model training results to a CSV file.
    
    Parameters:
        results (list): List of dictionaries containing model details and performance metrics.
        filename (str): Name of the CSV file to save the results.
    """
    if not results:
        print("No results to save.")
        return
    keys = results[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(results)
    print(f"Results saved to {filename}")

# 14. Main Execution Function
def main():
    # File path to the CSV dataset
    file_path = 'perovskite_database_query.csv'
    
    # 1. Load and filter data
    data = load_and_filter_data(file_path)
    print(f"Loaded data with {data.shape[0]} samples.")
    
    # 2. Define layer columns and their corresponding layer names
    layer_columns = define_layer_columns()
    
    # 3. Parse sequences from the specified columns
    tokenized_sequences, material_layer_map, layer_names = parse_sequences_from_columns(data, layer_columns)
    print(f"Parsed sequences from columns: {list(layer_columns.keys())}")
    
    # 4. Train Word2Vec model
    model = train_word2vec(tokenized_sequences)
    print("Trained Word2Vec model.")
    
    # 5. Get list of unique materials
    materials = list(model.wv.index_to_key)
    print(f"Number of unique materials in specified stack sequences: {len(materials)}")
    
    # 6. Assign primary layers to materials
    material_primary_layer = assign_primary_layers(material_layer_map)
    
    # 7. Assign colors to layers
    layer_colors = assign_colors_to_layers(layer_names)
    
    # 8. Extract embeddings
    embeddings = extract_embeddings(model, materials)
    
    # 9. Aggregate embeddings for each sample
    aggregated_features = aggregate_embeddings(tokenized_sequences, model, vector_size=model.vector_size)
    print("Aggregated embeddings for each sample.")
    
    # 10. Prepare feature matrix and target vector (Handle Missing Targets)
    X, y = prepare_features_targets(aggregated_features, data, target_column='JV_default_PCE')
    print(f"Prepared feature matrix with shape {X.shape} and target vector with shape {y.shape}.")
    
    # 11. Define models and hyperparameters
    models = define_models_hyperparameters()
    
    # 12. Train and evaluate models
    print("Starting model training and evaluation...")
    results = train_evaluate_models(X, y, models, cv=5)
    
    # 13. Save results to CSV
    save_results_to_csv(results, filename='model_results.csv')
    
    # Optional: Save Word2Vec model for future use
    model.save("word2vec_model_full_stack_sequence.model")
    print("Word2Vec model saved as 'word2vec_model_full_stack_sequence.model'.")
    
    # Optional: Save aggregated features and target for future use
    # np.save('aggregated_features.npy', X)
    # np.save('target_vector.npy', y)
    # print("Aggregated features and target vector saved.")
    
    # 14. Visualize Embeddings (Optional)
    # PCA Visualization
    pca = PCA(n_components=2, random_state=42)
    reduced_embeddings_pca = pca.fit_transform(embeddings)
    plot_embeddings_colored(
        embeddings_2d=reduced_embeddings_pca,
        materials=materials,
        material_primary_layer=material_primary_layer,
        layer_colors=layer_colors,
        title='Material Embeddings with PCA (Colored by Layers)',
        annotate=True  # Set to True to display annotations
    )
    
    # t-SNE Visualization
    tsne = TSNE(n_components=2, perplexity=5, random_state=42, init='random', learning_rate='auto')
    reduced_embeddings_tsne = tsne.fit_transform(embeddings)
    plot_embeddings_colored(
        embeddings_2d=reduced_embeddings_tsne,
        materials=materials,
        material_primary_layer=material_primary_layer,
        layer_colors=layer_colors,
        title='Material Embeddings with t-SNE (Colored by Layers)',
        annotate=True  # Set to True to display annotations
    )

if __name__ == "__main__":
    main()



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/Users/herbrowan/Library/Python/3.10/lib/python/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Users/herbrowan/Library/Python/3.10/lib/python/site-packages/traitlets/con

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import

In [4]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
import csv
import matplotlib.colors as mcolors
import matplotlib.cm as cm

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# 1. Load and Filter Data
def load_and_filter_data(file_path):
    """
    Loads the dataset from the specified CSV file and filters rows where 'Cell_architecture' is 'nip'.
    
    Parameters:
        file_path (str): Path to the CSV file.
    
    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    # Load the dataset
    data = pd.read_csv(file_path)
    
    # Filter rows where 'Cell_architecture' is exactly 'nip' (case-insensitive)
    data = data[data['Cell_architecture'].str.strip().str.lower() == 'nip']
    
    # Reset index after filtering
    data = data.reset_index(drop=True)
    
    return data

# 2. Define Layer Columns
def define_layer_columns():
    """
    Defines the mapping between stack sequence columns and their corresponding layer names.
    
    Returns:
        dict: Dictionary mapping column names to layer names.
    """
    layer_columns = {
        'Cell_stack_sequence': 'Cell',
        'Substrate_stack_sequence': 'Substrate',
        'ETL_stack_sequence': 'ETL',
        'HTL_stack_sequence': 'HTL',
        'Backcontact_stack_sequence': 'Backcontact',
        'Add_lay_back_stack_sequence': 'Add_Lay_Back',
        'Encapsulation_stack_sequence': 'Encapsulation'
    }
    return layer_columns

# 3. Parse Sequences from Multiple Columns
def parse_sequences_from_columns(dataframe, layer_columns):
    """
    Parses material sequences from multiple layer-specific columns and maps materials to their layers.
    
    Parameters:
        dataframe (pd.DataFrame): The filtered DataFrame.
        layer_columns (dict): Dictionary mapping column names to layer names.
    
    Returns:
        list: Tokenized sequences (list of materials).
        dict: Mapping of materials to layers with occurrence counts.
        list: List of unique layer names.
    """
    sequences = []
    material_layer_map = {}  # Material to layers mapping with counts
    layer_names = list(layer_columns.values())
    
    for idx, row in dataframe.iterrows():
        sequence = []
        for col, layer_name in layer_columns.items():
            seq_str = row.get(col, "")
            if pd.isna(seq_str) or not seq_str.strip():
                continue
            # Split the sequence into sub-layers if applicable
            sub_layers = seq_str.split(' | ')
            for sub_layer in sub_layers:
                # Split sub-layers into materials
                materials = [material.strip() for material in sub_layer.split('; ') if material.strip()]
                sequence.extend(materials)
                for material in materials:
                    if material not in material_layer_map:
                        material_layer_map[material] = {}
                    if layer_name not in material_layer_map[material]:
                        material_layer_map[material][layer_name] = 0
                    material_layer_map[material][layer_name] += 1
        sequences.append(sequence)
    
    return sequences, material_layer_map, layer_names

# 4. Train Word2Vec Model
def train_word2vec(sequences, vector_size=50, window=5, min_count=1, workers=4, sg=1):
    """
    Trains a Word2Vec model on the provided material sequences.
    
    Parameters:
        sequences (list): List of tokenized material sequences.
        vector_size (int): Dimensionality of the embeddings.
        window (int): Context window size.
        min_count (int): Minimum frequency count of materials.
        workers (int): Number of worker threads.
        sg (int): Training algorithm (1 for skip-gram; otherwise CBOW).
    
    Returns:
        Word2Vec: Trained Word2Vec model.
    """
    model = Word2Vec(
        sentences=sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg
    )
    return model

# 5. Assign Primary Layers to Materials
def assign_primary_layers(material_layer_map):
    """
    Assigns each material to its primary layer based on the highest occurrence.
    
    Parameters:
        material_layer_map (dict): Mapping of materials to layers with occurrence counts.
    
    Returns:
        dict: Mapping of materials to their primary layer.
    """
    material_primary_layer = {}
    for material, layers in material_layer_map.items():
        # Assign the material to the layer where it occurs most frequently
        primary_layer = max(layers, key=layers.get)
        material_primary_layer[material] = primary_layer
    return material_primary_layer

# 6. Assign Colors to Layers
def assign_colors_to_layers(layer_names):
    """
    Assigns distinct colors to each layer using a colormap.
    
    Parameters:
        layer_names (list): List of unique layer names.
    
    Returns:
        dict: Mapping of layer names to colors.
    """
    num_layers = len(layer_names)
    cmap = cm.get_cmap('tab10', num_layers) if num_layers <= 10 else cm.get_cmap('tab20', num_layers)
    
    layer_colors = {}
    for idx, layer_name in enumerate(layer_names):
        layer_colors[layer_name] = cmap(idx)
    return layer_colors

# 7. Extract Embeddings
def extract_embeddings(model, materials):
    """
    Extracts embeddings for each material from the Word2Vec model.
    
    Parameters:
        model (Word2Vec): Trained Word2Vec model.
        materials (list): List of materials.
    
    Returns:
        np.ndarray: Array of embeddings.
    """
    embeddings = np.array([model.wv[material] for material in materials])
    return embeddings

# 8. Aggregate Embeddings for Each Sample
def aggregate_embeddings(sequences, model, vector_size=50):
    """
    Aggregates material embeddings for each sample by averaging.
    
    Parameters:
        sequences (list): List of tokenized material sequences for each sample.
        model (Word2Vec): Trained Word2Vec model.
        vector_size (int): Dimensionality of the embeddings.
    
    Returns:
        np.ndarray: Aggregated feature matrix.
    """
    aggregated_features = []
    for seq in sequences:
        if len(seq) == 0:
            aggregated_features.append(np.zeros(vector_size))
            continue
        vectors = [model.wv[material] for material in seq if material in model.wv]
        if vectors:
            aggregated = np.mean(vectors, axis=0)
        else:
            aggregated = np.zeros(vector_size)
        aggregated_features.append(aggregated)
    return np.array(aggregated_features)

# 9. Plot Embeddings with Color Coding and Labels
def plot_embeddings_colored(embeddings_2d, materials, material_primary_layer, layer_colors, title, annotate=True):
    """
    Plots the 2D embeddings with colors based on their primary layers and labels each vector.
    
    Parameters:
        embeddings_2d (np.ndarray): 2D embeddings.
        materials (list): List of materials.
        material_primary_layer (dict): Mapping of materials to their primary layers.
        layer_colors (dict): Mapping of layer names to colors.
        title (str): Title of the plot.
        annotate (bool): Whether to annotate material names on the plot.
    """
    plt.figure(figsize=(14, 12))
    
    # Assign colors to each material based on its primary layer
    colors_list = [layer_colors[material_primary_layer[material]] for material in materials]
    
    # Create scatter plot
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=colors_list, alpha=0.7, edgecolors='w', linewidth=0.5)
    
    # Optionally annotate materials
    if annotate:
        for i, material in enumerate(materials):
            plt.annotate(material, xy=(embeddings_2d[i, 0], embeddings_2d[i, 1]),
                         fontsize=8, alpha=0.75, ha='right', va='bottom')
    
    # Create legend
    legend_handles = []
    for layer_name, color in layer_colors.items():
        patch = mpatches.Patch(color=color, label=layer_name)
        legend_handles.append(patch)
    plt.legend(handles=legend_handles, title='Layers', bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.title(title, fontsize=16)
    plt.xlabel('Dimension 1', fontsize=14)
    plt.ylabel('Dimension 2', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

# 10. Prepare Feature Matrix and Target Vector
def prepare_features_targets(aggregated_features, dataframe, target_column='JV_default_PCE'):
    """
    Prepares the feature matrix and target vector for model training.
    Handles missing values in the target by removing corresponding samples.
    
    Parameters:
        aggregated_features (np.ndarray): Aggregated feature matrix.
        dataframe (pd.DataFrame): Original DataFrame.
        target_column (str): Name of the target column.
    
    Returns:
        np.ndarray: Feature matrix after handling missing targets.
        np.ndarray: Target vector after handling missing targets.
    """
    # Combine features and target into a DataFrame for easier handling
    feature_df = pd.DataFrame(aggregated_features)
    target_series = dataframe[target_column]
    
    # Concatenate features and target
    combined_df = pd.concat([feature_df, target_series], axis=1)
    
    # Drop rows where target is NaN
    initial_count = combined_df.shape[0]
    combined_df = combined_df.dropna(subset=[target_column])
    final_count = combined_df.shape[0]
    dropped = initial_count - final_count
    if dropped > 0:
        print(f"Dropped {dropped} samples due to NaN in target '{target_column}'.")
    
    # Separate features and target
    X = combined_df.drop(columns=[target_column]).values
    y = combined_df[target_column].values
    
    return X, y

# 11. Define Models and Hyperparameters
def define_models_hyperparameters():
    """
    Defines the models and their corresponding hyperparameters for optimization.
    
    Returns:
        dict: Dictionary mapping model names to their scikit-learn estimator and hyperparameter grid.
    """
    models = {
        'RandomForest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],          # Reduced options
                'max_depth': [None, 10],             # Fewer options
                'min_samples_split': [2, 4],         # Fewer options
                'min_samples_leaf': [1, 2]           # Fewer options
            }
        },
        'GradientBoosting': {
            'model': GradientBoostingRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],          # Reduced options
                'learning_rate': [0.05, 0.1],        # Fewer options
                'max_depth': [3, 4],                  # Fewer options
                'min_samples_split': [2, 4]          # Fewer options
            }
        }
        # You can add more models if time permits
    }
    return models

# 12. Train and Evaluate Models using GridSearchCV
def train_evaluate_models_with_grid_search(X, y, models, cv=3):
    """
    Trains and evaluates models using GridSearchCV for hyperparameter tuning.
    
    Parameters:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Target vector.
        models (dict): Dictionary of models and their hyperparameters.
        cv (int): Number of cross-validation folds.
    
    Returns:
        list: List of dictionaries containing model details and performance metrics.
    """
    results = []
    
    for model_name, config in models.items():
        estimator = config['model']
        param_grid = config['params']
        
        print(f"Training {model_name}...")
        
        grid_search = GridSearchCV(
            estimator=estimator,
            param_grid=param_grid,
            cv=cv,
            scoring='r2',
            n_jobs=-1,  # Utilize all available cores
            verbose=0
        )
        
        grid_search.fit(X, y)
        
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        
        # To get MAE and MSE, perform cross-validation predictions
        # Since GridSearchCV with scoring='r2' does not provide MAE and MSE, we'll compute them manually
        # Alternatively, you can use multiple scoring metrics in GridSearchCV
        # For simplicity, we'll use cross_val_predict here
        
        # Re-train the best estimator on the entire dataset
        best_estimator = grid_search.best_estimator_
        best_estimator.fit(X, y)
        y_pred = best_estimator.predict(X)
        
        mae = mean_absolute_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        
        results.append({
            'Model': model_name,
            'Parameters': best_params,
            'MAE': mae,
            'MSE': mse,
            'R2': r2
        })
        
        print(f"{model_name} Best Params: {best_params} -> MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}")
    
    return results

# 13. Save Results to CSV
def save_results_to_csv(results, filename='model_results.csv'):
    """
    Saves the model training results to a CSV file.
    
    Parameters:
        results (list): List of dictionaries containing model details and performance metrics.
        filename (str): Name of the CSV file to save the results.
    """
    if not results:
        print("No results to save.")
        return
    keys = results[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(results)
    print(f"Results saved to {filename}")

# 14. Main Execution Function
def main():
    # File path to the CSV dataset
    file_path = 'perovskite_database_query.csv'
    
    # 1. Load and filter data
    data = load_and_filter_data(file_path)
    print(f"Loaded data with {data.shape[0]} samples.")
    
    # 2. Define layer columns and their corresponding layer names
    layer_columns = define_layer_columns()
    
    # 3. Parse sequences from the specified columns
    tokenized_sequences, material_layer_map, layer_names = parse_sequences_from_columns(data, layer_columns)
    print(f"Parsed sequences from columns: {list(layer_columns.keys())}")
    
    # 4. Train Word2Vec model
    model = train_word2vec(tokenized_sequences)
    print("Trained Word2Vec model.")
    
    # 5. Get list of unique materials
    materials = list(model.wv.index_to_key)
    print(f"Number of unique materials in specified stack sequences: {len(materials)}")
    
    # 6. Assign primary layers to materials
    material_primary_layer = assign_primary_layers(material_layer_map)
    
    # 7. Assign colors to layers
    layer_colors = assign_colors_to_layers(layer_names)
    
    # 8. Extract embeddings
    embeddings = extract_embeddings(model, materials)
    
    # 9. Aggregate embeddings for each sample
    aggregated_features = aggregate_embeddings(tokenized_sequences, model, vector_size=model.vector_size)
    print("Aggregated embeddings for each sample.")
    
    # 10. Prepare feature matrix and target vector (Handle Missing Targets)
    X, y = prepare_features_targets(aggregated_features, data, target_column='JV_default_PCE')
    print(f"Prepared feature matrix with shape {X.shape} and target vector with shape {y.shape}.")
    
    # 11. Define models and hyperparameters
    models = define_models_hyperparameters()
    
    # 12. Train and evaluate models using GridSearchCV
    print("Starting model training and evaluation...")
    results = train_evaluate_models_with_grid_search(X, y, models, cv=3)
    
    # 13. Save results to CSV
    save_results_to_csv(results, filename='model_results.csv')
    
    # Optional: Save Word2Vec model for future use
    model.save("word2vec_model_full_stack_sequence.model")
    print("Word2Vec model saved as 'word2vec_model_full_stack_sequence.model'.")
    
    # Optional: Save aggregated features and target for future use
    np.save('aggregated_features.npy', X)
    np.save('target_vector.npy', y)
    print("Aggregated features and target vector saved.")
    
    # # 14. Visualize Embeddings (Optional)
    # # PCA Visualization
    # pca = PCA(n_components=2, random_state=42)
    # reduced_embeddings_pca = pca.fit_transform(embeddings)
    # plot_embeddings_colored(
    #     embeddings_2d=reduced_embeddings_pca,
    #     materials=materials,
    #     material_primary_layer=material_primary_layer,
    #     layer_colors=layer_colors,
    #     title='Material Embeddings with PCA (Colored by Layers)',
    #     annotate=False  # Set to True to display annotations
    # )
    
    # # t-SNE Visualization
    # tsne = TSNE(n_components=2, perplexity=5, random_state=42, init='random', learning_rate='auto')
    # reduced_embeddings_tsne = tsne.fit_transform(embeddings)
    # plot_embeddings_colored(
    #     embeddings_2d=reduced_embeddings_tsne,
    #     materials=materials,
    #     material_primary_layer=material_primary_layer,
    #     layer_colors=layer_colors,
    #     title='Material Embeddings with t-SNE (Colored by Layers)',
    #     annotate=False  # Set to True to display annotations
    # )

if __name__ == "__main__":
    main()


Loaded data with 29560 samples.
Parsed sequences from columns: ['Cell_stack_sequence', 'Substrate_stack_sequence', 'ETL_stack_sequence', 'HTL_stack_sequence', 'Backcontact_stack_sequence', 'Add_lay_back_stack_sequence', 'Encapsulation_stack_sequence']
Trained Word2Vec model.
Number of unique materials in specified stack sequences: 2149
Aggregated embeddings for each sample.
Dropped 624 samples due to NaN in target 'JV_default_PCE'.
Prepared feature matrix with shape (28936, 50) and target vector with shape (28936,).
Starting model training and evaluation...
Training RandomForest...


KeyboardInterrupt: 

In [7]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
import csv

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# 1. Load and Filter Data
def load_and_filter_data(file_path):
    """
    Loads the dataset from the specified CSV file and filters rows where 'Cell_architecture' is 'nip'.
    """
    data = pd.read_csv(file_path)
    data = data[data['Cell_architecture'].str.strip().str.lower() == 'nip']
    data = data.reset_index(drop=True)
    return data

# 2. Define Layer Columns
def define_layer_columns():
    """
    Defines the mapping between stack sequence columns and their corresponding layer names.
    """
    layer_columns = {
        'Cell_stack_sequence': 'Cell',
        'Substrate_stack_sequence': 'Substrate',
        'ETL_stack_sequence': 'ETL',
        'HTL_stack_sequence': 'HTL',
        'Backcontact_stack_sequence': 'Backcontact',
        'Add_lay_back_stack_sequence': 'Add_Lay_Back',
        'Encapsulation_stack_sequence': 'Encapsulation'
    }
    return layer_columns

# 3. Parse Sequences from Multiple Columns
def parse_sequences_from_columns(dataframe, layer_columns):
    """
    Parses material sequences from multiple layer-specific columns and maps materials to their layers.
    """
    sequences = []
    material_layer_map = {}
    layer_names = list(layer_columns.values())
    
    for idx, row in dataframe.iterrows():
        sequence = []
        for col, layer_name in layer_columns.items():
            seq_str = row.get(col, "")
            if pd.isna(seq_str) or not seq_str.strip():
                continue
            sub_layers = seq_str.split(' | ')
            for sub_layer in sub_layers:
                materials = [material.strip() for material in sub_layer.split('; ') if material.strip()]
                sequence.extend(materials)
                for material in materials:
                    if material not in material_layer_map:
                        material_layer_map[material] = {}
                    if layer_name not in material_layer_map[material]:
                        material_layer_map[material][layer_name] = 0
                    material_layer_map[material][layer_name] += 1
        sequences.append(sequence)
    
    return sequences, material_layer_map, layer_names

# 4. Train Word2Vec Model
def train_word2vec(sequences, vector_size=50, window=5, min_count=1, workers=4, sg=1):
    """
    Trains a Word2Vec model on the provided material sequences.
    """
    model = Word2Vec(
        sentences=sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg
    )
    return model

def extract_composition_features(dataframe):
    """
    Extracts and processes perovskite composition features.
    Ensures all samples have the same number of features.
    """
    composition_columns = [
        'Perovskite_composition_a_ions',
        'Perovskite_composition_a_ions_coefficients',
        'Perovskite_composition_b_ions',
        'Perovskite_composition_b_ions_coefficients',
        'Perovskite_composition_c_ions',
        'Perovskite_composition_c_ions_coefficients'
    ]
    
    # First pass: determine maximum number of coefficients for each ion type
    max_coeffs = {'a': 0, 'b': 0, 'c': 0}
    
    for idx, row in dataframe.iterrows():
        for i, ion_type in enumerate(['a', 'b', 'c']):
            coeffs_str = str(row[composition_columns[i*2 + 1]]).split(';')
            max_coeffs[ion_type] = max(max_coeffs[ion_type], len(coeffs_str))
    
    total_features = sum(max_coeffs.values())
    processed_features = np.zeros((len(dataframe), total_features))
    
    # Second pass: fill in the features array
    for idx, row in dataframe.iterrows():
        feature_idx = 0
        
        # Process each pair of ion and coefficient columns
        for i, ion_type in enumerate(['a', 'b', 'c']):
            ions = str(row[composition_columns[i*2]]).split(';')
            coeffs_str = str(row[composition_columns[i*2 + 1]]).split(';')
            
            # Convert coefficients to float, using 0.0 for any invalid values
            coeffs = []
            for coeff in coeffs_str:
                try:
                    coeffs.append(float(coeff.strip()))
                except (ValueError, AttributeError):
                    coeffs.append(0.0)
            
            # Pad with zeros if needed
            while len(coeffs) < max_coeffs[ion_type]:
                coeffs.append(0.0)
            
            # Add coefficients to features array
            processed_features[idx, feature_idx:feature_idx + len(coeffs)] = coeffs
            feature_idx += max_coeffs[ion_type]
    
    return processed_features

# 6. Aggregate Embeddings for Each Sample
def aggregate_embeddings(sequences, model, vector_size=50):
    """
    Aggregates material embeddings for each sample by averaging.
    """
    aggregated_features = []
    for seq in sequences:
        if len(seq) == 0:
            aggregated_features.append(np.zeros(vector_size))
            continue
        vectors = [model.wv[material] for material in seq if material in model.wv]
        if vectors:
            aggregated = np.mean(vectors, axis=0)
        else:
            aggregated = np.zeros(vector_size)
        aggregated_features.append(aggregated)
    return np.array(aggregated_features)

# 7. Prepare Features and Targets
def prepare_features_targets(aggregated_features, dataframe, target_column='JV_default_PCE'):
    """
    Prepares the feature matrix and target vector for model training.
    Now includes both embeddings and composition features.
    """
    # Get composition features
    composition_features = extract_composition_features(dataframe)
    
    # Combine embeddings with composition features
    combined_features = np.hstack([aggregated_features, composition_features])
    
    # Handle missing values in the combined features
    combined_features = np.nan_to_num(combined_features, nan=0.0)
    
    # Combine features and target into a DataFrame for easier handling
    feature_df = pd.DataFrame(combined_features)
    target_series = dataframe[target_column]
    
    # Concatenate features and target
    combined_df = pd.concat([feature_df, target_series], axis=1)
    
    # Drop rows where target is NaN
    initial_count = combined_df.shape[0]
    combined_df = combined_df.dropna(subset=[target_column])
    final_count = combined_df.shape[0]
    dropped = initial_count - final_count
    if dropped > 0:
        print(f"Dropped {dropped} samples due to NaN in target '{target_column}'.")
    
    # Separate features and target
    X = combined_df.drop(columns=[target_column]).values
    y = combined_df[target_column].values
    
    return X, y

# 8. Define Models and Hyperparameters
def define_models_hyperparameters():
    """
    Defines the models and their corresponding hyperparameters for optimization.
    """
    models = {
        'RandomForest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],
                'max_depth': [None, 10],
                'min_samples_split': [2, 4],
                'min_samples_leaf': [1, 2]
            }
        },
        'GradientBoosting': {
            'model': GradientBoostingRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 4],
                'min_samples_split': [2, 4]
            }
        }
    }
    return models

# 9. Train and Evaluate Models
def train_evaluate_models_with_grid_search(X, y, models, cv=3):
    """
    Trains and evaluates models using GridSearchCV for hyperparameter tuning.
    """
    results = []
    
    for model_name, config in models.items():
        print(f"Training {model_name}...")
        
        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=cv,
            scoring='r2',
            n_jobs=-1,
            verbose=0
        )
        
        grid_search.fit(X, y)
        
        best_params = grid_search.best_params_
        best_estimator = grid_search.best_estimator_
        best_estimator.fit(X, y)
        y_pred = best_estimator.predict(X)
        
        mae = mean_absolute_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        
        results.append({
            'Model': model_name,
            'Parameters': best_params,
            'MAE': mae,
            'MSE': mse,
            'R2': r2
        })
        
        print(f"{model_name} Best Params: {best_params}")
        print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}")
    
    return results

# 10. Save Results
def save_results_to_csv(results, filename='model_results.csv'):
    """
    Saves the model training results to a CSV file.
    """
    if not results:
        print("No results to save.")
        return
    
    keys = results[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(results)
    print(f"Results saved to {filename}")

# 11. Main Execution Function
def main():
    # File path to the CSV dataset
    file_path = 'perovskite_database_query.csv'
    
    # Load and filter data
    data = load_and_filter_data(file_path)
    print(f"Loaded data with {data.shape[0]} samples.")
    
    # Define layer columns
    layer_columns = define_layer_columns()
    
    # Parse sequences
    tokenized_sequences, material_layer_map, layer_names = parse_sequences_from_columns(data, layer_columns)
    print("Parsed sequences from columns.")
    
    # Train Word2Vec model
    model = train_word2vec(tokenized_sequences)
    print("Trained Word2Vec model.")
    
    # Aggregate embeddings
    aggregated_features = aggregate_embeddings(tokenized_sequences, model)
    print("Aggregated embeddings for each sample.")
    
    # Prepare features and target
    X, y = prepare_features_targets(aggregated_features, data)
    print(f"Prepared feature matrix with shape {X.shape} and target vector with shape {y.shape}")
    
    # Define models and train
    models = define_models_hyperparameters()
    print("Starting model training and evaluation...")
    results = train_evaluate_models_with_grid_search(X, y, models, cv=3)
    
    # Save results
    save_results_to_csv(results)
    
    # Save Word2Vec model
    model.save("word2vec_model_full_stack_sequence.model")
    print("Word2Vec model saved as 'word2vec_model_full_stack_sequence.model'")

if __name__ == "__main__":
    main()

Loaded data with 29560 samples.
Parsed sequences from columns.
Trained Word2Vec model.
Aggregated embeddings for each sample.
Dropped 624 samples due to NaN in target 'JV_default_PCE'.
Prepared feature matrix with shape (28936, 61) and target vector with shape (28936,)
Starting model training and evaluation...
Training RandomForest...
RandomForest Best Params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 150}
MAE: 2.9252, MSE: 14.4245, R2: 0.4897
Training GradientBoosting...
GradientBoosting Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 2, 'n_estimators': 150}
MAE: 3.0452, MSE: 15.2214, R2: 0.4615
Results saved to model_results.csv
Word2Vec model saved as 'word2vec_model_full_stack_sequence.model'


In [2]:
import pandas as pd
import re

# Load the CSV file
file_path = "perovskite_database_query.csv"
data = pd.read_csv(file_path)

# Define the columns to keep
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture',
    'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence'
]

# Filter columns to keep only those that exist in the dataset
existing_columns = [col for col in columns_to_keep if col in data.columns]
data = data[existing_columns]

data.columns = data.columns.str.strip()

# # Add an index column
# data.reset_index(inplace=True)
# data.rename(columns={'index': 'Index'}, inplace=True)

# # Save the filtered dataset to a new CSV file
# output_path = 'filtered_DatabaseMaterials_with_index.csv'
# data.to_csv(output_path, index=False)
# print("Filtered dataset with index saved as", output_path)

# Create a dataframe for ions and their coefficients
ion_columns = [
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients'
]

ion_data = data[ion_columns]

# # Save the unchanged ion data
# output_path = 'ion_data_unchanged.csv'
# ion_data.to_csv(output_path, index=False)
# print("Unchanged ion data saved as", output_path)

# Function to clean molecule names
def clean_molecule_name(name):
    name = re.sub(r'[^a-zA-Z0-9\s\-()]+', ' ', name.strip())
    name = re.sub(r'\s+', ' ', name).strip()
    elements = [element for element in name.split() if element and not element.replace('.', '', 1).isdigit()]
    return elements

# Function to clean and convert coefficients to floats
def clean_and_convert_coefficient(coefficient):
    try:
        cleaned_coefficient = re.sub(r'[^0-9.eE-]', '', coefficient.replace(',', '').strip())
        return float(cleaned_coefficient) if cleaned_coefficient else 0.0
    except ValueError:
        return 0.0

# Function to normalize coefficients
def normalize_coefficients(cell):
    if pd.notna(cell):
        try:
            coefficients = [float(x.strip()) for x in re.split(r'[;|]', cell) if x.strip()]
            total_sum = sum(coefficients)
            return ';'.join(f"{val / total_sum:.3f}" for val in coefficients) if total_sum > 0 else cell
        except ValueError:
            return cell
    return cell

# Normalize coefficients in each column
coefficient_columns = [
    'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions_coefficients', 
    'Perovskite_composition_c_ions_coefficients'
]

for col in coefficient_columns:
    data[col] = data[col].apply(normalize_coefficients)

# Create a set of unique molecules and add new columns
unique_molecules = set()
for column_group in ['a', 'b', 'c']:
    ions_column = f'Perovskite_composition_{column_group}_ions'
    coefficients_column = f'Perovskite_composition_{column_group}_ions_coefficients'
    for _, row in data.iterrows():
        ions, _ = clean_molecule_name(str(row[ions_column])), [clean_and_convert_coefficient(c) for c in str(row[coefficients_column]).split(';')]
        unique_molecules.update(ions)

# Create columns for each unique molecule and calculate proportions
for molecule in unique_molecules:
    data[molecule] = 0.0

for index, row in data[coefficient_columns].iterrows():
    for column_group in ['a', 'b', 'c']:
        ions_column = f'Perovskite_composition_{column_group}_ions'
        coefficients_column = f'Perovskite_composition_{column_group}_ions_coefficients'
        ions = clean_molecule_name(str(row.get(ions_column, "")))
        coefficients = [clean_and_convert_coefficient(c) for c in str(row[coefficients_column]).split(';')]
        total_coeff = sum(coefficients) if sum(coefficients) != 0 else 1
        
        for ion, coeff in zip(ions, coefficients):
            data.at[index, ion] += coeff / total_coeff


# Create a new column 'Layer_Type' to indicate if the row is multilayered or single-layered

data['Layer Type'] = data.apply(
    lambda row: 'Multi-layered Perovskite' if any('|' in str(row[col]) for col in ion_columns) else 'Single-layered Perovskite',
    axis=1
)

# Add/append columns for 'combined ions' and 'combined coefficients' - vector embedding

data['combined_ions'] = data.apply(
    lambda row: f"{row.get('Perovskite_composition_a_ions', '')},{row.get('Perovskite_composition_b_ions', '')},{row.get('Perovskite_composition_c_ions', '')}", 
    axis=1
)

data['combined_coefficients'] = data.apply(
    lambda row: f"{row.get('Perovskite_composition_a_ions_coefficients', '')},{row.get('Perovskite_composition_b_ions_coefficients', '')},{row.get('Perovskite_composition_c_ions_coefficients', '')}", 
    axis=1
)

# ### make sure all combined ions and combined coefficients are lists

import re

# Function to convert a string with mixed delimiters to a list
def convert_to_list(entry):
    if isinstance(entry, str):
        # Replace semicolons and pipes with commas for uniformity
        entry = re.sub(r'[;|]', ',', entry)
        # Split the string by commas and strip spaces around each item
        return [item.strip() for item in entry.split(',') if item.strip()]
    elif isinstance(entry, list):
        return entry  # Already a list, no action needed
    else:
        return []  # Handle missing or invalid entries

# Function to convert string entries to float and handle non-numeric values
def safe_convert_to_float(entry):
    try:
        return float(entry)  # Attempt to convert to float
    except ValueError:
        return None  # If conversion fails, return None (or handle as needed)


### add an indication of perovskite site

# Add the 'combined_sites' column
def generate_combined_sites(row):
    # Split combined ions and coefficients into lists
    ions = row['combined_ions']
    coefficients = row['combined_coefficients']
    
    # Assign sites ('a', 'b', 'c') based on the origin of each ion/coefficient
    sites = []
    site_labels = ['a', 'b', 'c']
    for site, ions_col, coeff_col in zip(site_labels, 
                                         ['Perovskite_composition_a_ions', 'Perovskite_composition_b_ions', 'Perovskite_composition_c_ions'], 
                                         ['Perovskite_composition_a_ions_coefficients', 'Perovskite_composition_b_ions_coefficients', 'Perovskite_composition_c_ions_coefficients']):
        # Count the number of ions and coefficients from this site
        num_ions = len(clean_molecule_name(str(row.get(ions_col, ""))))
        num_coefficients = len(str(row.get(coeff_col, "")).split(';'))
        
        # Append the site label for each ion/coefficient from this site
        sites.extend([site] * max(num_ions, num_coefficients))
    
    return sites

# Apply the function to generate the 'combined_sites' column
data['combined_sites'] = data.apply(generate_combined_sites, axis=1)

def clean_coefficients(coefficients):
    """
    Cleans the coefficients by ensuring all values are numeric.
    Invalid or non-numeric values are replaced with 0.0.
    If the value is already a float, it is left unchanged.
    """
    cleaned = []
    for c in coefficients:
        if isinstance(c, float):  # If already a float, keep it as is
            cleaned.append(c)
        elif isinstance(c, str) and c.replace('.', '', 1).isdigit():  # If a valid string representation of a number
            cleaned.append(float(c))
        else:  # For invalid entries
            cleaned.append(0.0)
    return cleaned

def normalize_coefficients_within_cell(row):
    """
    Normalizes the coefficients for each site ('a', 'b', 'c') within a cell.
    Ensures that the sum of coefficients for each site equals 1.
    """
    # Extract ions, coefficients, and sites for the row
    ions = row['combined_ions']
    coefficients = row['combined_coefficients']
    sites = row['combined_sites']
    
    # Initialize lists for each site
    site_a_coeffs = []
    site_b_coeffs = []
    site_c_coeffs = []
    
    # Separate the coefficients by their sites
    for coeff, site in zip(coefficients, sites):
        try:
            coeff = float(coeff)  # Ensure coefficients are numeric
        except ValueError:
            coeff = 0.0  # Default to 0.0 if invalid
        if site == 'a':
            site_a_coeffs.append(coeff)
        elif site == 'b':
            site_b_coeffs.append(coeff)
        elif site == 'c':
            site_c_coeffs.append(coeff)
    
    # Normalize the coefficients for each site if their sum is not zero
    def normalize(site_coeffs):
        total = sum(site_coeffs)
        return [coeff / total for coeff in site_coeffs] if total > 0 else site_coeffs
    
    site_a_coeffs = normalize(site_a_coeffs)
    site_b_coeffs = normalize(site_b_coeffs)
    site_c_coeffs = normalize(site_c_coeffs)
    
    # Combine all coefficients back into a single list
    normalized_coeffs = site_a_coeffs + site_b_coeffs + site_c_coeffs

    return normalized_coeffs



# Apply the function to both columns
data['combined_ions'] = data['combined_ions'].apply(convert_to_list)
data['combined_coefficients'] = data['combined_coefficients'].apply(
    lambda x: [safe_convert_to_float(item) for item in convert_to_list(x)]  # Convert to float for coefficients, handle errors
)

# Step 1: Clean the coefficients column
data['combined_coefficients'] = data['combined_coefficients'].apply(clean_coefficients)

# Step 2: Normalize coefficients within each cell
data['combined_coefficients'] = data.apply(normalize_coefficients_within_cell, axis=1)

# Verify the transformation
print(data[['combined_ions', 'combined_coefficients', 'combined_sites']].head())


# Drop the original ion columns
data = data.drop(columns=ion_columns, errors='ignore')

# Save the modified DataFrame with the 'Layer Type' and combined columns
output_file_path = 'data_with_layer_type_and_combined.csv'
data.to_csv(output_file_path, index=False)
print("CSV file with layer type information modified and saved as:", output_file_path)

  data = pd.read_csv(file_path)
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  

     combined_ions combined_coefficients combined_sites
0      [Cs, Sn, I]       [1.0, 1.0, 1.0]      [a, b, c]
1  [Cs, Sn, Br, I]  [1.0, 1.0, 0.1, 0.9]   [a, b, c, c]
2  [Cs, Sn, Br, I]  [1.0, 1.0, 0.5, 0.5]   [a, b, c, c]
3  [Cs, Sn, Br, I]  [1.0, 1.0, 0.9, 0.1]   [a, b, c, c]
4     [Cs, Sn, Br]       [1.0, 1.0, 1.0]      [a, b, c]
CSV file with layer type information modified and saved as: data_with_layer_type_and_combined.csv


In [3]:
import pandas as pd
import re

# Load the CSV file
file_path = "perovskite_database_query.csv"
data = pd.read_csv(file_path)

# Define the columns to keep
columns_to_keep = [
    'Cell_stack_sequence', 'Cell_architecture',
    'Substrate_stack_sequence', 'Substrate_thickness',
    'ETL_stack_sequence', 'ETL_thickness', 'ETL_additives_compounds', 'ETL_additives_concentrations',
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients', 
    'Perovskite_additives_compounds', 'Perovskite_additives_concentrations', 'Perovskite_thickness',
    'HTL_stack_sequence', 'HTL_thickness_list', 'HTL_additives_compounds', 'HTL_additives_concentrations',
    'Backcontact_stack_sequence', 'Backcontact_thickness', 
    'Backcontact_additives_compounds', 'Backcontact_additives_concentrations',
    'Add_lay_front', 'Add_lay_front_function', 'Add_lay_front_stack_sequence', 'Add_lay_front_thickness_list', 
    'Add_lay_front_additives_compounds', 'Add_lay_front_additives_concentrations',
    'Add_lay_back', 'Add_lay_back_function', 'Add_lay_back_stack_sequence', 'Add_lay_back_thickness_list', 
    'Add_lay_back_additives_compounds', 'Add_lay_back_additives_concentrations',
    'Encapsulation', 'Encapsulation_stack_sequence',
    'JV_default_PCE'  # Added the target column
]

# Filter columns to keep only those that exist in the dataset
existing_columns = [col for col in columns_to_keep if col in data.columns]
data = data[existing_columns]

data.columns = data.columns.str.strip()

# Function to clean molecule names
def clean_molecule_name(name):
    name = re.sub(r'[^a-zA-Z0-9\s\-()]+', ' ', name.strip())
    name = re.sub(r'\s+', ' ', name).strip()
    elements = [element for element in name.split() if element and not element.replace('.', '', 1).isdigit()]
    return elements

# Function to clean and convert coefficients to floats
def clean_and_convert_coefficient(coefficient):
    try:
        cleaned_coefficient = re.sub(r'[^0-9.eE-]', '', coefficient.replace(',', '').strip())
        return float(cleaned_coefficient) if cleaned_coefficient else 0.0
    except ValueError:
        return 0.0

# Function to normalize coefficients
def normalize_coefficients(cell):
    if pd.notna(cell):
        try:
            coefficients = [float(x.strip()) for x in re.split(r'[;|]', cell) if x.strip()]
            total_sum = sum(coefficients)
            return ';'.join(f"{val / total_sum:.3f}" for val in coefficients) if total_sum > 0 else cell
        except ValueError:
            return cell
    return cell

# Normalize coefficients in each column
coefficient_columns = [
    'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions_coefficients', 
    'Perovskite_composition_c_ions_coefficients'
]

for col in coefficient_columns:
    data[col] = data[col].apply(normalize_coefficients)

# Create a set of unique molecules and add new columns
unique_molecules = set()
for index, row in data.iterrows():
    for column_group in ['a', 'b', 'c']:
        ions_column = f'Perovskite_composition_{column_group}_ions'
        coefficients_column = f'Perovskite_composition_{column_group}_ions_coefficients'
        ions = clean_molecule_name(str(row.get(ions_column, "")))
        coefficients = [clean_and_convert_coefficient(c) for c in str(row.get(coefficients_column, "")).split(';')]
        unique_molecules.update(ions)

# Create columns for each unique molecule and initialize to zero
for molecule in unique_molecules:
    data[molecule] = 0.0

# Populate the molecule columns with coefficients
for index, row in data.iterrows():
    for column_group in ['a', 'b', 'c']:
        ions_column = f'Perovskite_composition_{column_group}_ions'
        coefficients_column = f'Perovskite_composition_{column_group}_ions_coefficients'
        ions = clean_molecule_name(str(row.get(ions_column, "")))
        coefficients = [clean_and_convert_coefficient(c) for c in str(row.get(coefficients_column, "")).split(';')]
        total_coeff = sum(coefficients) if sum(coefficients) != 0 else 1
        for ion, coeff in zip(ions, coefficients):
            data.at[index, ion] += coeff / total_coeff

# Create a new column 'Layer Type' to indicate if the row is multilayered or single-layered
ion_columns = [
    'Perovskite_composition_a_ions', 'Perovskite_composition_a_ions_coefficients', 
    'Perovskite_composition_b_ions', 'Perovskite_composition_b_ions_coefficients',
    'Perovskite_composition_c_ions', 'Perovskite_composition_c_ions_coefficients'
]

data['Layer Type'] = data.apply(
    lambda row: 'Multi-layered Perovskite' if any('|' in str(row[col]) for col in ion_columns) else 'Single-layered Perovskite',
    axis=1
)

# Add combined ions and coefficients columns
data['combined_ions'] = data.apply(
    lambda row: f"{row.get('Perovskite_composition_a_ions', '')},{row.get('Perovskite_composition_b_ions', '')},{row.get('Perovskite_composition_c_ions', '')}", 
    axis=1
)

data['combined_coefficients'] = data.apply(
    lambda row: f"{row.get('Perovskite_composition_a_ions_coefficients', '')},{row.get('Perovskite_composition_b_ions_coefficients', '')},{row.get('Perovskite_composition_c_ions_coefficients', '')}", 
    axis=1
)

# Convert combined columns to lists and clean coefficients
import re

def convert_to_list(entry):
    if isinstance(entry, str):
        entry = re.sub(r'[;|]', ',', entry)
        return [item.strip() for item in entry.split(',') if item.strip()]
    elif isinstance(entry, list):
        return entry
    else:
        return []

def safe_convert_to_float(entry):
    try:
        return float(entry)
    except ValueError:
        return 0.0

def generate_combined_sites(row):
    ions = row['combined_ions']
    coefficients = row['combined_coefficients']
    sites = []
    site_labels = ['a', 'b', 'c']
    for site, ions_col, coeff_col in zip(site_labels, 
                                         ['Perovskite_composition_a_ions', 'Perovskite_composition_b_ions', 'Perovskite_composition_c_ions'], 
                                         ['Perovskite_composition_a_ions_coefficients', 'Perovskite_composition_b_ions_coefficients', 'Perovskite_composition_c_ions_coefficients']):
        num_ions = len(clean_molecule_name(str(row.get(ions_col, ""))))
        num_coefficients = len(str(row.get(coeff_col, "")).split(';'))
        sites.extend([site] * max(num_ions, num_coefficients))
    return sites

data['combined_ions'] = data['combined_ions'].apply(convert_to_list)
data['combined_coefficients'] = data['combined_coefficients'].apply(
    lambda x: [safe_convert_to_float(item) for item in convert_to_list(x)]
)
data['combined_sites'] = data.apply(generate_combined_sites, axis=1)

def clean_coefficients(coefficients):
    cleaned = []
    for c in coefficients:
        if isinstance(c, float):
            cleaned.append(c)
        elif isinstance(c, str) and c.replace('.', '', 1).isdigit():
            cleaned.append(float(c))
        else:
            cleaned.append(0.0)
    return cleaned

def normalize_coefficients_within_cell(row):
    ions = row['combined_ions']
    coefficients = row['combined_coefficients']
    sites = row['combined_sites']
    site_a_coeffs = []
    site_b_coeffs = []
    site_c_coeffs = []
    for coeff, site in zip(coefficients, sites):
        try:
            coeff = float(coeff)
        except ValueError:
            coeff = 0.0
        if site == 'a':
            site_a_coeffs.append(coeff)
        elif site == 'b':
            site_b_coeffs.append(coeff)
        elif site == 'c':
            site_c_coeffs.append(coeff)
    def normalize(site_coeffs):
        total = sum(site_coeffs)
        return [coeff / total if total > 0 else 0.0 for coeff in site_coeffs]
    site_a_coeffs = normalize(site_a_coeffs)
    site_b_coeffs = normalize(site_b_coeffs)
    site_c_coeffs = normalize(site_c_coeffs)
    normalized_coeffs = site_a_coeffs + site_b_coeffs + site_c_coeffs
    return normalized_coeffs

data['combined_coefficients'] = data['combined_coefficients'].apply(clean_coefficients)
data['combined_coefficients'] = data.apply(normalize_coefficients_within_cell, axis=1)

# Drop the original ion columns
data = data.drop(columns=ion_columns, errors='ignore')

# Save the modified DataFrame
output_file_path = 'data_with_layer_type_and_combined.csv'
data.to_csv(output_file_path, index=False)
print("CSV file with layer type information modified and saved as:", output_file_path)


  data = pd.read_csv(file_path)
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  data[molecule] = 0.0
  

CSV file with layer type information modified and saved as: data_with_layer_type_and_combined.csv


In [4]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import csv
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# 1. Load and Filter Data
def load_and_filter_data(file_path):
    data = pd.read_csv(file_path)
    data = data[data['Cell_architecture'].str.strip().str.lower() == 'nip']
    data = data.reset_index(drop=True)
    return data

# 2. Define Layer Columns
def define_layer_columns():
    layer_columns = {
        'Cell_stack_sequence': 'Cell',
        'Substrate_stack_sequence': 'Substrate',
        'ETL_stack_sequence': 'ETL',
        'HTL_stack_sequence': 'HTL',
        'Backcontact_stack_sequence': 'Backcontact',
        'Add_lay_back_stack_sequence': 'Add_Lay_Back',
        'Encapsulation_stack_sequence': 'Encapsulation'
    }
    return layer_columns

# 3. Parse Sequences from Multiple Columns
def parse_sequences_from_columns(dataframe, layer_columns):
    sequences = []
    material_layer_map = {}
    layer_names = list(layer_columns.values())
    for idx, row in dataframe.iterrows():
        sequence = []
        for col, layer_name in layer_columns.items():
            seq_str = row.get(col, "")
            if pd.isna(seq_str) or not seq_str.strip():
                continue
            sub_layers = seq_str.split(' | ')
            for sub_layer in sub_layers:
                materials = [material.strip() for material in sub_layer.split('; ') if material.strip()]
                sequence.extend(materials)
                for material in materials:
                    if material not in material_layer_map:
                        material_layer_map[material] = {}
                    if layer_name not in material_layer_map[material]:
                        material_layer_map[material][layer_name] = 0
                    material_layer_map[material][layer_name] += 1
        sequences.append(sequence)
    return sequences, material_layer_map, layer_names

# 4. Train Word2Vec Model
def train_word2vec(sequences, vector_size=50, window=5, min_count=1, workers=4, sg=1):
    model = Word2Vec(
        sentences=sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg
    )
    return model

# 5. Extract Composition Features
def extract_composition_features(dataframe):
    composition_columns = [
        'Perovskite_composition_a_ions',
        'Perovskite_composition_a_ions_coefficients',
        'Perovskite_composition_b_ions',
        'Perovskite_composition_b_ions_coefficients',
        'Perovskite_composition_c_ions',
        'Perovskite_composition_c_ions_coefficients'
    ]
    max_coeffs = {'a': 0, 'b': 0, 'c': 0}
    for idx, row in dataframe.iterrows():
        for i, ion_type in enumerate(['a', 'b', 'c']):
            coeffs_str = str(row.get(composition_columns[i*2 + 1], "")).split(';')
            max_coeffs[ion_type] = max(max_coeffs[ion_type], len(coeffs_str))
    total_features = sum(max_coeffs.values())
    processed_features = np.zeros((len(dataframe), total_features))
    for idx, row in dataframe.iterrows():
        feature_idx = 0
        for i, ion_type in enumerate(['a', 'b', 'c']):
            ions = str(row.get(composition_columns[i*2], "")).split(';')
            coeffs_str = str(row.get(composition_columns[i*2 + 1], "")).split(';')
            coeffs = []
            for coeff in coeffs_str:
                try:
                    coeffs.append(float(coeff.strip()))
                except (ValueError, AttributeError):
                    coeffs.append(0.0)
            while len(coeffs) < max_coeffs[ion_type]:
                coeffs.append(0.0)
            processed_features[idx, feature_idx:feature_idx + len(coeffs)] = coeffs
            feature_idx += max_coeffs[ion_type]
    return processed_features

# 6. Aggregate Embeddings for Each Sample
def aggregate_embeddings(sequences, model, vector_size=50):
    aggregated_features = []
    for seq in sequences:
        if len(seq) == 0:
            aggregated_features.append(np.zeros(vector_size))
            continue
        vectors = [model.wv[material] for material in seq if material in model.wv]
        if vectors:
            aggregated = np.mean(vectors, axis=0)
        else:
            aggregated = np.zeros(vector_size)
        aggregated_features.append(aggregated)
    return np.array(aggregated_features)

# 7. Prepare Features and Targets
def prepare_features_targets(aggregated_features, dataframe, target_column='JV_default_PCE', additional_columns=None):
    composition_features = extract_composition_features(dataframe)
    combined_features = np.hstack([aggregated_features, composition_features])
    combined_features = np.nan_to_num(combined_features, nan=0.0)
    if additional_columns:
        existing_columns =


SyntaxError: invalid syntax (2447236970.py, line 124)

In [5]:
import pandas as pd
import numpy as np
import re
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import warnings
import csv

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Function to parse molecule columns from the given string
def parse_column_names(s):
    """
    Parses column names from a string, removing parentheses and whitespace.
    """
    # Remove any whitespace and trailing commas
    s = s.strip().rstrip(',')
    # Split on commas
    items = s.split(',')
    # Clean each item
    columns = []
    for item in items:
        item = item.strip()
        # Remove enclosing parentheses, if any
        item = item.strip('()')
        if item:  # Ensure the item is not empty
            columns.append(item)
    return columns

# 1. Load and Filter Data
def load_and_filter_data(file_path):
    """
    Loads the dataset from the specified CSV file and filters rows where 'Cell_architecture' is 'nip'.
    """
    data = pd.read_csv(file_path)
    data = data[data['Cell_architecture'].str.strip().str.lower() == 'nip']
    data = data.reset_index(drop=True)
    return data

# 2. Define Layer Columns
def define_layer_columns():
    """
    Defines the mapping between stack sequence columns and their corresponding layer names.
    """
    layer_columns = {
        'Cell_stack_sequence': 'Cell',
        'Substrate_stack_sequence': 'Substrate',
        'ETL_stack_sequence': 'ETL',
        'HTL_stack_sequence': 'HTL',
        'Backcontact_stack_sequence': 'Backcontact',
        'Add_lay_back_stack_sequence': 'Add_Lay_Back',
        'Encapsulation_stack_sequence': 'Encapsulation'
    }
    return layer_columns

# 3. Parse Sequences from Multiple Columns
def parse_sequences_from_columns(dataframe, layer_columns):
    """
    Parses material sequences from multiple layer-specific columns and maps materials to their layers.
    """
    sequences = []
    material_layer_map = {}
    layer_names = list(layer_columns.values())
    
    for idx, row in dataframe.iterrows():
        sequence = []
        for col, layer_name in layer_columns.items():
            seq_str = row.get(col, "")
            if pd.isna(seq_str) or not seq_str.strip():
                continue
            sub_layers = seq_str.split(' | ')
            for sub_layer in sub_layers:
                materials = [material.strip() for material in sub_layer.split('; ') if material.strip()]
                sequence.extend(materials)
                for material in materials:
                    if material not in material_layer_map:
                        material_layer_map[material] = {}
                    if layer_name not in material_layer_map[material]:
                        material_layer_map[material][layer_name] = 0
                    material_layer_map[material][layer_name] += 1
        sequences.append(sequence)
    
    return sequences, material_layer_map, layer_names

# 4. Train Word2Vec Model
def train_word2vec(sequences, vector_size=50, window=5, min_count=1, workers=4, sg=1):
    """
    Trains a Word2Vec model on the provided material sequences.
    """
    model = Word2Vec(
        sentences=sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg
    )
    return model

# 5. Prepare Features and Targets
def prepare_features_targets(aggregated_features, dataframe, target_column='JV_default_PCE'):
    """
    Prepares the feature matrix and target vector for model training.
    Now includes embeddings and specified molecule columns.
    """
    # Get the specified molecule columns
    molecule_columns_str = '(DAP),(PEI),(ThFA),Sn,S,Tb,Sm,TN,(PPA),(PDMA),(FEA),(PyrEA),OA,(PBA),(PTA),(CPEA),(TEA),(mF1PEA),FA,(BI),IM,(oF1PEA),(1,(PA),(iPA),Mg,Y,PR,(PF6),(ODA),F,BU,(Ada),Ca,NEA,(SCN),(N-EtPy),HA,(MIC1),Br,(AVA),((CH3)3S),(BIM),Mn,MA,(4AMP),(A43),(CH3)3S,(PPEA),(F5PEA),(C4H9N2H6),(5-AVAI),Sr,(DMA),(NEA),CA,Al,(NH4),(4AMPY),PN,3-Pr(NH3)2),Sb,(PDA),(ALA),Nb,Te,TA,(MTEA),(Cl-PEA),(iso-BA),PF6,(DPA),(BYA),DA,Bi,(HTAB),AN,NMABr,(CHMA),(F3EA),In,(6-ACA),GU,(ImEA),(HEA),IA,Aa,(APMim),(C8H17NH3),(Br-PEA),PMA,(MIC2),(PGA),I,(5-AVA),(PEA),K,(BEA),(PMA),Eu,Cl,(3AMP),(F-PEA),PEA,(C6H4NH2),(CH3ND3),(4FPEA),(DAT),(Anyl),(TBA),(4ApyH),Ba,(pF1PEA),(TMA),Rb,(3AMPY),(IEA),nan,(NMA),Ni,(pFPEA),BE,(EU-pyP),(PyEA),(BzDA),Co,(Ace),Hg,Pb,EDA,(oFPEA),Bn,(f-PEA),(C4H9NH3),(CIEA),(mFPEA),BA,DI,(HdA),PDA,(GABA),Cu,PA,DMA,Na,(EPA),(OdA),(THM),Ge,HDA,(BF4),(FPEA),(MIC3),GA,(ThMA),Cs,(BZA),Au,(H-PEA),Ag,SCN,(TFEA),EA,FPEAI,Fe,(n-C3H7NH3),(BdA),(EDA),BDA,Cr,Pt,Ti,(C6H13NH3),(HAD),Li,(BDA),O,La,Zn,'
    
    # Parse the molecule columns
    molecule_columns = parse_column_names(molecule_columns_str)
    
    # Keep only columns that exist in the dataframe
    existing_molecule_columns = [col for col in molecule_columns if col in dataframe.columns]
    missing_columns = [col for col in molecule_columns if col not in dataframe.columns]
    if missing_columns:
        print(f"The following specified molecule columns are not in the dataframe and will be skipped: {missing_columns}")
    
    # Extract the specified molecule columns from the dataframe
    molecule_features = dataframe[existing_molecule_columns]
    
    # Handle missing values in molecule features
    molecule_features = molecule_features.fillna(0.0)
    
    # Combine embeddings and molecule features
    combined_features = np.hstack([aggregated_features, molecule_features.values])
    
    # Handle missing values in the combined features
    combined_features = np.nan_to_num(combined_features, nan=0.0)
    
    # Combine features and target into a DataFrame for easier handling
    feature_df = pd.DataFrame(combined_features)
    target_series = dataframe[target_column]
    
    # Concatenate features and target
    combined_df = pd.concat([feature_df, target_series], axis=1)
    
    # Drop rows where target is NaN
    initial_count = combined_df.shape[0]
    combined_df = combined_df.dropna(subset=[target_column])
    final_count = combined_df.shape[0]
    dropped = initial_count - final_count
    if dropped > 0:
        print(f"Dropped {dropped} samples due to NaN in target '{target_column}'.")
    
    # Separate features and target
    X = combined_df.drop(columns=[target_column]).values
    y = combined_df[target_column].values
    
    return X, y

# 6. Aggregate Embeddings for Each Sample
def aggregate_embeddings(sequences, model, vector_size=50):
    """
    Aggregates material embeddings for each sample by averaging.
    """
    aggregated_features = []
    for seq in sequences:
        if len(seq) == 0:
            aggregated_features.append(np.zeros(vector_size))
            continue
        vectors = [model.wv[material] for material in seq if material in model.wv]
        if vectors:
            aggregated = np.mean(vectors, axis=0)
        else:
            aggregated = np.zeros(vector_size)
        aggregated_features.append(aggregated)
    return np.array(aggregated_features)

# 7. Define Models and Hyperparameters
def define_models_hyperparameters():
    """
    Defines the models and their corresponding hyperparameters for optimization.
    """
    models = {
        'RandomForest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],
                'max_depth': [None, 10],
                'min_samples_split': [2, 4],
                'min_samples_leaf': [1, 2]
            }
        },
        'GradientBoosting': {
            'model': GradientBoostingRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 4],
                'min_samples_split': [2, 4]
            }
        }
    }
    return models

# 8. Train and Evaluate Models
def train_evaluate_models_with_grid_search(X, y, models, cv=3):
    """
    Trains and evaluates models using GridSearchCV for hyperparameter tuning.
    """
    results = []
    
    for model_name, config in models.items():
        print(f"Training {model_name}...")
        
        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=cv,
            scoring='r2',
            n_jobs=-1,
            verbose=0
        )
        
        grid_search.fit(X, y)
        
        best_params = grid_search.best_params_
        best_estimator = grid_search.best_estimator_
        best_estimator.fit(X, y)
        y_pred = best_estimator.predict(X)
        
        mae = mean_absolute_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        
        results.append({
            'Model': model_name,
            'Parameters': best_params,
            'MAE': mae,
            'MSE': mse,
            'R2': r2
        })
        
        print(f"{model_name} Best Params: {best_params}")
        print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}")
        
    return results

# 9. Save Results
def save_results_to_csv(results, filename='model_results.csv'):
    """
    Saves the model training results to a CSV file.
    """
    if not results:
        print("No results to save.")
        return
    
    keys = results[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(results)
    print(f"Results saved to {filename}")

# 10. Main Execution Function
def main():
    # File path to the CSV dataset
    file_path = 'data_with_layer_type_and_combined.csv'
    
    # Load and filter data
    data = load_and_filter_data(file_path)
    print(f"Loaded data with {data.shape[0]} samples.")
    
    # Define layer columns
    layer_columns = define_layer_columns()
    
    # Parse sequences
    tokenized_sequences, material_layer_map, layer_names = parse_sequences_from_columns(data, layer_columns)
    print("Parsed sequences from columns.")
    
    # Train Word2Vec model
    model = train_word2vec(tokenized_sequences)
    print("Trained Word2Vec model.")
    
    # Aggregate embeddings
    aggregated_features = aggregate_embeddings(tokenized_sequences, model)
    print("Aggregated embeddings for each sample.")
    
    # Prepare features and target
    X, y = prepare_features_targets(aggregated_features, data)
    print(f"Prepared feature matrix with shape {X.shape} and target vector with shape {y.shape}")
    
    # Define models and train
    models = define_models_hyperparameters()
    print("Starting model training and evaluation...")
    results = train_evaluate_models_with_grid_search(X, y, models, cv=3)
    
    # Save results
    save_results_to_csv(results)
    
    # Save Word2Vec model
    model.save("word2vec_model_full_stack_sequence.model")
    print("Word2Vec model saved as 'word2vec_model_full_stack_sequence.model'")

if __name__ == "__main__":
    main()


Loaded data with 29560 samples.
Parsed sequences from columns.
Trained Word2Vec model.
Aggregated embeddings for each sample.
The following specified molecule columns are not in the dataframe and will be skipped: ['DAP', 'PEI', 'ThFA', 'PPA', 'PDMA', 'FEA', 'PyrEA', 'PBA', 'PTA', 'CPEA', 'TEA', 'mF1PEA', 'BI', 'oF1PEA', '1', 'iPA', 'ODA', 'Ada', 'N-EtPy', 'MIC1', 'AVA', 'CH3)3S', 'BIM', '4AMP', 'A43', 'CH3)3S', 'PPEA', 'F5PEA', 'C4H9N2H6', '5-AVAI', 'NH4', '4AMPY', '3-Pr(NH3)2', 'ALA', 'MTEA', 'Cl-PEA', 'iso-BA', 'DPA', 'BYA', 'HTAB', 'CHMA', 'F3EA', '6-ACA', 'ImEA', 'HEA', 'APMim', 'C8H17NH3', 'Br-PEA', 'MIC2', 'PGA', '5-AVA', 'BEA', '3AMP', 'F-PEA', 'C6H4NH2', 'CH3ND3', '4FPEA', 'DAT', 'Anyl', 'TBA', '4ApyH', 'pF1PEA', 'TMA', '3AMPY', 'IEA', 'NMA', 'pFPEA', 'EU-pyP', 'PyEA', 'BzDA', 'Ace', 'oFPEA', 'f-PEA', 'C4H9NH3', 'CIEA', 'mFPEA', 'HdA', 'GABA', 'EPA', 'OdA', 'THM', 'BF4', 'FPEA', 'MIC3', 'ThMA', 'BZA', 'H-PEA', 'TFEA', 'n-C3H7NH3', 'BdA', 'C6H13NH3', 'HAD']
Dropped 624 samples d

In [6]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import csv
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# 1. Load and Filter Data
def load_and_filter_data(file_path):
    """
    Loads the dataset from the specified CSV file and filters rows where 'Cell_architecture' is 'nip'.
    """
    data = pd.read_csv(file_path)
    data = data[data['Cell_architecture'].str.strip().str.lower() == 'nip']
    data = data.reset_index(drop=True)
    return data

# 2. Define Layer Columns
def define_layer_columns():
    """
    Defines the mapping between stack sequence columns and their corresponding layer names.
    """
    layer_columns = {
        'Cell_stack_sequence': 'Cell',
        'Substrate_stack_sequence': 'Substrate',
        'ETL_stack_sequence': 'ETL',
        'HTL_stack_sequence': 'HTL',
        'Backcontact_stack_sequence': 'Backcontact',
        'Add_lay_back_stack_sequence': 'Add_Lay_Back',
        'Encapsulation_stack_sequence': 'Encapsulation'
    }
    return layer_columns

# 3. Parse Sequences from Multiple Columns
def parse_sequences_from_columns(dataframe, layer_columns):
    """
    Parses material sequences from multiple layer-specific columns and maps materials to their layers.
    """
    sequences = []
    material_layer_map = {}
    layer_names = list(layer_columns.values())
    
    for idx, row in dataframe.iterrows():
        sequence = []
        for col, layer_name in layer_columns.items():
            seq_str = row.get(col, "")
            if pd.isna(seq_str) or not seq_str.strip():
                continue
            sub_layers = seq_str.split(' | ')
            for sub_layer in sub_layers:
                materials = [material.strip() for material in sub_layer.split('; ') if material.strip()]
                sequence.extend(materials)
                for material in materials:
                    if material not in material_layer_map:
                        material_layer_map[material] = {}
                    if layer_name not in material_layer_map[material]:
                        material_layer_map[material][layer_name] = 0
                    material_layer_map[material][layer_name] += 1
        sequences.append(sequence)
    
    return sequences, material_layer_map, layer_names

# 4. Train Word2Vec Model
def train_word2vec(sequences, vector_size=50, window=5, min_count=1, workers=4, sg=1):
    """
    Trains a Word2Vec model on the provided material sequences.
    """
    model = Word2Vec(
        sentences=sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg
    )
    return model

# 5. Prepare Features and Targets
def prepare_features_targets(aggregated_features, dataframe, target_column='JV_default_PCE'):
    """
    Prepares the feature matrix and target vector for model training.
    Now includes embeddings and specified molecule columns.
    """
    # Molecule columns as a list
    molecule_columns = [
        '(DAP)', '(PEI)', '(ThFA)', 'Sn', 'S', 'Tb', 'Sm', 'TN', '(PPA)', '(PDMA)', '(FEA)', 
        '(PyrEA)', 'OA', '(PBA)', '(PTA)', '(CPEA)', '(TEA)', '(mF1PEA)', 'FA', '(BI)', 'IM', 
        '(oF1PEA)', '(PA)', '(iPA)', 'Mg', 'Y', 'PR', '(PF6)', '(ODA)', 'F', 'BU', '(Ada)', 
        'Ca', 'NEA', '(SCN)', '(N-EtPy)', 'HA', '(MIC1)', 'Br', '(AVA)', '((CH3)3S)', '(BIM)', 
        'Mn', 'MA', '(4AMP)', '(A43)', '(CH3)3S', '(PPEA)', '(F5PEA)', '(C4H9N2H6)', '(5-AVAI)', 
        'Sr', '(DMA)', 'CA', 'Al', '(NH4)', '(4AMPY)', 'PN', 'Sb', '(PDA)', '(ALA)', 'Nb', 'Te', 
        'TA', '(MTEA)', '(Cl-PEA)', '(iso-BA)', '(DPA)', '(BYA)', 'DA', 'Bi', '(HTAB)', 'AN', 
        'NMABr', '(CHMA)', '(F3EA)', 'In', '(6-ACA)', 'GU', '(ImEA)', '(HEA)', 'IA', 'Aa', 
        '(APMim)', '(C8H17NH3)', '(Br-PEA)', 'PMA', '(MIC2)', '(PGA)', 'I', '(5-AVA)', '(PEA)', 
        'K', '(BEA)', '(PMA)', 'Eu', 'Cl', '(3AMP)', '(F-PEA)', '(C6H4NH2)', '(CH3ND3)', '(4FPEA)', 
        '(DAT)', '(Anyl)', '(TBA)', '(4ApyH)', 'Ba', '(pF1PEA)', '(TMA)', 'Rb', '(3AMPY)', '(IEA)', 
        '(nan)', '(NMA)', 'Ni', '(pFPEA)', '(BE)', '(EU-pyP)', '(PyEA)', '(BzDA)', 'Co', '(Ace)', 
        'Hg', 'Pb', '(EDA)', '(oFPEA)', 'Bn', '(f-PEA)', '(C4H9NH3)', '(CIEA)', '(mFPEA)', 'BA', 
        'DI', '(HdA)', '(PDA)', '(GABA)', 'Cu', 'PA', '(DMA)', 'Na', '(EPA)', '(OdA)', '(THM)', 
        'Ge', '(HDA)', '(BF4)', '(FPEA)', '(MIC3)', 'GA', '(ThMA)', 'Cs', '(BZA)', 'Au', '(H-PEA)', 
        'Ag', '(SCN)', '(TFEA)', 'EA', 'FPEAI', 'Fe', '(n-C3H7NH3)', '(BdA)', '(EDA)', 'BDA', 'Cr', 
        'Pt', 'Ti', '(C6H13NH3)', '(HAD)', 'Li', '(BDA)', 'O', 'La', 'Zn'
    ]
    
    # Keep only columns that exist in the dataframe
    existing_molecule_columns = [col for col in molecule_columns if col in dataframe.columns]
    missing_columns = [col for col in molecule_columns if col not in dataframe.columns]
    if missing_columns:
        print(f"The following specified molecule columns are not in the dataframe and will be skipped: {missing_columns}")
    
    # Extract the specified molecule columns from the dataframe
    molecule_features = dataframe[existing_molecule_columns]
    
    # Handle missing values in molecule features
    molecule_features = molecule_features.fillna(0.0)
    
    # Combine embeddings and molecule features
    combined_features = np.hstack([aggregated_features, molecule_features.values])
    
    # Handle missing values in the combined features
    combined_features = np.nan_to_num(combined_features, nan=0.0)
    
    # Combine features and target into a DataFrame for easier handling
    feature_df = pd.DataFrame(combined_features)
    target_series = dataframe[target_column]
    
    # Concatenate features and target
    combined_df = pd.concat([feature_df, target_series], axis=1)
    
    # Drop rows where target is NaN
    initial_count = combined_df.shape[0]
    combined_df = combined_df.dropna(subset=[target_column])
    final_count = combined_df.shape[0]
    dropped = initial_count - final_count
    if dropped > 0:
        print(f"Dropped {dropped} samples due to NaN in target '{target_column}'.")
    
    # Separate features and target
    X = combined_df.drop(columns=[target_column]).values
    y = combined_df[target_column].values
    
    return X, y

# 6. Main Execution Function
def main():
    # File path to the CSV dataset
    file_path = 'data_with_layer_type_and_combined.csv'
    
    # Load and filter data
    data = load_and_filter_data(file_path)
    print(f"Loaded data with {data.shape[0]} samples.")
    
    # Define layer columns
    layer_columns = define_layer_columns()
    
    # Parse sequences
    tokenized_sequences, material_layer_map, layer_names = parse_sequences_from_columns(data, layer_columns)
    print("Parsed sequences from columns.")
    
    # Train Word2Vec model
    model = train_word2vec(tokenized_sequences)
    print("Trained Word2Vec model.")
    
    # Aggregate embeddings
    aggregated_features = aggregate_embeddings(tokenized_sequences, model)
    print("Aggregated embeddings for each sample.")
    
    # Prepare features and target
    X, y = prepare_features_targets(aggregated_features, data)
    print(f"Prepared feature matrix with shape {X.shape} and target vector with shape {y.shape}")
    
    # Model training logic can be added here

if __name__ == "__main__":
    main()


Loaded data with 29560 samples.
Parsed sequences from columns.
Trained Word2Vec model.
Aggregated embeddings for each sample.
The following specified molecule columns are not in the dataframe and will be skipped: ['(nan)', '(BE)', '(HDA)']
Dropped 624 samples due to NaN in target 'JV_default_PCE'.
Prepared feature matrix with shape (28936, 221) and target vector with shape (28936,)


In [9]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import csv
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# 1. Load and Filter Data
def load_and_filter_data(file_path):
    """
    Loads the dataset from the specified CSV file and filters rows where 'Cell_architecture' is 'nip'.
    """
    data = pd.read_csv(file_path)
    data = data[data['Cell_architecture'].str.strip().str.lower() == 'nip']
    data = data.reset_index(drop=True)
    return data

# 2. Define Layer Columns
def define_layer_columns():
    """
    Defines the mapping between stack sequence columns and their corresponding layer names.
    """
    layer_columns = {
        'Cell_stack_sequence': 'Cell',
        'Substrate_stack_sequence': 'Substrate',
        'ETL_stack_sequence': 'ETL',
        'HTL_stack_sequence': 'HTL',
        'Backcontact_stack_sequence': 'Backcontact',
        'Add_lay_back_stack_sequence': 'Add_Lay_Back',
        'Encapsulation_stack_sequence': 'Encapsulation'
    }
    return layer_columns

# 3. Parse Sequences from Multiple Columns
def parse_sequences_from_columns(dataframe, layer_columns):
    """
    Parses material sequences from multiple layer-specific columns and maps materials to their layers.
    """
    sequences = []
    material_layer_map = {}
    layer_names = list(layer_columns.values())
    
    for idx, row in dataframe.iterrows():
        sequence = []
        for col, layer_name in layer_columns.items():
            seq_str = row.get(col, "")
            if pd.isna(seq_str) or not seq_str.strip():
                continue
            sub_layers = seq_str.split(' | ')
            for sub_layer in sub_layers:
                materials = [material.strip() for material in sub_layer.split('; ') if material.strip()]
                sequence.extend(materials)
                for material in materials:
                    if material not in material_layer_map:
                        material_layer_map[material] = {}
                    if layer_name not in material_layer_map[material]:
                        material_layer_map[material][layer_name] = 0
                    material_layer_map[material][layer_name] += 1
        sequences.append(sequence)
    
    return sequences, material_layer_map, layer_names

# 4. Train Word2Vec Model
def train_word2vec(sequences, vector_size=50, window=5, min_count=1, workers=4, sg=1):
    """
    Trains a Word2Vec model on the provided material sequences.
    """
    model = Word2Vec(
        sentences=sequences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        sg=sg
    )
    return model

# 5. Aggregate Embeddings for Each Sample
def aggregate_embeddings(sequences, model, vector_size=50):
    """
    Aggregates material embeddings for each sample by averaging.
    """
    aggregated_features = []
    for seq in sequences:
        if len(seq) == 0:
            aggregated_features.append(np.zeros(vector_size))
            continue
        vectors = [model.wv[material] for material in seq if material in model.wv]
        if vectors:
            aggregated = np.mean(vectors, axis=0)
        else:
            aggregated = np.zeros(vector_size)
        aggregated_features.append(aggregated)
    return np.array(aggregated_features)

# 6. Prepare Features and Targets
def prepare_features_targets(aggregated_features, dataframe, target_column='JV_default_PCE'):
    """
    Prepares the feature matrix and target vector for model training.
    Includes embeddings and specified molecule columns.
    """
    # Molecule columns as a list
    molecule_columns = [
        '(DAP)', '(PEI)', '(ThFA)', 'Sn', 'S', 'Tb', 'Sm', 'TN', '(PPA)', '(PDMA)', '(FEA)', 
        '(PyrEA)', 'OA', '(PBA)', '(PTA)', '(CPEA)', '(TEA)', '(mF1PEA)', 'FA', '(BI)', 'IM', 
        '(oF1PEA)', '(PA)', '(iPA)', 'Mg', 'Y', 'PR', '(PF6)', '(ODA)', 'F', 'BU', '(Ada)', 
        'Ca', 'NEA', '(SCN)', '(N-EtPy)', 'HA', '(MIC1)', 'Br', '(AVA)', '((CH3)3S)', '(BIM)', 
        'Mn', 'MA', '(4AMP)', '(A43)', '(CH3)3S', '(PPEA)', '(F5PEA)', '(C4H9N2H6)', '(5-AVAI)', 
        'Sr', '(DMA)', 'CA', 'Al', '(NH4)', '(4AMPY)', 'PN', 'Sb', '(PDA)', '(ALA)', 'Nb', 'Te', 
        'TA', '(MTEA)', '(Cl-PEA)', '(iso-BA)', '(DPA)', '(BYA)', 'DA', 'Bi', '(HTAB)', 'AN', 
        'NMABr', '(CHMA)', '(F3EA)', 'In', '(6-ACA)', 'GU', '(ImEA)', '(HEA)', 'IA', 'Aa', 
        '(APMim)', '(C8H17NH3)', '(Br-PEA)', 'PMA', '(MIC2)', '(PGA)', 'I', '(5-AVA)', '(PEA)', 
        'K', '(BEA)', '(PMA)', 'Eu', 'Cl', '(3AMP)', '(F-PEA)', '(C6H4NH2)', '(CH3ND3)', '(4FPEA)', 
        '(DAT)', '(Anyl)', '(TBA)', '(4ApyH)', 'Ba', '(pF1PEA)', '(TMA)', 'Rb', '(3AMPY)', '(IEA)', 
        '(nan)', '(NMA)', 'Ni', '(pFPEA)', '(BE)', '(EU-pyP)', '(PyEA)', '(BzDA)', 'Co', '(Ace)', 
        'Hg', 'Pb', '(EDA)', '(oFPEA)', 'Bn', '(f-PEA)', '(C4H9NH3)', '(CIEA)', '(mFPEA)', 'BA', 
        'DI', '(HdA)', '(PDA)', '(GABA)', 'Cu', 'PA', '(DMA)', 'Na', '(EPA)', '(OdA)', '(THM)', 
        'Ge', '(HDA)', '(BF4)', '(FPEA)', '(MIC3)', 'GA', '(ThMA)', 'Cs', '(BZA)', 'Au', '(H-PEA)', 
        'Ag', '(SCN)', '(TFEA)', 'EA', 'FPEAI', 'Fe', '(n-C3H7NH3)', '(BdA)', '(EDA)', 'BDA', 'Cr', 
        'Pt', 'Ti', '(C6H13NH3)', '(HAD)', 'Li', '(BDA)', 'O', 'La', 'Zn'
    ]
    
    # Keep only columns that exist in the dataframe
    existing_molecule_columns = [col for col in molecule_columns if col in dataframe.columns]
    missing_columns = [col for col in molecule_columns if col not in dataframe.columns]
    if missing_columns:
        print(f"The following specified molecule columns are not in the dataframe and will be skipped: {missing_columns}")
    
    # Extract the specified molecule columns from the dataframe
    molecule_features = dataframe[existing_molecule_columns]
    
    # Handle missing values in molecule features
    molecule_features = molecule_features.fillna(0.0)
    
    # Combine embeddings and molecule features
    combined_features = np.hstack([aggregated_features, molecule_features.values])
    
    # Handle missing values in the combined features
    combined_features = np.nan_to_num(combined_features, nan=0.0)
    
    # Combine features and target into a DataFrame for easier handling
    feature_df = pd.DataFrame(combined_features)
    target_series = dataframe[target_column]
    
    # Concatenate features and target
    combined_df = pd.concat([feature_df, target_series], axis=1)
    
    # Drop rows where target is NaN
    initial_count = combined_df.shape[0]
    combined_df = combined_df.dropna(subset=[target_column])
    final_count = combined_df.shape[0]
    dropped = initial_count - final_count
    if dropped > 0:
        print(f"Dropped {dropped} samples due to NaN in target '{target_column}'.")
    
    # Separate features and target
    X = combined_df.drop(columns=[target_column]).values
    y = combined_df[target_column].values
    
    return X, y

# 7. Train and Evaluate Models
def train_evaluate_models_with_grid_search(X, y, cv=3):
    """
    Trains and evaluates models using GridSearchCV for hyperparameter tuning.
    """
    models = {
        'RandomForest': {
            'model': RandomForestRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],
                'max_depth': [None, 10],
                'min_samples_split': [2, 4],
                'min_samples_leaf': [1, 2]
            }
        },
        'GradientBoosting': {
            'model': GradientBoostingRegressor(random_state=42),
            'params': {
                'n_estimators': [100, 150],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 4],
                'min_samples_split': [2, 4]
            }
        }
    }
    results = []
    
    for model_name, config in models.items():
        print(f"Training {model_name}...")
        
        grid_search = GridSearchCV(
            estimator=config['model'],
            param_grid=config['params'],
            cv=cv,
            scoring='r2',
            n_jobs=-1,
            verbose=0
        )
        
        grid_search.fit(X, y)
        
        best_params = grid_search.best_params_
        best_estimator = grid_search.best_estimator_
        y_pred = best_estimator.predict(X)
        
        mae = mean_absolute_error(y, y_pred)
        mse = mean_squared_error(y, y_pred)
        r2 = r2_score(y, y_pred)
        
        results.append({
            'Model': model_name,
            'Parameters': best_params,
            'MAE': mae,
            'MSE': mse,
            'R2': r2
        })
        
        print(f"{model_name} Best Params: {best_params}")
        print(f"MAE: {mae:.4f}, MSE: {mse:.4f}, R2: {r2:.4f}")
    
    return results

# 8. Save Results
def save_results_to_csv(results, filename='model_results.csv'):
    """
    Saves the model training results to a CSV file.
    """
    if not results:
        print("No results to save.")
        return
    
    keys = results[0].keys()
    with open(filename, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(results)
    print(f"Results saved to {filename}")

# 9. Main Execution Function
def main():
    # File path to the CSV dataset
    file_path = 'data_with_layer_type_and_combined.csv'
    
    # Load and filter data
    data = load_and_filter_data(file_path)
    print(f"Loaded data with {data.shape[0]} samples.")
    
    # Define layer columns
    layer_columns = define_layer_columns()
    
    # Parse sequences
    tokenized_sequences, material_layer_map, layer_names = parse_sequences_from_columns(data, layer_columns)
    print("Parsed sequences from columns.")
    
    # Train Word2Vec model
    model = train_word2vec(tokenized_sequences)
    print("Trained Word2Vec model.")
    
    # Aggregate embeddings
    aggregated_features = aggregate_embeddings(tokenized_sequences, model)
    print("Aggregated embeddings for each sample.")
    
    # Prepare features and target
    X, y = prepare_features_targets(aggregated_features, data)
    print(f"Prepared feature matrix with shape {X.shape} and target vector with shape {y.shape}.")
    
    # Train and evaluate models
    results = train_evaluate_models_with_grid_search(X, y)
    
    # Save results
    save_results_to_csv(results)

if __name__ == "__main__":
    main()


Loaded data with 29560 samples.
Parsed sequences from columns.
Trained Word2Vec model.
Aggregated embeddings for each sample.
The following specified molecule columns are not in the dataframe and will be skipped: ['(nan)', '(BE)', '(HDA)']
Dropped 624 samples due to NaN in target 'JV_default_PCE'.
Prepared feature matrix with shape (28936, 221) and target vector with shape (28936,).
Training RandomForest...
RandomForest Best Params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 150}
MAE: 2.8973, MSE: 14.2478, R2: 0.4960
Training GradientBoosting...
GradientBoosting Best Params: {'learning_rate': 0.1, 'max_depth': 4, 'min_samples_split': 4, 'n_estimators': 150}
MAE: 2.9656, MSE: 14.5315, R2: 0.4859
Results saved to model_results.csv
