In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import cross_validate

In [None]:
# Shell escape: recursively list everything under /content
# (presumably to check where the dataset CSVs were uploaded - confirm, then remove).
!ls -R /content

/content:
sample_data

/content/sample_data:
anscombe.json		      mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  README.md


# BASELINE MODEL

In [None]:
# Load the labelled training data.
df = pd.read_csv("OPV_dataset_quiz_training.csv")
# Alternative location when the file sits in Colab's sample_data folder:
#df = pd.read_csv("/content/sample_data/OPV_dataset_quiz_training.csv")

target_col = "PCE"
# Rows without a target value cannot be used for supervised training.
df = df.dropna(subset=[target_col])

# Only numeric, non-target columns are used as model inputs.
# NOTE(review): the printed feature list includes "ID No." - confirm that an
# identifier column is really meant to be a model input.
feature_cols = (
    df.drop(columns=[target_col])
    .select_dtypes(include=[np.number])
    .columns
    .tolist()
)

print("Missing values per feature BEFORE cleaning:")
print(df[feature_cols].isna().sum())

# Drop rows that have a NaN in any *feature* column. Restricting dropna to the
# feature columns keeps rows whose only gap is in a non-numeric column the model
# never sees; a bare df.dropna() would discard those rows as well.
clean_df = df.dropna(subset=feature_cols)

# Final design matrix and target, built from the cleaned rows only.
X = clean_df[feature_cols]
y = clean_df[target_col]

print("\nAFTER CLEANING SHAPES:")
print("X:", X.shape)
print("y:", y.shape)

# 5-fold cross-validation of the baseline random forest (model evaluation,
# not preprocessing).
rf_cv = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
    n_jobs=-1
)

# Shuffled folds with a fixed seed so the fold assignment is reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Score RMSE and R^2 in one pass: cross_validate fits one forest per fold and
# evaluates both metrics on it, instead of refitting every fold once per metric.
cv_results = cross_validate(
    rf_cv,
    X, y,
    cv=cv,
    scoring={"rmse": "neg_root_mean_squared_error", "r2": "r2"}
)

# The RMSE scorer is negated (higher is better); flip the sign back for reporting.
neg_rmse_scores = cv_results["test_rmse"]
rmse_scores = -neg_rmse_scores
r2_scores = cv_results["test_r2"]

print("\n5-fold Cross-Validation (Baseline RF)")
print(f"RMSE (mean ± std): {rmse_scores.mean():.4f} ± {rmse_scores.std():.4f}")
print(f"R^2  (mean ± std): {r2_scores.mean():.4f} ± {r2_scores.std():.4f}")

# Single hold-out evaluation of the baseline forest: 80/20 split with a fixed seed.
holdout_split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = holdout_split

# Same configuration as the cross-validated baseline above.
baseline_settings = {"n_estimators": 100, "random_state": 0, "n_jobs": -1}
rf = RandomForestRegressor(**baseline_settings).fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf.predict(X_val)
r2_val = r2_score(y_val, y_pred)
rmse_val = mean_squared_error(y_val, y_pred) ** 0.5  # RMSE = sqrt(MSE)

report_lines = [
    f"Train size: {X_train.shape[0]}",
    f"Validation size: {X_val.shape[0]}",
    f"Validation R^2: {r2_val:.4f}",
    f"Validation RMSE: {rmse_val:.4f}",
]
print("\n".join(report_lines))


Missing values per feature BEFORE cleaning:
ID No.                0
Voc (V)               2
Jsc (mA cm^2)         0
FF                    1
Mw (kg mol^-1)        1
Mn (kg mol^-1)        1
PDI (=Mw/Mn)          1
Monomer (g mol^-1)    0
-HOMO (eV)            0
-LUMO (eV)            1
bandgap(eV)           1
dtype: int64

AFTER CLEANING SHAPES:
X: (960, 11)
y: (960,)

===== 5-fold Cross-Validation (Baseline RF) =====
RMSE (mean ± std): 0.3598 ± 0.0318
R^2  (mean ± std): 0.9761 ± 0.0041
Train size: 768
Validation size: 192
Validation R^2: 0.9821
Validation RMSE: 0.3172


# GENETIC ALGORITHM TUNING

In [None]:
!pip install rdkit

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import re
from collections import Counter
import numpy as np
from itertools import combinations
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression, Ridge
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, KFold, cross_val_score, LeaveOneOut, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, StackingRegressor
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import ConfusionMatrixDisplay
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.initializers import Zeros, RandomUniform, GlorotUniform
from tensorflow.keras.utils import to_categorical
import shap
import random



In [None]:
# Genetic Algorithm Tuning for RandomForestRegressor
# Step 1: load the training/test CSVs and carve out a validation split.

# Files are read from Colab's sample_data folder. If they were uploaded straight
# to /content instead, use "/content/OPV_dataset_quiz_training.csv" and
# "/content/OPV_dataset_quiz_test.csv".
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/sample_data/OPV_dataset_quiz_training.csv",
        "/content/sample_data/OPV_dataset_quiz_test.csv",
    )
)

target_col = "PCE"
# Keep only the rows that have a target value.
train_df = train_df[train_df[target_col].notna()]

# Features: every numeric column except the target.
X = train_df.drop(columns=[target_col]).select_dtypes(include="number")
y = train_df[target_col]

# 80/20 train/validation split with a fixed seed; the GA scores candidates on X_val.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Discrete search space: each hyperparameter ("gene") maps to its allowed values.
param_space = {
    "n_estimators": [50, 100, 200, 300, 400],
    "max_depth": [None, 5, 10, 20, 40],
    "min_samples_split": [2, 4, 6, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
}


def random_individual():
    """Build one random candidate.

    Returns:
        dict: one value per hyperparameter in ``param_space``, each drawn
        independently and uniformly from that hyperparameter's allowed values.
    """
    return {gene: random.choice(options) for gene, options in param_space.items()}

# Fitness = validation RMSE (lower is better)
def fitness(individual):
    """Score one candidate hyperparameter set.

    Fits a RandomForestRegressor on (X_train, y_train) using the candidate's
    hyperparameters and returns its RMSE on (X_val, y_val).

    Args:
        individual: dict holding "n_estimators", "max_depth",
            "min_samples_split", "min_samples_leaf" and "max_features".

    Returns:
        The root-mean-squared error on the validation split.
    """
    tuned_keys = (
        "n_estimators",
        "max_depth",
        "min_samples_split",
        "min_samples_leaf",
        "max_features",
    )
    forest = RandomForestRegressor(
        **{key: individual[key] for key in tuned_keys},
        random_state=0,  # fixed seed: a given candidate is always fitted the same way
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    val_mse = mean_squared_error(y_val, forest.predict(X_val))
    return np.sqrt(val_mse)

# Crossover
def crossover(p1, p2):
    """Uniform crossover of two parents.

    For every key of ``p1`` the child takes the value from ``p1`` or from ``p2``,
    chosen at random with equal probability.

    Args:
        p1: first parent (dict); its keys define the child's keys.
        p2: second parent (dict); must contain every key of ``p1``.

    Returns:
        A new dict; neither parent is modified.
    """
    child = {}
    for gene in p1:
        candidates = [p1[gene], p2[gene]]
        child[gene] = random.choice(candidates)
    return child

# Mutation
def mutate(individual, mutation_rate=0.2):
    """Randomly re-draw some hyperparameters of a candidate.

    Each key is, independently and with probability ``mutation_rate``, replaced by
    a random value from ``param_space[key]``.

    Args:
        individual: candidate dict; it is modified in place.
        mutation_rate: per-key probability of being re-drawn.

    Returns:
        The same ``individual`` object that was passed in.
    """
    for gene in list(individual):
        roll = random.random()
        if not (roll < mutation_rate):
            continue  # this gene is left untouched
        individual[gene] = random.choice(param_space[gene])
    return individual


# Genetic Algorithm Loop
population_size = 20
generations = 10

# Seed Python's RNG: the initial population, parent sampling, crossover and
# mutation all draw from the `random` module, so without a seed every run
# explores a different sequence of candidates.
random.seed(42)

population = [random_individual() for _ in range(population_size)]
ga_best_rmse_history = []  # best validation RMSE of each generation (used for plotting)

# Memo of already-scored hyperparameter combinations. fitness() fixes the forest's
# random_state, so re-scoring the same combination only repeats the same fit;
# survivors carried over unchanged and duplicate children are looked up instead.
fitness_cache = {}

for gen in range(generations):
    print(f"\n Generation {gen+1}")

    # Score every individual; lower RMSE is better.
    scored_pop = []
    for indiv in population:
        genome = tuple(indiv[name] for name in param_space)
        if genome not in fitness_cache:
            fitness_cache[genome] = fitness(indiv)
        scored_pop.append((fitness_cache[genome], indiv))

    scored_pop.sort(key=lambda x: x[0])

    best_rmse_this_gen = scored_pop[0][0]
    ga_best_rmse_history.append(best_rmse_this_gen)

    print(f"Best RMSE this generation: {best_rmse_this_gen:.4f}")

    # Selection: the better half survives unchanged, so the best individual is
    # never lost from one generation to the next.
    survivors = [ind for (_, ind) in scored_pop[:population_size // 2]]
    new_population = survivors.copy()
    # Refill the population with mutated children of two distinct random survivors.
    while len(new_population) < population_size:
        parents = random.sample(survivors, 2)
        child = crossover(parents[0], parents[1])
        child = mutate(child)
        new_population.append(child)
    population = new_population

# scored_pop is the sorted population of the last generation that was scored.
best_rmse, best_params = scored_pop[0]
print("\nBest Hyperparameters Found:")
print(best_params)
print(f"\nBest Validation RMSE: {best_rmse:.4f}")

# Train final model on all labelled rows (training + validation) with the best settings.
final_model = RandomForestRegressor(
    **best_params,
    random_state=0,
    n_jobs=-1
)
final_model.fit(X, y)

# Select exactly the training feature columns, in training order. Indexing by
# X.columns raises a KeyError naming any column the test file lacks, and ignores
# extra numeric columns that select_dtypes alone would have passed to the model.
X_test = test_df[X.columns]
test_predictions = final_model.predict(X_test)
print("\nFinal GA-Optimized RandomForest Model Trained.")

# Save predictions (the "ID" column is the row index of the test file).
submission = pd.DataFrame({
    "ID": test_df.index,
    "Predicted_PCE": test_predictions
})
submission.to_csv("GA_RF_predictions.csv", index=False)
print("Saved GA_RF_predictions.csv")


=== Generation 1 ===
Best RMSE this generation: 0.3509

=== Generation 2 ===
Best RMSE this generation: 0.3283

=== Generation 3 ===
Best RMSE this generation: 0.3283

=== Generation 4 ===
Best RMSE this generation: 0.3283

=== Generation 5 ===
Best RMSE this generation: 0.3283

=== Generation 6 ===
Best RMSE this generation: 0.3253

=== Generation 7 ===
Best RMSE this generation: 0.3245

=== Generation 8 ===
Best RMSE this generation: 0.3245

=== Generation 9 ===
Best RMSE this generation: 0.3244

=== Generation 10 ===
Best RMSE this generation: 0.3244

Best Hyperparameters Found:
{'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None}
Best Validation RMSE: 0.3244

Final GA-Optimized RandomForest Model Trained.
Saved GA_RF_predictions.csv


# BAYESIAN OPTIMIZATION

In [None]:
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization

In [None]:
# Define search space for Bayesian Optimization

# RandomForest hyperparameter ranges, given as continuous (low, high) bounds.
pbounds = {
    'n_estimators': (50, 400),
    'max_depth': (5, 40),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4),
}


# Objective function: negative RMSE (the optimizer maximizes, so higher is better)
# NOTE: this reuses the name `rf_cv`, which the baseline cell binds to a
# RandomForestRegressor; once this cell has run the name refers to this function.
def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """Cross-validated score of one random-forest configuration.

    The arguments are rounded to the nearest integer before use, because the
    search bounds are continuous while the forest expects integer settings.

    Args:
        n_estimators: number of trees.
        max_depth: maximum tree depth.
        min_samples_split: minimum samples needed to split a node.
        min_samples_leaf: minimum samples required in a leaf.

    Returns:
        Mean negative RMSE over 5 cross-validation folds of the training split.
    """
    n_estimators = int(round(n_estimators))
    max_depth = int(round(max_depth))
    min_samples_split = int(round(min_samples_split))
    min_samples_leaf = int(round(min_samples_leaf))

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features='sqrt',
        random_state=0,
        n_jobs=-1
    )

    # 5-fold CV on the training split only. Cross-validating on the full X, y
    # would let the X_val rows influence the chosen hyperparameters, and the
    # validation score reported for the tuned model would then be optimistic.
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    return scores.mean()


# Run Bayesian Optimization
optimizer = BayesianOptimization(
    f=rf_cv,
    pbounds=pbounds,
    random_state=42,
    verbose=2
)

# 5 initial points followed by 30 optimization iterations (35 evaluations in total).
optimizer.maximize(init_points=5, n_iter=30)

# Best hyperparameters. The search bounds are continuous, so the integer settings
# are rounded here the same way the objective rounds them.
raw_best_params = optimizer.max['params']
best_bo_params = {
    name: int(round(raw_best_params[name]))
    for name in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
}
best_bo_params['max_features'] = 'sqrt'  # fixed inside the objective, not searched

print("Best Bayesian Hyperparameters Found:")
print(best_bo_params)


# Validation model: fitted on the training split only, so the metrics below are
# measured on rows this model has not been fitted on.
final_bo_model = RandomForestRegressor(
    **best_bo_params,
    random_state=0,
    n_jobs=-1
)
final_bo_model.fit(X_train, y_train)

# Evaluate on validation set
y_val_pred = final_bo_model.predict(X_val)
r2_val = r2_score(y_val, y_val_pred)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

print(f"\nBayesian-Optimized RandomForest Performance on Validation Set:")
print(f"R² = {r2_val:.4f}")
print(f"RMSE = {rmse_val:.4f}")

# Submission model: the same configuration refitted on all labelled rows
# (training + validation), as the GA cell does for its final model, so the test
# predictions do not come from a forest that saw only 80% of the data.
final_bo_model_full = RandomForestRegressor(
    **best_bo_params,
    random_state=0,
    n_jobs=-1
)
final_bo_model_full.fit(X, y)

# Select exactly the training feature columns, in training order; a column that
# is missing from the test file raises a KeyError naming it.
X_test = test_df[X.columns]
bo_test_preds = final_bo_model_full.predict(X_test)

# Save predictions (the "ID" column is the row index of the test file).
submission_bo = pd.DataFrame({
    "ID": test_df.index,
    "Predicted_PCE": bo_test_preds
})
submission_bo.to_csv("Bayes_RF_predictions.csv", index=False)
print("Saved Bayes_RF_predictions.csv")

# COMPARE RESULTS

In [None]:
# Plot 1: GA fitness vs generation
# x-axis: 1-based generation number; y-axis: best validation RMSE of that generation.
generations_axis = range(1, len(ga_best_rmse_history) + 1)

fig, ax = plt.subplots()
ax.plot(generations_axis, ga_best_rmse_history, marker='o')
ax.set_xlabel("Generation")
ax.set_ylabel("Best RMSE")
ax.set_title("Genetic Algorithm: Best RMSE per Generation")
ax.grid(True)
plt.show()

In [None]:
# Plot 2: Bayesian optimization score vs iteration
# Each recorded trial stores the objective value (negative RMSE) under "target";
# negate it to get the RMSE back.
bo_targets = [trial["target"] for trial in optimizer.res]
bo_rmse = np.negative(bo_targets)

# 1-based index of each evaluation, in the order the optimizer recorded them.
iterations = range(1, len(bo_rmse) + 1)

fig, ax = plt.subplots()
ax.plot(iterations, bo_rmse, marker='o')
ax.set_xlabel("Iteration")
ax.set_ylabel("RMSE (CV)")
ax.set_title("Bayesian Optimization: RMSE per Iteration")
ax.grid(True)
plt.show()


In [None]:
## Plot 3: Predicted vs Actual PCE (best model)
# X_val / y_val at this point are the validation split made in the GA cell
# (random_state=42). Only final_bo_model was fitted on the matching X_train alone:
#   - the baseline `rf` was trained on a different split (random_state=0) of the
#     same training file, so it has been fitted on rows that are in this X_val;
#   - the GA's `final_model` was fitted on all of X, y, which includes X_val.
# Plotting either of those here would show memorised rows and look better than
# the model really is. To plot another configuration, refit it on X_train first.
best_model = final_bo_model
y_val_pred_best = best_model.predict(X_val)

plt.figure()
plt.scatter(y_val, y_val_pred_best)
# Common axis range so the y = x line spans both actual and predicted values.
min_val = min(y_val.min(), y_val_pred_best.min())
max_val = max(y_val.max(), y_val_pred_best.max())
plt.plot([min_val, max_val], [min_val, max_val])  # y = x reference line
plt.xlabel("Actual PCE")
plt.ylabel("Predicted PCE")
plt.title("Predicted vs Actual PCE (Best Model)")
plt.grid(True)
plt.show()