In [None]:
import ast

import matplotlib.pyplot as plt
import pandas as pd

from embedder import Embedder
from evaluator import Evaluator
from expresser import Expresser
from population import Index, Individual

In [None]:
# Build the evaluation pipeline used below to re-embed and re-score logged candidates.
expresser = Expresser()
# NOTE(review): the argument 3 presumably sets the neighbour count (the plots
# further down are labelled "k=3") — confirm against population.Index.
index = Index(3)
# NOTE(review): "mps" looks like the PyTorch-style device string for Apple's
# Metal backend; presumably it must be changed on other hardware — confirm
# which values Embedder accepts.
embedder = Embedder(device="mps", batch_size=16)
evaluator = Evaluator(expresser, embedder, index)

In [None]:
# Read the evolution log. The "parents" column is stored as a Python-literal
# string in the CSV, so it is parsed back into a Python object on load.
results_df = pd.read_csv("evolution_log.csv").assign(
    parents=lambda log: log["parents"].map(ast.literal_eval)
)
results_df

In [None]:
# Inspect the rows logged for generation 1.
results_df.loc[results_df["gen"].eq(1)]

In [None]:
# Mean logged novelty score per generation, over every logged row.
mean_novelty_by_gen = results_df.groupby("gen")["novelty_score"].mean()
mean_novelty_by_gen.plot()

In [None]:
def load_all_individuals(
    results_df: pd.DataFrame, evaluator: Evaluator, top_k: int = 10
) -> list[Individual]:
    """Rebuild each generation's surviving population and re-score it.

    For every generation in ``results_df`` the ``top_k`` rows with the lowest
    logged ``novelty_score`` are taken as that generation's population. Each
    distinct ``cand_id`` is turned into exactly one ``Individual``, embedded,
    added to ``evaluator.index`` and evaluated once.

    A candidate that survives several generations appears in several rows of
    the log. Creating and indexing a fresh copy for every appearance would
    place the same embedding in the index repeatedly, so repeated rows reuse
    the ``Individual`` that was built for the first appearance.

    Args:
        results_df: Evolution log with at least the columns ``gen``,
            ``cand_id``, ``parents``, ``genotype`` and ``novelty_score``.
        evaluator: Provides ``prepare_candidates``, ``index.add_embedding``
            and ``evaluate_parallel``; its index is mutated by this call.
        top_k: Number of lowest-``novelty_score`` rows kept per generation.

    Returns:
        One entry per selected (generation, row) pair, in generation order and
        then ascending ``novelty_score`` — the same length and order as
        before. Entries that share a ``cand_id`` are the same object. An
        empty log yields an empty list.
    """
    if results_df.empty:
        # max() of an empty column is NaN, which range() cannot consume.
        return []

    individuals = []
    unique_by_id = {}
    for gen in range(results_df["gen"].max() + 1):
        gen_df = results_df[results_df["gen"] == gen]
        survivors = gen_df.sort_values("novelty_score").iloc[:top_k]
        for _, row in survivors.iterrows():
            cand_id = row["cand_id"]
            if cand_id not in unique_by_id:
                unique_by_id[cand_id] = Individual(cand_id, row["parents"], row["genotype"])
            individuals.append(unique_by_id[cand_id])

    unique_individuals = list(unique_by_id.values())

    # Embed candidates then put them in the index (each candidate only once).
    evaluator.prepare_candidates(unique_individuals)
    for individual in unique_individuals:
        # Candidates left without an embedding are skipped rather than indexed.
        if individual.embedding is not None:
            evaluator.index.add_embedding(individual)

    # Get "true" novelty scores against the fully populated index.
    evaluator.evaluate_parallel(unique_individuals)
    return individuals

In [None]:
# Re-embed and re-score the logged survivors; this also fills evaluator.index.
population = load_all_individuals(results_df=results_df, evaluator=evaluator)

In [None]:
def update_df(df: pd.DataFrame, individuals: list[Individual]) -> pd.DataFrame:
    """Attach re-computed novelty scores and encoded phenotypes to ``df``.

    Two columns are written onto ``df`` itself (it is modified in place and
    also returned): ``final_novelty_score`` and ``phenotype``. Values are
    looked up by ``cand_id``; rows whose ``cand_id`` matches none of
    ``individuals`` receive NaN in both columns. If several individuals share
    a ``cand_id``, the last one in the list wins.
    """
    cand_ids = [ind.cand_id for ind in individuals]
    novelty_lookup = dict(zip(cand_ids, (ind.novelty_score for ind in individuals)))
    phenotype_lookup = dict(zip(cand_ids, (ind.encode_phenotype() for ind in individuals)))

    id_column = df["cand_id"]
    df["final_novelty_score"] = id_column.map(novelty_lookup)
    df["phenotype"] = id_column.map(phenotype_lookup)
    return df

# Attach the re-computed scores and phenotypes, keep only the rows whose
# candidate was re-evaluated, and plot the mean re-computed novelty per generation.
results_df = update_df(results_df, population)
population_ids = {ind.cand_id for ind in population}
in_population = results_df["cand_id"].isin(population_ids)
filtered_df = results_df[in_population]
mean_final_by_gen = filtered_df.groupby("gen")["final_novelty_score"].mean()
mean_final_by_gen.plot()

In [None]:
# Compare the novelty logged during evolution ("realtime") with the novelty
# re-computed afterwards ("final"), and count how many freshly created
# candidates made it into each generation's population.
POP_SIZE = 10  # rows kept per generation; ids are assumed to be allocated as gen * POP_SIZE + i
n_gens = results_df["gen"].max() + 1
generations = range(n_gens)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
pop_novelties = []
final_novelties = []
n_new = []
for gen in generations:
    gen_df = filtered_df[filtered_df["gen"] == gen]
    gen_df = gen_df.sort_values(by="novelty_score").iloc[:POP_SIZE]
    pop_novelties.append(gen_df["novelty_score"].mean())
    final_novelties.append(gen_df["final_novelty_score"].mean())

    # A candidate counts as "new" when its id falls in this generation's id block.
    new_ids = range(gen * POP_SIZE, (gen + 1) * POP_SIZE)
    successful = gen_df[gen_df["cand_id"].isin(new_ids)]
    n_new.append(len(successful))

axes[0].plot(generations, pop_novelties, label="Realtime Novelty")
axes[0].plot(generations, final_novelties, label="Final Novelty")
axes[0].set_xlabel("Generation")
axes[0].set_ylabel("Average Cosine Similarity (k=3)")
axes[0].legend()
# One tick per generation actually present in the log (not a fixed count).
axes[0].set_xticks(generations)
axes[0].set_ylim(0, 1.1)
axes[0].set_title("Average Novelty of Population per Generation")

axes[1].bar(generations, n_new)
axes[1].set_xlabel("Generation")
axes[1].set_ylabel("New Individuals")
axes[1].set_title("Number of New Individuals in Population per Generation")
axes[1].set_xticks(generations)

# plt.savefig("gen_analysis.pdf", bbox_inches="tight", format="pdf", dpi=300)
plt.show()

In [None]:
from PIL import Image
from io import BytesIO
import base64

def decode_phenotype(phenotype_str: str) -> Image.Image:
    """Turn a base64-encoded image string into a PIL image.

    The string is base64-decoded and the resulting bytes are opened with
    ``Image.open`` through an in-memory buffer.
    """
    raw_bytes = base64.b64decode(phenotype_str)
    return Image.open(BytesIO(raw_bytes))

In [None]:
# For each generation, display the phenotype image of every newly created
# candidate that ranks among that generation's ten lowest novelty scores,
# then plot how many such newcomers each generation had.
n_new = []
last_gen = results_df["gen"].max()
for gen in range(last_gen + 1):
    in_gen = results_df["gen"] == gen
    top_10 = results_df[in_gen].sort_values("novelty_score").head(10)
    # Ids gen*10 .. gen*10+9 are treated as the candidates created in this generation.
    first_new_id = gen * 10
    is_new = top_10["cand_id"].isin(range(first_new_id, first_new_id + 10))
    successful = top_10[is_new]

    print("Gen:", gen)
    for encoded in successful["phenotype"]:
        plt.imshow(decode_phenotype(encoded))
        plt.show()

    n_new.append(len(successful))

plt.plot(n_new)
plt.show()

In [None]:
def show_df(axes, results_df: pd.DataFrame):
    """Draw one phenotype image per axis from the leading rows of ``results_df``.

    Rows are paired with axes in order: the i-th row is decoded from its
    ``phenotype`` column and drawn on the i-th axis, titled with its
    ``cand_id``. Rows beyond the number of axes are ignored. Axes left over
    when there are fewer rows than axes are blanked so the grid does not show
    empty frames and ticks.

    Args:
        axes: Iterable of matplotlib-style axes (e.g. a flattened subplot array).
        results_df: Frame with ``phenotype`` (base64 image string) and
            ``cand_id`` columns.
    """
    axes = list(axes)
    # Only as many rows as there are axes can be shown.
    rows = results_df.head(len(axes))
    for ax, (_, row) in zip(axes, rows.iterrows()):
        img = decode_phenotype(row["phenotype"])
        ax.set_title(str(int(row["cand_id"])))
        ax.imshow(img)
        ax.axis("off")

    # Hide any axes that received no image.
    for ax in axes[len(rows):]:
        ax.axis("off")

# Show the leading generation-0 rows as a 2x5 image grid (one image per axis).
gen0_df = results_df[results_df["gen"] == 0]
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(15, 6))
axes = axes.flatten()
show_df(axes, gen0_df)
fig.suptitle("Generation 0 Baseline")
# plt.savefig("baseline.pdf", dpi=300, bbox_inches="tight", format="pdf")
plt.show()

In [None]:
# Rank every candidate that has both a genotype and a phenotype by its
# re-computed novelty score (ascending), one row per candidate id.
has_both = results_df["genotype"].notna() & results_df["phenotype"].notna()
show_off = (
    results_df[has_both]
    .drop_duplicates(subset="cand_id")
    .sort_values(by="final_novelty_score", ascending=True)
)
print(show_off[["cand_id", "parents"]])

# Hand-picked candidate ids left out of the figure (the reason is not recorded here).
to_drop = [32, 10, 21]
keep = ~show_off["cand_id"].isin(to_drop)
show_off = show_off[keep]

fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(15, 6))
axes = axes.flatten()
show_df(axes, show_off)
fig.suptitle("Most Novel Individuals")
# plt.savefig("most_novel.pdf", dpi=300, bbox_inches="tight", format="pdf")
plt.show()

In [None]:
# Importing the submodule explicitly also binds `sklearn`; a bare
# `import sklearn` does not guarantee that `sklearn.manifold` is loaded.
import sklearn.manifold
import numpy as np

# Project the population embeddings to 2-D with t-SNE and colour each point by
# the generation its candidate id belongs to.
embedded = [ind for ind in population if ind.embedding is not None]
embeddings = np.array([ind.embedding for ind in embedded])
cand_ids = np.array([ind.cand_id for ind in embedded])
gens = cand_ids // 10  # ids are assumed to be allocated in blocks of 10 per generation

# t-SNE is stochastic: fix the seed so the layout is reproducible across runs.
# Perplexity must be smaller than the number of samples, so clamp the default
# of 30 for small populations.
tsne = sklearn.manifold.TSNE(
    random_state=0,
    perplexity=min(30.0, len(embeddings) - 1),
)
embeddings_2d = tsne.fit_transform(embeddings)

plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=gens, cmap="viridis")
plt.title("t-SNE of Candidate Embeddings Colored by Generation")
plt.colorbar(label="Generation")

# plt.savefig("tsne.pdf", bbox_inches="tight", format="pdf", dpi=300)
plt.show()
    