# Embeddings of All Models
Here we create UMAP and DensMAP embeddings out of model predictions. Each model performs inference on the training data `train_data` and a subset of the training data, `train_20_split`. Figure outputs can be found at https://github.com/faris-k/self-supervised-wafermaps/tree/master/reports/figures/UMAP. The cell outputs have been cleared to reduce file size.

In [125]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pytorch_lightning as pl
import seaborn as sns
import torch
import umap
from lightly.data import LightlyDataset
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader

from ssl_wafermap.data import WaferMapDataset
from ssl_wafermap.models.knn import (
    BYOL,
    DCLW,
    DINO,
    MAE,
    MSN,
    PMSN,
    BarlowTwins,
    DINOViT,
    FastSiam,
    MoCo,
    SimCLR,
    SimMIM,
    SimSiam,
    SwaV,
    VICReg,
)
from ssl_wafermap.transforms import get_inference_transforms
from ssl_wafermap.utilities.plotting import init_seaborn_style, matplotlibify

# Silence the PyTorch Lightning warning about dataloader worker counts
warnings.filterwarnings(action="ignore", message=".*does not have many workers.*")

# Global numeric / plotting configuration for the notebook
torch.set_float32_matmul_precision("high")
init_seaborn_style()

# Fix the global seed so runs are repeatable
pl.seed_everything(seed=42)

Global seed set to 42


I forgot to capitalize `"none"` as `"None"` in the preprocessing notebook. Instead of modifying that notebook (which involves randomly splitting the dataset), I fix the label in the saved files below.

In [2]:
# Mapping for sorting the failureType columns for better visualizations.
# The key order defines the order of the categorical levels used below.
mapping = {
    "Loc": 0,
    "None": 1,
    "Scratch": 2,
    "Edge-Ring": 3,
    "Center": 4,
    "Edge-Loc": 5,
    "Random": 6,
    "Donut": 7,
    "Near-full": 8,
}

data_dir = "../data/processed/WM811K/"
# Walk the directory in sorted order so the log below is deterministic.
for file in sorted(os.listdir(data_dir)):
    # Updated frames are written back to the same path with xz compression, so
    # only xz-compressed pickles are touched; anything else is left alone.
    if not file.endswith(".pkl.xz"):
        print(f"Skipping {file} (not a .pkl.xz file)")
        continue

    filepath = os.path.join(data_dir, file)
    df = pd.read_pickle(filepath)
    if "none" in df["failureType"].unique():
        # Rename the label with an exact-match replace (not a substring
        # replace), so only values equal to "none" are changed.
        df["failureType"] = df["failureType"].astype(object).replace("none", "None")
        # Now, convert the failureType column to an ordered categorical column
        df["failureType"] = pd.Categorical(
            df["failureType"], categories=list(mapping), ordered=True
        )
        # Sort rows by the categorical order defined in `mapping`
        df = df.sort_values(by="failureType")
        display(df)
        # Overwrite the file on disk with the corrected dataframe
        print(f"Saving updated dataframe for {file}")
        df.to_pickle(filepath, compression="xz")
    else:
        print(f"No additional preprocessing needed for {file}")
    # Release the dataframe before loading the next file
    del df

No additional preprocessing needed for test_data.pkl.xz
No additional preprocessing needed for train_10_split.pkl.xz
No additional preprocessing needed for train_1_split.pkl.xz
No additional preprocessing needed for train_20_split.pkl.xz
No additional preprocessing needed for train_29_split.pkl.xz
No additional preprocessing needed for train_data.pkl.xz
No additional preprocessing needed for train_val_data.pkl.xz
No additional preprocessing needed for val_data.pkl.xz


In [115]:
def _dataset_and_loader(frame):
    """Wrap a dataframe's wafer maps and failure codes in a LightlyDataset and an unshuffled DataLoader."""
    dataset = LightlyDataset.from_torch_dataset(
        WaferMapDataset(frame.waferMap, frame.failureCode),
        transform=get_inference_transforms(),
    )
    # shuffle=False keeps predictions in the same row order as the dataframe
    return dataset, DataLoader(dataset, batch_size=1024, shuffle=False)


# Full training data and the train_20_split subset of it
df_full = pd.read_pickle("../data/processed/WM811K/train_data.pkl.xz")
df_subset = pd.read_pickle("../data/processed/WM811K/train_20_split.pkl.xz")

full_dataset, full_loader = _dataset_and_loader(df_full)
subset_dataset, subset_loader = _dataset_and_loader(df_subset)

In [None]:
# For every model checkpoint under `ckpt_dir`, run inference on the subset and
# full training data, embed the standardized predictions with UMAP and DensMAP,
# and save a side-by-side scatter plot colored by failure type.
ckpt_dir = "../models/new_knn/"
ckpt_file_end = "checkpoints/epoch=149-step=87450.ckpt"

save_path_full = "../reports/figures/UMAP/full"
save_path_subset = "../reports/figures/UMAP/subset"

os.makedirs(save_path_full, exist_ok=True)
os.makedirs(save_path_subset, exist_ok=True)

# Axis styling shared by the x- and y-axes of both subplots:
# no grid/ticks/labels, just a black frame around each panel.
axis_style = dict(
    showgrid=False,
    showticklabels=False,
    ticks="",
    zeroline=False,
    showline=True,
    linewidth=2.4,
    linecolor="black",
    mirror="allticks",
)

# Sorted so the processing order does not depend on the filesystem
for folder in sorted(os.listdir(ckpt_dir)):
    # Full path, i.e. ../models/new_knn/MAE/checkpoints/epoch=149-step=87450.ckpt
    ckpt_path = os.path.join(ckpt_dir, folder, ckpt_file_end)
    if not os.path.isfile(ckpt_path):
        # Stray files or runs without the expected checkpoint should not abort the loop
        print(f"Skipping {folder}: no checkpoint found at {ckpt_path}")
        continue

    # The folder name is the model name, i.e. MAE. Using it directly avoids
    # parsing the joined path, whose separator differs between platforms.
    model_name = folder
    print(model_name)
    # NOTE(review): "MAE2" is mapped onto "MAE", so if both folders exist the
    # figures of the later one overwrite those of the earlier one.
    if model_name == "MAE2":
        model_name = "MAE"
    # Get the model class (imported at the top of the notebook) using the model_name
    ModelClass = getattr(sys.modules[__name__], model_name)

    # load_from_checkpoint is a classmethod: call it on the class instead of
    # building a throwaway instance first
    model = ModelClass.load_from_checkpoint(ckpt_path)

    trainer = pl.Trainer(
        accelerator="gpu",
        logger=False,
        inference_mode=True,
        precision="16-mixed",
        enable_progress_bar=False,
    )

    # For each dataloader (paired with the dataframe it was built from),
    for loader, df in zip([subset_loader, full_loader], [df_subset, df_full]):
        # Perform inference and stack the per-batch predictions into one array
        preds = trainer.predict(model, loader)
        preds = torch.cat(preds).cpu().numpy()

        # Standardize each feature before dimensionality reduction
        scaler = StandardScaler()
        preds = scaler.fit_transform(preds)

        # Create a UMAP embedding
        reducer = umap.UMAP(random_state=0)
        embeddings = reducer.fit_transform(preds)
        umap_df = pd.DataFrame(embeddings, columns=["umap_x", "umap_y"])

        # Create a DensMAP embedding
        reducer = umap.UMAP(random_state=0, densmap=True, dens_lambda=1)
        dense_embeddings = reducer.fit_transform(preds)
        densmap_df = pd.DataFrame(dense_embeddings, columns=["densmap_x", "densmap_y"])

        # Create a dataframe with the embeddings and the failureType.
        # The loaders are not shuffled, so rows line up with `df` positionally.
        emb_df = pd.concat([umap_df, densmap_df], axis=1)
        emb_df["failureType"] = df.failureType.values

        # Plot the embeddings side-by-side

        # Initialize subplots
        fig = make_subplots(
            rows=1,
            cols=2,
            subplot_titles=(f"{model_name} UMAP", f"{model_name} DensMAP"),
            horizontal_spacing=0.01,
        )

        # Initialize individual figures
        fig1 = px.scatter(
            emb_df,
            x="umap_x",
            y="umap_y",
            color="failureType",
        )
        fig2 = px.scatter(
            emb_df,
            x="densmap_x",
            y="densmap_y",
            color="failureType",
        )

        # To prevent the legend from showing up twice, disable it for the second plot
        for trace in fig2.data:
            trace.update(showlegend=False)
        # Now, add the traces to the figure
        for trace1, trace2 in zip(fig1.data, fig2.data):
            fig.add_trace(trace1, row=1, col=1)
            fig.add_trace(trace2, row=1, col=2)
        # Update the layout
        fig.update_layout(
            height=600,
            width=1500,
            legend_title="Failure Type",
            margin=dict(r=200, t=40, b=0, l=0),
            legend={"itemsizing": "constant"},
            font=dict(family="Arial", size=24),
            xaxis_title="",
            yaxis_title="",
            template="simple_white",
        )
        # Increase font size of subplot titles
        fig.for_each_annotation(lambda a: a.update(font=dict(family="Arial", size=26)))
        fig.update_xaxes(**axis_style)
        fig.update_yaxes(**axis_style)
        # If we are plotting on the full dataset, use smaller markers
        if loader is full_loader:
            fig.update_traces(
                marker=dict(
                    size=4,
                ),
            )
            fig.write_image(f"{save_path_full}/{model_name}-full.png", scale=3)
        else:
            fig.write_image(f"{save_path_subset}/{model_name}-sub.png", scale=3)
        fig.show("png")

In [None]:
# Semi-supervised variant of the embedding plots: for every model checkpoint
# and each dataset, UMAP and DensMAP are fitted with only a stratified fraction
# of the failure-code labels visible (the rest are masked as -1), and one
# figure is saved per label fraction.
ckpt_dir = "../models/new_knn/"
ckpt_file_end = "checkpoints/epoch=149-step=87450.ckpt"

save_path_full = "../reports/figures/UMAP/full"
save_path_subset = "../reports/figures/UMAP/subset"

os.makedirs(save_path_full, exist_ok=True)
os.makedirs(save_path_subset, exist_ok=True)

# Axis styling shared by the x- and y-axes of both subplots:
# no grid/ticks/labels, just a black frame around each panel.
axis_style = dict(
    showgrid=False,
    showticklabels=False,
    ticks="",
    zeroline=False,
    showline=True,
    linewidth=2.4,
    linecolor="black",
    mirror="allticks",
)

# Sorted so the processing order does not depend on the filesystem
for folder in sorted(os.listdir(ckpt_dir)):
    # Full path, i.e. ../models/new_knn/MAE/checkpoints/epoch=149-step=87450.ckpt
    ckpt_path = os.path.join(ckpt_dir, folder, ckpt_file_end)
    if not os.path.isfile(ckpt_path):
        # Stray files or runs without the expected checkpoint should not abort the loop
        print(f"Skipping {folder}: no checkpoint found at {ckpt_path}")
        continue

    # The folder name is the model name, i.e. MAE. Using it directly avoids
    # parsing the joined path, whose separator differs between platforms.
    model_name = folder
    print(model_name)
    # NOTE(review): "MAE2" is mapped onto "MAE", so if both folders exist the
    # figures of the later one overwrite those of the earlier one.
    if model_name == "MAE2":
        model_name = "MAE"
    # Get the model class (imported at the top of the notebook) using the model_name
    ModelClass = getattr(sys.modules[__name__], model_name)

    # load_from_checkpoint is a classmethod: call it on the class instead of
    # building a throwaway instance first
    model = ModelClass.load_from_checkpoint(ckpt_path)

    trainer = pl.Trainer(
        accelerator="gpu",
        logger=False,
        inference_mode=True,
        precision="16-mixed",
        enable_progress_bar=False,
    )

    # For each dataloader (paired with the dataframe it was built from),
    for loader, df in zip([subset_loader, full_loader], [df_subset, df_full]):
        # Perform inference and stack the per-batch predictions into one array
        preds = trainer.predict(model, loader)
        preds = torch.cat(preds).cpu().numpy()

        # Standardize each feature before dimensionality reduction
        scaler = StandardScaler()
        preds = scaler.fit_transform(preds)

        for label_frac in [0.1, 0.25, 0.5, 0.75, 0.99]:
            # Give UMAP only `label_frac` of the labels for semi-supervised
            # dimensionality reduction; the same labels are used for both UMAP
            # and DensMAP. The visible labels are a stratified sample of
            # df.failureCode; everything that isn't selected is set to -1.
            labels = df.failureCode.copy()
            train, test = train_test_split(
                labels, train_size=label_frac, random_state=42, stratify=labels
            )

            # Everything in the test set is now labeled as -1
            test[:] = -1

            # Now concatenate the train and test sets and reindex such that it
            # matches the row order of the original dataframe
            labels = pd.concat([train, test]).reindex_like(labels)

            # Create a UMAP embedding
            # NOTE(review): the embedding is obtained with fit(..., y=labels)
            # followed by transform() on the same data rather than
            # fit_transform() -- confirm this is intended.
            reducer = umap.UMAP(random_state=0)
            reducer.fit(preds, y=labels)
            embeddings = reducer.transform(preds)
            umap_df = pd.DataFrame(embeddings, columns=["umap_x", "umap_y"])

            # Create a DensMAP embedding
            reducer = umap.UMAP(random_state=0, densmap=True, dens_lambda=1)
            reducer.fit(preds, y=labels)
            dense_embeddings = reducer.transform(preds)
            densmap_df = pd.DataFrame(dense_embeddings, columns=["densmap_x", "densmap_y"])

            # Create a dataframe with the embeddings and the failureType.
            # The loaders are not shuffled, so rows line up with `df` positionally.
            emb_df = pd.concat([umap_df, densmap_df], axis=1)
            emb_df["failureType"] = df.failureType.values

            # Plot the embeddings side-by-side

            # Initialize subplots
            fig = make_subplots(
                rows=1,
                cols=2,
                subplot_titles=(
                    f"{model_name} Semi-Supervised UMAP",
                    f"{model_name} Semi-Supervised DensMAP",
                ),
                horizontal_spacing=0.01,
            )

            # Initialize individual figures
            fig1 = px.scatter(
                emb_df,
                x="umap_x",
                y="umap_y",
                color="failureType",
            )
            fig2 = px.scatter(
                emb_df,
                x="densmap_x",
                y="densmap_y",
                color="failureType",
            )

            # To prevent the legend from showing up twice, disable it for the second plot
            for trace in fig2.data:
                trace.update(showlegend=False)
            # Now, add the traces to the figure
            for trace1, trace2 in zip(fig1.data, fig2.data):
                fig.add_trace(trace1, row=1, col=1)
                fig.add_trace(trace2, row=1, col=2)
            # Update the layout
            fig.update_layout(
                height=600,
                width=1500,
                legend_title="Failure Type",
                margin=dict(r=200, t=40, b=0, l=0),
                legend={"itemsizing": "constant"},
                font=dict(family="Arial", size=24),
                xaxis_title="",
                yaxis_title="",
                template="simple_white",
            )
            # Increase font size of subplot titles
            fig.for_each_annotation(lambda a: a.update(font=dict(family="Arial", size=26)))
            fig.update_xaxes(**axis_style)
            fig.update_yaxes(**axis_style)
            # If we are plotting on the full dataset, use smaller markers
            if loader is full_loader:
                fig.update_traces(
                    marker=dict(
                        size=4,
                    ),
                )
                fig.write_image(f"{save_path_full}/{model_name}-full-{label_frac}.png", scale=3)
            else:
                fig.write_image(f"{save_path_subset}/{model_name}-sub-{label_frac}.png", scale=3)
            fig.show("png")

In [None]:
# Save the standardized predictions of every model, together with the wafer
# maps and labels, as xz-compressed pickles under ../data/interim/model_preds.
# `ckpt_dir` and `ckpt_file_end` are the values defined in the plotting cells above.
os.makedirs("../data/interim/model_preds", exist_ok=True)

# Sorted so the processing order does not depend on the filesystem
for folder in sorted(os.listdir(ckpt_dir)):
    # Full path, i.e. ../models/new_knn/MAE/checkpoints/epoch=149-step=87450.ckpt
    ckpt_path = os.path.join(ckpt_dir, folder, ckpt_file_end)
    if not os.path.isfile(ckpt_path):
        # Stray files or runs without the expected checkpoint should not abort the loop
        print(f"Skipping {folder}: no checkpoint found at {ckpt_path}")
        continue

    # The folder name is the model name, i.e. MAE. Using it directly avoids
    # parsing the joined path, whose separator differs between platforms.
    model_name = folder
    print(model_name)
    # NOTE(review): "MAE2" is mapped onto "MAE", so if both folders exist the
    # pickles of the later one overwrite those of the earlier one.
    if model_name == "MAE2":
        model_name = "MAE"
    # Get the model class (imported at the top of the notebook) using the model_name
    ModelClass = getattr(sys.modules[__name__], model_name)

    # load_from_checkpoint is a classmethod: call it on the class instead of
    # building a throwaway instance first
    model = ModelClass.load_from_checkpoint(ckpt_path)

    trainer = pl.Trainer(
        accelerator="gpu",
        logger=False,
        inference_mode=True,
        precision="16-mixed",
        enable_progress_bar=False,
    )

    # For each dataloader (paired with the dataframe it was built from),
    for loader, df in zip([subset_loader, full_loader], [df_subset, df_full]):
        # Perform inference and stack the per-batch predictions into one array
        preds = trainer.predict(model, loader)
        preds = torch.cat(preds).cpu().numpy()

        # Standardize each feature
        scaler = StandardScaler()
        preds = scaler.fit_transform(preds)

        # One row per wafer: feature columns followed by the wafer map and its
        # labels. The loaders are not shuffled, so rows line up with `df`.
        preds = pd.DataFrame(preds)
        preds["waferMap"] = df.waferMap.values
        preds["failureType"] = df.failureType.values
        preds["failureCode"] = df.failureCode.values

        if loader is full_loader:
            preds.to_pickle(
                f"../data/interim/model_preds/{model_name}_preds_full.pkl.xz", compression="xz"
            )
        else:
            preds.to_pickle(
                f"../data/interim/model_preds/{model_name}_preds_subset.pkl.xz", compression="xz"
            )

In [None]:
# Embed the subset predictions of a single FastSiam checkpoint (`bad_ckpt`)
# with UMAP and DensMAP.
# NOTE(review): this is a hard-coded absolute path on a local Windows drive, so
# the cell only runs on a machine where that checkpoint exists.
bad_ckpt = "D:\\Documents\\GitHub\\fastsiam-wafers\\scripts\\benchmark_logs\\wafermaps\\version_34\\FastSiamSymmetrized\\checkpoints\\epoch=199-step=31000.ckpt"
# load_from_checkpoint is a classmethod: call it on the class instead of
# building a throwaway instance first
model = FastSiam.load_from_checkpoint(bad_ckpt)

trainer = pl.Trainer(
    accelerator="gpu",
    logger=False,
    inference_mode=True,
    precision="16-mixed",
    enable_progress_bar=False,
)

# Perform inference on the subset and stack the per-batch predictions
preds = trainer.predict(model, subset_loader)
preds = torch.cat(preds).cpu().numpy()

# Standardize each feature before dimensionality reduction
scaler = StandardScaler()
preds = scaler.fit_transform(preds)

# Create a UMAP embedding
reducer = umap.UMAP(random_state=0)
embeddings = reducer.fit_transform(preds)
umap_df = pd.DataFrame(embeddings, columns=["umap_x", "umap_y"])

# Create a DensMAP embedding
reducer = umap.UMAP(random_state=0, densmap=True, dens_lambda=1)
dense_embeddings = reducer.fit_transform(preds)
densmap_df = pd.DataFrame(dense_embeddings, columns=["densmap_x", "densmap_y"])

# Create a dataframe with the embeddings and the failureType.
# subset_loader is not shuffled, so rows line up with df_subset positionally.
emb_df = pd.concat([umap_df, densmap_df], axis=1)
emb_df["failureType"] = df_subset.failureType.values


In [None]:
# Side-by-side UMAP / DensMAP scatter plots of the FastSiam embeddings in `emb_df`

# Two-panel canvas, one panel per embedding
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("FastSiam UMAP", "FastSiam DensMAP"),
    horizontal_spacing=0.01,
)

# One scatter figure per embedding, both colored by failure type
fig1 = px.scatter(emb_df, x="umap_x", y="umap_y", color="failureType")
fig2 = px.scatter(emb_df, x="densmap_x", y="densmap_y", color="failureType")

# Only the left panel contributes legend entries, otherwise each class would be listed twice
for right_trace in fig2.data:
    right_trace.update(showlegend=False)

# Copy the traces onto the shared canvas, alternating left and right panel
for left_trace, right_trace in zip(fig1.data, fig2.data):
    fig.add_trace(left_trace, row=1, col=1)
    fig.add_trace(right_trace, row=1, col=2)

# Overall figure layout
layout_options = dict(
    height=600,
    width=1500,
    legend_title="Failure Type",
    margin=dict(r=200, t=40, b=0, l=0),
    legend={"itemsizing": "constant"},
    font=dict(family="Arial", size=24),
    xaxis_title="",
    yaxis_title="",
    template="simple_white",
)
fig.update_layout(**layout_options)

# Subplot titles are annotations; make them slightly larger than the base font
title_font = dict(family="Arial", size=26)
fig.for_each_annotation(lambda annotation: annotation.update(font=title_font))

# Identical styling for x- and y-axes: no grid/ticks/labels, black frame around each panel
axis_style = dict(
    showgrid=False,
    showticklabels=False,
    ticks="",
    zeroline=False,
    showline=True,
    linewidth=2.4,
    linecolor="black",
    mirror="allticks",
)
fig.update_xaxes(**axis_style)
fig.update_yaxes(**axis_style)

# Save next to the other subset figures, then render a static PNG in the notebook
fig.write_image(f"{save_path_subset}/FastSiam-collapse-old-sub.png", scale=3)
fig.show("png")

In [None]:
# Density contours of the UMAP embedding per failure type, with marginal
# distributions (box plot along x, violin plot along y)
contour_options = dict(
    x="umap_x",
    y="umap_y",
    color="failureType",
    marginal_x="box",
    marginal_y="violin",
    width=800,
    height=600,
)
fig = px.density_contour(emb_df, **contour_options)

# Keep legend markers at a constant size
fig.update_layout(legend=dict(itemsizing="constant"))

# Render as a static SVG in the notebook
fig.show("svg")