# Train Custom Diffusion Priors

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before
# each cell executes, so edits to imported .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2


# Add the parent directory to the import path
# (presumably where the project modules imported in the next cell live — confirm).
import sys
sys.path.append('../')

In [2]:
from diffusers import DDPMScheduler
import pandas as pd
import torch
from torch.utils.data import DataLoader

from Datasets import RecommenderUserSampler, EmbeddingsDataset
from grid_search import run_grid_search
from prior_models import TransformerEmbeddingDiffusionModelv2
from train_priors import train_diffusion_prior
from utils import map_embeddings_to_ratings, split_recommender_data, set_seeds

## Load Data

Load the data from its corresponding (sub)directory and map image embeddings to observations.
The data in ratings.csv constitutes our observations; for our purposes, it consists of
the triplets $(U_i, S_j, I_k)$, where $U_i$ corresponds to user $i$, $S_j$ encodes whether the user likes $(\text{score}\geq 4)$ or dislikes $(\text{score}< 4)$ the image, and $I_k$ is the $k$-th image.

In [3]:
# Pre-computed image embeddings; weights_only=True restricts unpickling to tensor data.
image_features = torch.load("../data/flickr/processed/ip-adapters/SD15/sd15_image_embeddings.pt", weights_only=True)
# One row per rating observation.
ratings_df = pd.read_csv("../data/flickr/processed/ratings.csv")
# Align embeddings with the ratings table (project helper; result is indexed by
# `original_index` in later cells).
expanded_features = map_embeddings_to_ratings(image_features, ratings_df)
# Fall back to CPU when no GPU is present instead of failing later on `.to(device)`;
# this matches the availability check used in the training-setup cell.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Minimum number of liked ratings (score >= 4) a user must have to be kept.
usr_threshold = 100

# Count liked ratings per user.
liked_counts = (
    ratings_df[ratings_df["score"] >= 4]
    .groupby("worker_id")["score"]
    .count()
    .reset_index(name="liked_count")
)
# Users meeting the threshold; computed once and shared under both names
# because the id-remapping cell reads `valid_worker_id`.
valid_users = liked_counts.loc[liked_counts["liked_count"] >= usr_threshold, "worker_id"].unique()
valid_worker_id = valid_users
filtered_ratings_df = ratings_df[ratings_df["worker_id"].isin(valid_users)].copy()
# Derive the total user count from the data rather than hard-coding it.
total_users = ratings_df["worker_id"].nunique()
print(f"User loss: {total_users - len(valid_users)}")
print(f"Data loss: {100*(1 - filtered_ratings_df.shape[0]/ratings_df.shape[0])}%")

User loss: 116
Data loss: 7.281789573930686%


In [5]:
# Map the original worker ids to contiguous integer ids 0..N-1.
worker_mapping = {old_id: new_id for new_id, old_id in enumerate(valid_worker_id)}
# Rename only on the first run: re-running the cell would otherwise create a
# second `old_worker_id` column and break the `.map` call below.
if "old_worker_id" not in filtered_ratings_df.columns:
    filtered_ratings_df = filtered_ratings_df.rename(columns={"worker_id": "old_worker_id"})
filtered_ratings_df["worker_id"] = filtered_ratings_df["old_worker_id"].map(worker_mapping)
# Persist the id mapping and the filtered ratings for later use.
worker_mapping_df = pd.DataFrame(list(worker_mapping.items()), columns=["old_worker_id", "worker_id"])
worker_mapping_df.to_csv("../data/flickr/processed/worker_id_mapping_usrthr_100.csv", index=False)
filtered_ratings_df.to_csv("../data/flickr/processed/filtered_ratings_df_usrthrs_100.csv", index=False)

In [6]:
# Split the filtered ratings into train / validation / test frames with a fixed seed.
# (val_spu / test_spu presumably mean "samples per user" held out — confirm in utils.)
train_df, val_df, test_df = split_recommender_data(
    ratings_df=filtered_ratings_df, val_spu=10, test_spu=10, seed=42
)

Train set size: 177278
Validation set size: 928
Evaluation set size: 933


In [7]:
# Number of training rows per user, smallest first.
train_df.worker_id.value_counts(ascending=True)


worker_id
40      201
52      208
36      208
72      210
67      258
      ...  
49     8064
20    11064
22    11343
87    17320
28    17875
Name: count, Length: 94, dtype: int64

In [8]:
# Write each split's rating table to CSV.
for split_df, csv_path in (
    (train_df, "../data/flickr/processed/train_usrthrs_100.csv"),
    (val_df, "../data/flickr/processed/validation_usrthrs_100.csv"),
    (test_df, "../data/flickr/processed/test_usrthrs_100.csv"),
):
    split_df.to_csv(csv_path, index=False)

# Save the image embeddings selected by each split's `original_index`.
for split_df, embeddings_path in (
    (train_df, "../data/flickr/processed/train/train_ie_usrthrs_100.pth"),
    (val_df, "../data/flickr/processed/train/validation_ie_usrthrs_100.pth"),
    (test_df, "../data/flickr/processed/test/test_ie_usrthrs_100.pth"),
):
    torch.save(expanded_features[split_df.original_index], embeddings_path)


In [9]:
# Shape of the embeddings selected for the training split.
expanded_features[train_df["original_index"]].shape

torch.Size([177278, 1024])

In [10]:
# Pair each split's rating rows with the embeddings selected by its `original_index`.
train_dataset = EmbeddingsDataset(train_df, image_embeddings=expanded_features[train_df["original_index"]])
val_dataset = EmbeddingsDataset(val_df, image_embeddings=expanded_features[val_df["original_index"]])

In [11]:
# Build the diffusion prior and move it to the selected device.
diffusion_prior_model = TransformerEmbeddingDiffusionModelv2(
    img_embed_dim=1024,
    # Derived from the data instead of hard-coded, so the user embedding
    # table always covers every (remapped) worker id.
    num_users=filtered_ratings_df["worker_id"].nunique(),
    n_heads=16,
    num_tokens=1,
    num_user_tokens=4,
    num_layers=8,
    dim_feedforward=2048,
    whether_use_user_embeddings=True
).to(device)



In [12]:
# Seed first so sampler/dataloader construction is reproducible.
set_seeds(0)
batch_size = 64
samples_per_user = 200
learning_rate = 1e-4
unique_users = filtered_ratings_df["worker_id"].unique()
train_user_sampler = RecommenderUserSampler(train_df, num_users=len(unique_users), samples_per_user=samples_per_user)

train_dataloader = DataLoader(train_dataset, sampler=train_user_sampler, batch_size=batch_size)
# Named `test_dataloader` but wraps the validation set; it is passed as `val_dataloader` to training.
test_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

# Resolve the device before anything depends on it, and make sure the model
# actually lives there (a no-op when it was already moved to the same device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
diffusion_prior_model = diffusion_prior_model.to(device)

diffusion_optimizer = torch.optim.AdamW(diffusion_prior_model.parameters(), lr=learning_rate, weight_decay=1e-5)
noise_scheduler = DDPMScheduler(num_train_timesteps=6000)
# Halve the learning rate after 5 epochs without improvement of the monitored metric.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(diffusion_optimizer, 'min', patience=5, factor=0.5)

total_params = sum(p.numel() for p in diffusion_prior_model.parameters())
trainable_params = sum(p.numel() for p in diffusion_prior_model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")


# Plain string: the path has no placeholders, so no f-prefix is needed.
savepath = "../data/flickr/evaluation/diffusion_priors/sd15_ied1024_nu94_nh16_nit1_nut4_nl8_dff2048.pth"


Total parameters: 68641792
Trainable parameters: 68641792


In [15]:
# Train the prior; `return_losses=True` hands back the train and validation loss histories.
train_loss, val_loss = train_diffusion_prior(
    model=diffusion_prior_model,
    noise_scheduler=noise_scheduler,
    train_dataloader=train_dataloader,
    val_dataloader=test_dataloader,  # wraps the validation dataset
    optimizer=diffusion_optimizer,
    scheduler=scheduler,
    num_unique_users=len(unique_users),
    objective="noise-pred",
    device=device,
    num_epochs=2001,  # upper bound on the number of epochs
    patience=20,
    savepath=savepath,
    return_losses=True,
    verbose=True,
)

Epoch 1/2001, Time Elapsed: 7.30s, Train Loss: 1.3213, Val Loss: 0.9947, Grad Norm: 4.3423
Epoch 2/2001, Time Elapsed: 13.89s, Train Loss: 0.8492, Val Loss: 0.6798, Grad Norm: 2.5805
Epoch 3/2001, Time Elapsed: 20.82s, Train Loss: 0.6410, Val Loss: 0.5304, Grad Norm: 2.2285
Epoch 4/2001, Time Elapsed: 27.84s, Train Loss: 0.5227, Val Loss: 0.4157, Grad Norm: 2.0213
Epoch 5/2001, Time Elapsed: 34.87s, Train Loss: 0.4362, Val Loss: 0.3372, Grad Norm: 1.8596
Epoch 6/2001, Time Elapsed: 41.93s, Train Loss: 0.3761, Val Loss: 0.2845, Grad Norm: 1.7472
Epoch 7/2001, Time Elapsed: 48.97s, Train Loss: 0.3249, Val Loss: 0.2391, Grad Norm: 1.6489
Epoch 8/2001, Time Elapsed: 56.03s, Train Loss: 0.2869, Val Loss: 0.2036, Grad Norm: 1.5609
Epoch 9/2001, Time Elapsed: 62.99s, Train Loss: 0.2580, Val Loss: 0.1760, Grad Norm: 1.4794
Epoch 10/2001, Time Elapsed: 70.01s, Train Loss: 0.2323, Val Loss: 0.1686, Grad Norm: 1.4104
Epoch 11/2001, Time Elapsed: 77.03s, Train Loss: 0.2112, Val Loss: 0.1604, Grad 

## Or we may run large-scale experiments

In [16]:
# Hyperparameter grid passed to run_grid_search; each key lists the values to try.
# Only `clip_sample` and `rescale_betas` have more than one value here.
param_grid = {
    # Model / diffusion configuration
    "timesteps": [6000],
    "layers": [8],
    "heads": [16],
    "dim_feedforward": [2048],
    "num_image_tokens": [1],
    "num_user_tokens": [4],
    # Optimisation (alternatives considered: optimizers 'adamw'/'sgd',
    # schedulers 'reduce_on_plateau'/'cosine')
    "learning_rate": [1e-4],
    "optimizers": ["adamw"],
    "schedulers": ["reduce_on_plateau"],
    "batch_size": [64],
    # Noise schedule and sampling
    "noise_schedule": ["linear"],
    "samples_per_user": [200],
    "clip_sample": [True, False],
    "rescale_betas": [True, False],
    "objective": ["noise-pred"],
    "use_ue": [True],
    "img_embed_dim": [1024],
}

# Output directory handed to run_grid_search.
savedir = "../data/flickr/evaluation/diffusion_priors/models/weights/experiment_2"

In [14]:
import os
# Check whether a weights directory exists on disk (displayed as the cell output).
# NOTE(review): this checks experiment_1, while `savedir` in this notebook points to
# experiment_2 — confirm which directory is intended.
os.path.exists("../data/flickr/evaluation/diffusion_priors/models/weights/experiment_1")

True

In [17]:
# Run every configuration in `param_grid`, passing `savedir` as the output location.
run_grid_search(
    train_df=train_df,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    param_grid=param_grid,
    savedir=savedir,
    unique_users=len(unique_users),
)



Running configuration: timesteps=6000, layers=8, heads=16, image_tokens=1, user_tokens=4, learning_rate=0.0001, clip_sample=True, rescale_betas=True, optimizer=adamw, scheduler=reduce_on_plateau, batch_size=64, noise_schedule=linear, samples_per_user=200, objective=noise-pred, use_ue=True


Hyperparameter combinations:  25%|██▌       | 1/4 [33:33<1:40:40, 2013.42s/it]

Early stopping with best val loss: 0.04021679734190305!
Running configuration: timesteps=6000, layers=8, heads=16, image_tokens=1, user_tokens=4, learning_rate=0.0001, clip_sample=True, rescale_betas=False, optimizer=adamw, scheduler=reduce_on_plateau, batch_size=64, noise_schedule=linear, samples_per_user=200, objective=noise-pred, use_ue=True


Hyperparameter combinations:  50%|█████     | 2/4 [1:12:26<1:13:23, 2201.67s/it]

Early stopping with best val loss: 0.04478246457874775!
Running configuration: timesteps=6000, layers=8, heads=16, image_tokens=1, user_tokens=4, learning_rate=0.0001, clip_sample=False, rescale_betas=True, optimizer=adamw, scheduler=reduce_on_plateau, batch_size=64, noise_schedule=linear, samples_per_user=200, objective=noise-pred, use_ue=True


Hyperparameter combinations:  75%|███████▌  | 3/4 [1:38:36<31:52, 1912.95s/it]  

Early stopping with best val loss: 0.0462913886954387!
Running configuration: timesteps=6000, layers=8, heads=16, image_tokens=1, user_tokens=4, learning_rate=0.0001, clip_sample=False, rescale_betas=False, optimizer=adamw, scheduler=reduce_on_plateau, batch_size=64, noise_schedule=linear, samples_per_user=200, objective=noise-pred, use_ue=True


Hyperparameter combinations: 100%|██████████| 4/4 [1:57:17<00:00, 1759.27s/it]

Early stopping with best val loss: 0.04166508552928765!
Experimentation complete. Results saved to results.csv at ../data/flickr/evaluation/diffusion_priors/models/weights/experiment_2



