# Imports

In [None]:
import os
import random
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.datasets import SentencesDataset
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import time
from transformers import get_linear_schedule_with_warmup
from transformers import AutoTokenizer

# Google Drive Setup

In [None]:
# Colab-only setup: attach Google Drive under /content/drive so the project
# folder used by the following cells is reachable from the runtime.
# NOTE(review): force_remount=True presumably re-mounts even when a mount
# already exists -- confirm against the google.colab documentation.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# project root on the mounted Drive; later cells join the caption folder and
# the text_encoders checkpoint folder onto this path
datadir = "/content/drive/My Drive/CS444/Final_Project"
# make the project root the current working directory
os.chdir(datadir)
# notebook shell escape: print the working directory to confirm the chdir
!pwd

/content/drive/My Drive/CS444/Final_Project


# Text Encoder Evaluation

In [None]:
# load caption texts and labels from Flowers102 directories
def load_image_caption_texts(datadir, max_captions=5, num_classes=102):
    """Load per-image captions and class labels from the caption folders.

    The captions are read from ``<datadir>/text_c10/class_00001`` up to
    ``class_<num_classes>`` (five-digit, zero-padded). Every ``.txt`` file in
    a class folder holds the captions of one image, one caption per line.

    Args:
        datadir: Root directory that contains the ``text_c10`` folder.
        max_captions: Maximum number of non-blank captions kept per image.
        num_classes: Number of consecutive class folders to read, starting
            at ``class_00001``.

    Returns:
        A tuple ``(image_captions, image_labels)``. ``image_captions`` maps
        ``<caption file stem>.jpg`` to its list of stripped captions, and
        ``image_labels`` maps the same key to a zero-based class index.

    Raises:
        FileNotFoundError: If one of the expected class folders is missing.
    """
    caption_dir = os.path.join(datadir, 'text_c10')
    image_captions = {}
    image_labels = {}
    # class folders are numbered from 1; stored labels are zero-based
    for class_idx in range(1, num_classes + 1):
        class_dir = os.path.join(caption_dir, f'class_{class_idx:05d}')
        # sorted() keeps the result independent of the filesystem listing order
        for fname in sorted(os.listdir(class_dir)):
            stem, ext = os.path.splitext(fname)
            if ext != '.txt':
                continue
            # derive the image name from the full stem instead of a fixed-width
            # slice, so stems of any length map to the right key
            img_file = stem + '.jpg'
            # explicit encoding: do not depend on the platform default
            with open(os.path.join(class_dir, fname), 'r', encoding='utf-8') as f:
                lines = [line.strip() for line in f if line.strip()]
            image_captions[img_file] = lines[:max_captions]
            image_labels[img_file] = class_idx - 1
    return image_captions, image_labels

# load data: captions and zero-based class labels, both keyed by image file name
image_captions, image_labels = load_image_caption_texts(datadir)

In [None]:
# create train / test splits
def prepare_caption_samples(image_captions, image_labels, test_size=0.25, random_state=7):
    """Split images into train/test sets and flatten them into caption samples.

    The split is made per image (stratified by class label), so all captions
    of one image land on the same side. Each caption then becomes one sample
    carrying its image's label.

    Args:
        image_captions: Mapping from image name to its list of captions.
        image_labels: Mapping from image name to its class label.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(train_texts, train_labels, test_texts, test_labels, train_imgs,
        test_imgs)``.
    """
    def _flatten(image_names):
        # one (caption, label) pair per caption, walking the images in order
        pairs = [
            (caption, image_labels[image_name])
            for image_name in image_names
            for caption in image_captions[image_name]
        ]
        return [caption for caption, _ in pairs], [label for _, label in pairs]

    # sorted image names make the split input independent of dict order
    ordered_images = sorted(image_captions)
    strat_labels = [image_labels[image_name] for image_name in ordered_images]
    train_imgs, test_imgs, _, _ = train_test_split(
        ordered_images,
        strat_labels,
        test_size=test_size,
        stratify=strat_labels,
        random_state=random_state,
    )

    train_texts, train_labels = _flatten(train_imgs)
    test_texts, test_labels = _flatten(test_imgs)
    return train_texts, train_labels, test_texts, test_labels, train_imgs, test_imgs

# split data: per-caption texts/labels for each side, plus the image names of each side
train_texts, train_labels, test_texts, test_labels, train_imgs, test_imgs = prepare_caption_samples(image_captions, image_labels)

In [None]:
# test how well logistic regression can separate out embeddings by class
def classification_eval(model, train_texts, train_labels, test_texts, test_labels):
    """Return the test accuracy of a logistic-regression probe on embeddings.

    Both caption sets are embedded with ``model.encode``; a
    ``LogisticRegression`` is fitted on the training embeddings and scored
    with ``accuracy_score`` on the test embeddings.
    """
    # identical encode settings for both splits
    encode_options = dict(convert_to_tensor=False, batch_size=64)
    train_vectors = model.encode(train_texts, **encode_options)
    test_vectors = model.encode(test_texts, **encode_options)

    # linear probe on top of the frozen embeddings
    probe = LogisticRegression(max_iter=1000)
    probe.fit(train_vectors, train_labels)
    predictions = probe.predict(test_vectors)
    return accuracy_score(test_labels, predictions)

# create a supervised classifier head on top of the encoder for fine-tuning
class FlowerFineTuner(nn.Module):
    """Sentence encoder plus a linear classification head for fine-tuning.

    Args:
        embed_model: Encoder module. It must provide
            ``get_sentence_embedding_dimension()`` and, when called on a
            batch, return a mapping with a ``'sentence_embedding'`` entry.
        num_classes: Number of output classes of the linear head.
    """

    def __init__(self, embed_model, num_classes=102):
        super().__init__()
        # registered as a submodule, so training the tuner updates the encoder
        self.embed = embed_model
        embedding_dim = embed_model.get_sentence_embedding_dimension()
        self.classifier = nn.Linear(embedding_dim, num_classes)

    def forward(self, texts):
        """Return class logits for one batch.

        Args:
            texts: Batch handed straight to the encoder. Despite the name,
                the training loop in this file passes a dict of tokenizer
                tensors rather than raw strings.
        """
        # run the encoder's own forward pass so gradients stay enabled
        # (original note: needed to prevent no_grad); the module is called
        # instead of .forward() so hooks registered on the encoder still run
        out = self.embed(texts)
        return self.classifier(out['sentence_embedding'])

# create a dataset class for captions
class CaptionDataset(Dataset):
    """Dataset that pairs each caption with its class label by position."""

    def __init__(self, texts, labels):
        # parallel sequences: item i is (texts[i], labels[i])
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of captions."""
        return len(self.texts)

    def __getitem__(self, i):
        """Return the ``(caption, label)`` pair stored at position ``i``."""
        caption = self.texts[i]
        label = self.labels[i]
        return caption, label

# candidate sentence encoders to compare
model_names = [
    'sentence-transformers/all-MiniLM-L12-v2',
    'sentence-transformers/all-mpnet-base-v2',
    'sentence-transformers/multi-qa-mpnet-base-dot-v1',
]

# pretrained and fine-tuned checkpoints are both written below this folder
text_encoders_dir = os.path.join(datadir, 'text_encoders')

for name in model_names:
    base_name = name.split('/')[-1]
    pretrained_path = os.path.join(text_encoders_dir, f'pretrained_{base_name}')
    finetuned_path = os.path.join(text_encoders_dir, f'finetuned_{base_name}')

    # baseline: fetch the encoder, keep a copy, and probe it before training
    model = SentenceTransformer(name)
    model.save(pretrained_path)
    acc_pre = classification_eval(model, train_texts, train_labels, test_texts, test_labels)

    # hand-written collate step
    # (ran into many issues using smart_batching_collate)
    tokenizer = AutoTokenizer.from_pretrained(name)

    def collate_fn(batch):
        # batch is a list of (caption, label) pairs from CaptionDataset
        texts, labels = zip(*batch)
        features = tokenizer(
            list(texts),
            padding=True,
            truncation=True,
            max_length=64,
            return_tensors='pt',
        )
        return features, torch.tensor(labels, dtype=torch.long)

    # shuffled mini-batches over every training caption
    train_ds = CaptionDataset(train_texts, train_labels)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)

    # encoder + classification head, on the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tuner = FlowerFineTuner(model, num_classes=102).to(device)

    optimizer = torch.optim.AdamW(tuner.parameters(), lr=1e-5)
    criterion = nn.CrossEntropyLoss()

    # fine-tune for 4 epochs
    for epoch in range(4):
        tuner.train()
        for features, labels in train_loader:
            labels = labels.to(device)
            features = {key: tensor.to(device) for key, tensor in features.items()}
            # clear the previous step's gradients before the new forward/backward pass
            optimizer.zero_grad()
            logits = tuner(features)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

    # the tuner wraps `model`, so training it has updated the encoder weights;
    # saving `model` therefore stores the fine-tuned encoder
    model.save(finetuned_path)
    # give Drive a moment to register the freshly written folder
    time.sleep(3)
    # reload the fine-tuned encoder from disk
    model = SentenceTransformer(finetuned_path)
    # same logistic-regression probe, now on the fine-tuned embeddings
    acc_post = classification_eval(model, train_texts, train_labels, test_texts, test_labels)
    print("")
    print("")
    print(f"{base_name}: Pre-transfer acc={acc_pre:.4f}, Post-transfer acc={acc_post:.4f}")
    print("")
    print("")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



all-MiniLM-L12-v2: Pre-transfer acc=0.3209, Post-transfer acc=0.3390




modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



all-mpnet-base-v2: Pre-transfer acc=0.3158, Post-transfer acc=0.3689




modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.71k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



multi-qa-mpnet-base-dot-v1: Pre-transfer acc=0.3732, Post-transfer acc=0.4085




In [None]:
# test how well logistic regression can separate out embeddings by class
def classification_eval(model, train_texts, train_labels, test_texts, test_labels):
    """Return the test accuracy of a logistic-regression probe on embeddings.

    Both caption sets are embedded with ``model.encode``; a
    ``LogisticRegression`` is fitted on the training embeddings and scored
    with ``accuracy_score`` on the test embeddings.
    """
    # identical encode settings for both splits
    encode_options = dict(convert_to_tensor=False, batch_size=64)
    train_vectors = model.encode(train_texts, **encode_options)
    test_vectors = model.encode(test_texts, **encode_options)

    # linear probe on top of the frozen embeddings
    probe = LogisticRegression(max_iter=1000)
    probe.fit(train_vectors, train_labels)
    predictions = probe.predict(test_vectors)
    return accuracy_score(test_labels, predictions)

# create a supervised classifier head on top of the encoder for fine-tuning
class FlowerFineTuner(nn.Module):
    """Sentence encoder plus a linear classification head for fine-tuning.

    Args:
        embed_model: Encoder module. It must provide
            ``get_sentence_embedding_dimension()`` and, when called on a
            batch, return a mapping with a ``'sentence_embedding'`` entry.
        num_classes: Number of output classes of the linear head.
    """

    def __init__(self, embed_model, num_classes=102):
        super().__init__()
        # registered as a submodule, so training the tuner updates the encoder
        self.embed = embed_model
        embedding_dim = embed_model.get_sentence_embedding_dimension()
        self.classifier = nn.Linear(embedding_dim, num_classes)

    def forward(self, texts):
        """Return class logits for one batch.

        Args:
            texts: Batch handed straight to the encoder. Despite the name,
                the training loop in this file passes a dict of tokenizer
                tensors rather than raw strings.
        """
        # run the encoder's own forward pass so gradients stay enabled
        # (original note: needed to prevent no_grad); the module is called
        # instead of .forward() so hooks registered on the encoder still run
        out = self.embed(texts)
        return self.classifier(out['sentence_embedding'])

# create a dataset class for captions
class CaptionDataset(Dataset):
    """Dataset that pairs each caption with its class label by position."""

    def __init__(self, texts, labels):
        # parallel sequences: item i is (texts[i], labels[i])
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of captions."""
        return len(self.texts)

    def __getitem__(self, i):
        """Return the ``(caption, label)`` pair stored at position ``i``."""
        caption = self.texts[i]
        label = self.labels[i]
        return caption, label

# candidate sentence encoders to compare
model_names = [
    'sentence-transformers/all-MiniLM-L12-v2',
    'sentence-transformers/all-mpnet-base-v2',
    'sentence-transformers/multi-qa-mpnet-base-dot-v1',
]

# pretrained and fine-tuned checkpoints are both written below this folder
text_encoders_dir = os.path.join(datadir, 'text_encoders')

for name in model_names:
    base_name = name.split('/')[-1]
    pretrained_path = os.path.join(text_encoders_dir, f'pretrained_{base_name}')
    finetuned_path = os.path.join(text_encoders_dir, f'finetuned10_{base_name}')

    # baseline: fetch the encoder, keep a copy, and probe it before training
    model = SentenceTransformer(name)
    model.save(pretrained_path)
    acc_pre = classification_eval(model, train_texts, train_labels, test_texts, test_labels)

    # hand-written collate step
    # (ran into many issues using smart_batching_collate)
    tokenizer = AutoTokenizer.from_pretrained(name)

    def collate_fn(batch):
        # batch is a list of (caption, label) pairs from CaptionDataset
        texts, labels = zip(*batch)
        features = tokenizer(
            list(texts),
            padding=True,
            truncation=True,
            max_length=64,
            return_tensors='pt',
        )
        return features, torch.tensor(labels, dtype=torch.long)

    # shuffled mini-batches over every training caption
    train_ds = CaptionDataset(train_texts, train_labels)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)

    # encoder + classification head, on the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tuner = FlowerFineTuner(model, num_classes=102).to(device)

    optimizer = torch.optim.AdamW(tuner.parameters(), lr=1e-5)
    criterion = nn.CrossEntropyLoss()

    # fine-tune for 10 epochs
    for epoch in range(10):
        tuner.train()
        for features, labels in train_loader:
            labels = labels.to(device)
            features = {key: tensor.to(device) for key, tensor in features.items()}
            # clear the previous step's gradients before the new forward/backward pass
            optimizer.zero_grad()
            logits = tuner(features)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

    # the tuner wraps `model`, so training it has updated the encoder weights;
    # saving `model` therefore stores the fine-tuned encoder
    model.save(finetuned_path)
    # give Drive a moment to register the freshly written folder
    time.sleep(3)
    # reload the fine-tuned encoder from disk
    model = SentenceTransformer(finetuned_path)
    # same logistic-regression probe, now on the fine-tuned embeddings
    acc_post = classification_eval(model, train_texts, train_labels, test_texts, test_labels)
    print("")
    print("")
    print(f"{base_name}: Pre-transfer acc={acc_pre:.4f}, Post-transfer acc={acc_post:.4f}")
    print("")
    print("")



all-MiniLM-L12-v2: Pre-transfer acc=0.3209, Post-transfer acc=0.3774




all-mpnet-base-v2: Pre-transfer acc=0.3158, Post-transfer acc=0.3955




multi-qa-mpnet-base-dot-v1: Pre-transfer acc=0.3732, Post-transfer acc=0.4227




In [None]:
# test how well logistic regression can separate out embeddings by class
def classification_eval(model, train_texts, train_labels, test_texts, test_labels):
    """Return the test accuracy of a logistic-regression probe on embeddings.

    Both caption sets are embedded with ``model.encode``; a
    ``LogisticRegression`` is fitted on the training embeddings and scored
    with ``accuracy_score`` on the test embeddings.
    """
    # identical encode settings for both splits
    encode_options = dict(convert_to_tensor=False, batch_size=64)
    train_vectors = model.encode(train_texts, **encode_options)
    test_vectors = model.encode(test_texts, **encode_options)

    # linear probe on top of the frozen embeddings
    probe = LogisticRegression(max_iter=1000)
    probe.fit(train_vectors, train_labels)
    predictions = probe.predict(test_vectors)
    return accuracy_score(test_labels, predictions)

# create a supervised classifier head on top of the encoder for fine-tuning
class FlowerFineTuner(nn.Module):
    """Sentence encoder plus a linear classification head for fine-tuning.

    Args:
        embed_model: Encoder module. It must provide
            ``get_sentence_embedding_dimension()`` and, when called on a
            batch, return a mapping with a ``'sentence_embedding'`` entry.
        num_classes: Number of output classes of the linear head.
    """

    def __init__(self, embed_model, num_classes=102):
        super().__init__()
        # registered as a submodule, so training the tuner updates the encoder
        self.embed = embed_model
        embedding_dim = embed_model.get_sentence_embedding_dimension()
        self.classifier = nn.Linear(embedding_dim, num_classes)

    def forward(self, texts):
        """Return class logits for one batch.

        Args:
            texts: Batch handed straight to the encoder. Despite the name,
                the training loop in this file passes a dict of tokenizer
                tensors rather than raw strings.
        """
        # run the encoder's own forward pass so gradients stay enabled
        # (original note: needed to prevent no_grad); the module is called
        # instead of .forward() so hooks registered on the encoder still run
        out = self.embed(texts)
        return self.classifier(out['sentence_embedding'])

# create a dataset class for captions
class CaptionDataset(Dataset):
    """Dataset that pairs each caption with its class label by position."""

    def __init__(self, texts, labels):
        # parallel sequences: item i is (texts[i], labels[i])
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of captions."""
        return len(self.texts)

    def __getitem__(self, i):
        """Return the ``(caption, label)`` pair stored at position ``i``."""
        caption = self.texts[i]
        label = self.labels[i]
        return caption, label

# only one encoder is fine-tuned in this longer run
model_names = [
    'sentence-transformers/multi-qa-mpnet-base-dot-v1',
]

# pretrained and fine-tuned checkpoints are both written below this folder
text_encoders_dir = os.path.join(datadir, 'text_encoders')

for name in model_names:
    base_name = name.split('/')[-1]
    pretrained_path = os.path.join(text_encoders_dir, f'pretrained_{base_name}')
    finetuned_path = os.path.join(text_encoders_dir, f'finetuned30_{base_name}')

    # fetch the encoder and keep a copy of the untouched weights
    model = SentenceTransformer(name)
    model.save(pretrained_path)
    # the baseline probe (classification_eval on the untouched model) is
    # skipped to save runtime; its accuracy is entered by hand instead
    acc_pre = 0.3732

    # hand-written collate step
    # (ran into many issues using smart_batching_collate)
    tokenizer = AutoTokenizer.from_pretrained(name)

    def collate_fn(batch):
        # batch is a list of (caption, label) pairs from CaptionDataset
        texts, labels = zip(*batch)
        features = tokenizer(
            list(texts),
            padding=True,
            truncation=True,
            max_length=64,
            return_tensors='pt',
        )
        return features, torch.tensor(labels, dtype=torch.long)

    # shuffled mini-batches over every training caption
    train_ds = CaptionDataset(train_texts, train_labels)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)

    # encoder + classification head, on the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tuner = FlowerFineTuner(model, num_classes=102).to(device)

    optimizer = torch.optim.AdamW(tuner.parameters(), lr=1e-5)
    criterion = nn.CrossEntropyLoss()

    # fine-tune for 30 epochs
    for epoch in range(30):
        tuner.train()
        for features, labels in train_loader:
            labels = labels.to(device)
            features = {key: tensor.to(device) for key, tensor in features.items()}
            # clear the previous step's gradients before the new forward/backward pass
            optimizer.zero_grad()
            logits = tuner(features)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

    # the tuner wraps `model`, so training it has updated the encoder weights;
    # saving `model` therefore stores the fine-tuned encoder
    model.save(finetuned_path)
    # give Drive a moment to register the freshly written folder
    time.sleep(3)
    # reload the fine-tuned encoder from disk
    model = SentenceTransformer(finetuned_path)
    # same logistic-regression probe, now on the fine-tuned embeddings
    acc_post = classification_eval(model, train_texts, train_labels, test_texts, test_labels)
    print("")
    print("")
    print(f"{base_name}: Pre-transfer acc={acc_pre:.4f}, Post-transfer acc={acc_post:.4f}")
    print("")
    print("")



multi-qa-mpnet-base-dot-v1: Pre-transfer acc=0.3732, Post-transfer acc=0.4041


