In [1]:
import json
import torch
from torch.utils.data import Dataset
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

folder_path = "/content/drive/MyDrive/AI Final/"

if os.path.exists(folder_path):
    print("files :", os.listdir(folder_path))
else:
    if os.path.exists("/content/drive/MyDrive/"):
        print("\nHere is what is currently in your MyDrive:")
        print(os.listdir("/content/drive/MyDrive/"))

files : ['data_utils.py', 'run.sh', 'poshmark_scraper_selenium.py', 'requirements.txt', 'README.md', 'model_inference.py', 'poshmark_scraper.py', 'app.py', 'train_caption_res-2.csv', 'train_test_split.ipynb', 'vision_encoder_decoder-3.ipynb', 'datacleaning.ipynb', 'config.json', 'test_caption_res-2.csv', 'test_keys.txt', 'cleaned_captions.json', 'unzip_images.ipynb', 'train_keys.txt', 'generation_config.json', 'caption_cleaning.ipynb', '__pycache__', 'assets', 'final_image_processor', 'venv', 'final_tokenizer', 'DONOTUSE_captions_10k.json', 'fashion_cleaner_model', 'fashion_model_bundle.zip']


In [4]:
clean_path = "/content/drive/MyDrive/AI Final/cleaned_captions.json"
noisy_path = "/content/drive/MyDrive/AI Final/DONOTUSE_captions_10k.json"
train_keys_path = "/content/drive/MyDrive/AI Final/train_keys.txt"
test_keys_path = "/content/drive/MyDrive/AI Final/test_keys.txt"


with open(clean_path, "r") as f:
    clean_data = json.load(f)

with open(noisy_path, "r") as f:
    noisy_data = json.load(f)

with open(train_keys_path, "r") as f:
    train_keys = [line.strip() for line in f if line.strip() != ""]

with open(test_keys_path, "r") as f:
    test_keys = [line.strip() for line in f if line.strip() != ""]

valid_train_keys = [k for k in train_keys if k in clean_data and k in noisy_data]
valid_test_keys = [k for k in test_keys if k in clean_data and k in noisy_data]

print(f"train Pairs: {len(valid_train_keys)}")
print(f"Test Pairs: {len(valid_test_keys)}")

train Pairs: 7722
Test Pairs: 1931


In [5]:
import torch
from torch.utils.data import Dataset

class FashionPairDataset(Dataset):
    def __init__(self, keys, noisy_dict, clean_dict, tokenizer, max_length=128):
        self.keys = keys
        self.noisy_dict = noisy_dict
        self.clean_dict = clean_dict
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.keys)

    def __getitem__(self, idx):
        key = self.keys[idx]

        input_text = self.noisy_dict[key]

        target_text = self.clean_dict[key]

        inputs = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        targets = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs.input_ids.flatten(),
            "attention_mask": inputs.attention_mask.flatten(),
            "labels": targets.input_ids.flatten()
        }

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

train_dataset = FashionPairDataset(valid_train_keys, noisy_data, clean_data, tokenizer)
eval_dataset = FashionPairDataset(valid_test_keys, noisy_data, clean_data, tokenizer)

print("model + datasets done.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

model + datasets done.


In [7]:
import os

os.environ["WANDB_DISABLED"] = "true"


In [8]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./fashion_t5_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    dataloader_num_workers=2,         # i think 2 for now
    logging_steps=50,
    optim="adafactor",
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


  trainer = Seq2SeqTrainer(


In [9]:
trainer.train()

save_path = "/content/drive/MyDrive/AI Final/fashion_cleaner_model"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"model saved to: {save_path}")

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,0.0244,0.010287
2,0.0106,0.007059
3,0.0084,0.004235
4,0.0055,0.003672
5,0.0049,0.003412


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
  batch["labels"] = torch.tensor(b

model saved to: /content/drive/MyDrive/AI Final/fashion_cleaner_model


In [10]:
import random

sample_key = random.choice(valid_test_keys)

noisy_input = noisy_data[sample_key]
clean_target = clean_data[sample_key]

inputs = tokenizer(noisy_input, return_tensors="pt").input_ids
inputs = inputs.to(model.device)

outputs = model.generate(inputs, max_length=128)
predicted_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Image ID: {sample_key}")
print(f"noisy: {noisy_input}")
print(f"real target: {clean_target}")
print(f"model output:{predicted_text}")

Image ID: WOMEN-Blouses_Shirts-id_00003508-02_7_additional.jpg
noisy: Her tank top has no sleeves, cotton fabric and graphic patterns. There is a ring on her finger.
real target: top sleeves cotton fabric graphic patterns ring finger
model output:top sleeves cotton fabric graphic patterns ring finger


In [11]:
import torch
import torch.nn.functional as F

def get_feature_embeddings(texts, model, tokenizer, batch_size=32):

    model.eval()
    all_embeddings = []

    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i : i + batch_size]

        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).to(model.device)

        with torch.no_grad():

            encoder_outputs = model.encoder(**inputs)

            embeddings = encoder_outputs.last_hidden_state.mean(dim=1)

            embeddings = F.normalize(embeddings, p=2, dim=1)
            all_embeddings.append(embeddings.cpu())

    return torch.cat(all_embeddings, dim=0)


In [12]:
database_keys = valid_test_keys[:1000]
database_texts = [clean_data[k] for k in database_keys]

print(f"build index for {len(database_texts)}")

database_embeddings = get_feature_embeddings(database_texts, model, tokenizer)

print(f"shape: {database_embeddings.shape}")

build index for 1000
shape: torch.Size([1000, 512])


In [13]:
import random

query_idx = random.randint(0, len(database_texts) - 1)
query_text = database_texts[query_idx]
query_key = database_keys[query_idx]

query_embedding = database_embeddings[query_idx].unsqueeze(0)

scores = torch.mm(query_embedding, database_embeddings.T).squeeze(0)

top_k = 5
top_scores, top_indices = torch.topk(scores, k=top_k + 1) # +1 because  top 1 is the item itself

print(f"search query {query_text}\n")
print(f"rank similar items")

for rank, idx in enumerate(top_indices[1:], start=1):
    idx = idx.item()
    score = top_scores[rank].item()
    match_text = database_texts[idx]
    match_key = database_keys[idx]

    print(f"#{rank} (sscore: {score:.4f}): {match_text}")
    print(f" [id: {match_key}]\n")

search query upper clothing medium sleeves cotton fabric color block patterns lady ring

rank similar items
#1 (sscore: 0.9814): upper clothing long sleeves fabric graphic patterns lady ring
 [id: WOMEN-Cardigans-id_00003576-02_3_back.jpg]

#2 (sscore: 0.9789): upper clothing sleeves cotton fabric solid color patterns lady ring finger head
 [id: WOMEN-Tees_Tanks-id_00007639-04_7_additional.jpg]

#3 (sscore: 0.9746): upper clothing long sleeves fabric graphic patterns accessory wrist ring finger
 [id: WOMEN-Sweaters-id_00004454-02_7_additional.jpg]

#4 (sscore: 0.9735): upper clothing short sleeves cotton fabric graphic patterns ring finger clothing waist
 [id: WOMEN-Graphic_Tees-id_00001507-02_1_front.jpg]

#5 (sscore: 0.9725): upper clothing short sleeves cotton fabric stripe patterns ring finger accessory neck
 [id: WOMEN-Tees_Tanks-id_00003420-04_3_back.jpg]



In [14]:
import shutil
import os
from google.colab import files

folder_to_zip = "/content/drive/MyDrive/AI Final/fashion_cleaner_model"
data_file = "/content/drive/MyDrive/AI Final/cleaned_captions.json"
output_filename = "/content/fashion_model_bundle"

os.makedirs("model_bundle", exist_ok=True)

if os.path.exists(folder_to_zip):
    shutil.copytree(folder_to_zip, "model_bundle/fashion_cleaner_model")
    print("model foldr done.")
else:
    print("cannot find foldr")

if os.path.exists(data_file):
    shutil.copy(data_file, "model_bundle/cleaned_captions.json")
    print("data copied")
else:
    print("not found!")

shutil.make_archive(output_filename, 'zip', "model_bundle")

files.download(output_filename + ".zip")

print("everything done - yay")

model foldr done.
data copied


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

everything done - yay
