In [None]:
# --- Setup ---
!pip install -q sentence-transformers torchvision tqdm pandas scikit-learn

import os, io, requests
import pandas as pd
import numpy as np
import torch
from PIL import Image
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from torchvision import models, transforms
from sklearn.preprocessing import StandardScaler

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m131.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m104.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os, io, requests
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from torchvision import models, transforms
from sklearn.preprocessing import StandardScaler

# --- Device setup ---
# Preference order: CUDA GPU, then Apple-silicon MPS, then plain CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# --- Constants & I/O ---
# Input table, output table, and the directory used to cache downloaded images.
INPUT_CSV = 'new_data.csv'
OUTPUT_CSV = 'new_processed.csv'
IMG_CACHE = './_imgcache'

# Create the cache directory up front; a pre-existing directory is fine.
os.makedirs(IMG_CACHE, exist_ok=True)

# --- Load & clean ---
# Read the table without blank-filling: by default pandas parses empty CSV
# fields as NaN, which is what the row filter below needs to see.
df = pd.read_csv(INPUT_CSV, encoding='utf-8')
num_cols = [f'feat_{i}' for i in range(43)]

# Drop incomplete rows BEFORE filling blanks. Filling every NaN with '' first
# would hide missing image links from dropna and turn the filter into a no-op
# for those two columns.
df = df.dropna(subset=['feat_0','Album_Cover_Art','Artist_Image_Link'])

# Remaining gaps in the non-numeric columns become empty strings so later
# cells never see NaN there; gaps in the numeric features stay NaN.
other_cols = [c for c in df.columns if c not in num_cols]
df[other_cols] = df[other_cols].fillna('')

# Standardise the numeric feature columns (fit and applied on this same table).
df[num_cols] = StandardScaler().fit_transform(df[num_cols])

# --- Models & transforms ---
# Sentence encoder used for every text column.
text_model = SentenceTransformer('all-MiniLM-L6-v2', device=str(device))

# `pretrained=True` is the deprecated torchvision spelling; request the same
# ImageNet checkpoint explicitly through the weights enum instead.
base_cnn   = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# Keep every child module except the last one (the classification layer) so
# the network outputs pooled features instead of class scores; eval() puts
# layers such as batch-norm into inference mode.
img_model  = torch.nn.Sequential(*list(base_cnn.children())[:-1]).to(device).eval()

# Resize -> centre-crop to 224x224 -> tensor -> per-channel normalisation.
img_tf = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

# --- Batch text embeddings ---
# NOTE(review): 'Ground_Truth_Genre' is embedded here and is also written out
# as a plain column later in the notebook. If that column is the prediction
# target, these embeddings leak the label -- confirm this is intended.
text_cols = ['Artist_Genre','Last_FM_Tags','Lyrics','Ground_Truth_Genre']

# Encoding options shared by every column.
encode_opts = dict(
    batch_size=64,
    convert_to_numpy=True,
    device=str(device),
    show_progress_bar=True,
)

# One embedding matrix per text column, one row per row of `df`.
text_embs = {
    name: text_model.encode(df[name].tolist(), **encode_opts)
    for name in text_cols
}

# --- Image dataset for fast loading + caching ---
class ImageDataset(Dataset):
    """Map-style dataset that turns image URLs into transformed images.

    On first access an image is downloaded over HTTP and stored as a JPEG in
    ``cache_dir``; later accesses read the cached file. The cache file name is
    derived from the URL itself (not from the row position), so the cache
    stays valid when the set or order of rows changes, and a URL that occurs
    several times is downloaded only once.

    Entries that are missing, not strings, not HTTP(S) URLs, or that fail to
    download/decode yield an all-black 224x224 placeholder, so the dataset
    always has exactly one item per entry of ``urls``.

    Args:
        urls: Sequence of image URLs; empty or non-string entries are allowed.
        cache_dir: Existing directory in which downloaded images are cached.
        transform: Callable applied to the RGB PIL image before returning it.
        prefix: String prepended to every cache file name.
    """

    def __init__(self, urls, cache_dir, transform, prefix):
        self.urls      = urls
        self.cache_dir = cache_dir
        self.tf        = transform
        self.pref      = prefix

    def __len__(self):
        return len(self.urls)

    def _cache_path(self, url):
        """Return the cache file path for ``url`` (stable across runs)."""
        import hashlib  # local import keeps the class self-contained

        digest = hashlib.sha256(url.encode('utf-8')).hexdigest()
        return os.path.join(self.cache_dir, f"{self.pref}_{digest}.jpg")

    @staticmethod
    def _blank():
        """Return the black placeholder image used for unusable entries."""
        return Image.new('RGB', (224,224), color=(0,0,0))

    def __getitem__(self, idx):
        url = self.urls[idx]

        # No URL, a non-string cell (e.g. NaN), or not an HTTP URL -> placeholder.
        if not isinstance(url, str) or not url.lower().startswith("http"):
            return self.tf(self._blank())

        fname = self._cache_path(url)
        try:
            if os.path.exists(fname):
                # Context manager closes the file handle once pixels are loaded.
                with Image.open(fname) as cached:
                    img = cached.convert('RGB')
            else:
                resp = requests.get(url, timeout=5)
                # Treat 4xx/5xx as failures instead of trying to decode an error page.
                resp.raise_for_status()
                img = Image.open(io.BytesIO(resp.content)).convert('RGB')
                try:
                    # Write to a per-process temp file, then rename atomically:
                    # another DataLoader worker may be handling the same URL and
                    # must never observe a half-written cache file.
                    tmp_name = f"{fname}.{os.getpid()}.tmp"
                    img.save(tmp_name, format='JPEG', quality=85)
                    os.replace(tmp_name, fname)
                except OSError:
                    # The cache is only an optimisation; keep the decoded image.
                    pass
        except Exception:
            # Deliberate best-effort: on any download/IO/decode error fall back
            # to the placeholder so the output stays aligned with `urls`.
            img = self._blank()

        return self.tf(img)

# --- Batch image embeddings ---
# For each image-link column: stream the images through the truncated CNN in
# batches and stack the per-batch features into one matrix per column.
image_embs = {}
for img_col in ['Album_Cover_Art','Artist_Image_Link']:
    dataset = ImageDataset(df[img_col].tolist(), IMG_CACHE, img_tf, prefix=img_col)
    # The loader is not shuffled, so feature rows come out in the order of `df`.
    batches = DataLoader(dataset, batch_size=64, num_workers=2, pin_memory=True)

    chunks = []
    with torch.no_grad():  # inference only, no autograd bookkeeping
        for imgs in tqdm(batches, desc=f"Embed {img_col}"):
            pooled = img_model(imgs.to(device))
            # Remove the two trailing singleton dimensions of the pooled output.
            flat = pooled.squeeze(-1).squeeze(-1)
            chunks.append(flat.cpu().numpy())

    image_embs[img_col] = np.vstack(chunks)  # one row per image

Using device: cuda


  df[num_cols] = df[num_cols].replace('', np.nan)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 216MB/s]


Batches:   0%|          | 0/208 [00:00<?, ?it/s]

Batches:   0%|          | 0/208 [00:00<?, ?it/s]

Batches:   0%|          | 0/208 [00:00<?, ?it/s]

Batches:   0%|          | 0/208 [00:00<?, ?it/s]

Embed Album_Cover_Art: 100%|██████████| 208/208 [07:01<00:00,  2.03s/it]
Embed Artist_Image_Link: 100%|██████████| 208/208 [06:11<00:00,  1.79s/it]


In [None]:
# --- Before assembling ---
# Positional index 0..n-1 so that row i of `df` lines up with arr[i].
df = df.reset_index(drop=True)

n_rows = len(df)

# Number of leading rows for which EVERY embedding matrix has an entry.
n_valid = n_rows
for name, arr in {**text_embs, **image_embs}.items():
    if len(arr) != n_rows:
        print(f"❌ Length mismatch in {name}: expected {n_rows}, got {len(arr)}")
        n_valid = min(n_valid, len(arr))

# Boolean view of the same decision, kept for inspection: True for kept rows.
valid_mask = np.zeros(n_rows, dtype=bool)
valid_mask[:n_valid] = True

# --- Filter everything down to only rows with valid embeddings ---
# Slice by prefix length rather than indexing with the mask: a boolean mask of
# length n_rows cannot index an array whose length differs from n_rows, which
# is exactly the case being handled here.
# NOTE(review): this assumes a short array is missing rows at the END -- confirm.
df = df.iloc[:n_valid].reset_index(drop=True)
for k in text_embs:
    text_embs[k] = text_embs[k][:n_valid]
for k in image_embs:
    image_embs[k] = image_embs[k][:n_valid]

# --- Assemble final records ---
# Build the output column-block by column-block instead of one Python dict per
# row: each embedding matrix becomes a DataFrame with `<col>_emb_<j>` columns,
# and all blocks are concatenated side by side in a single step.
emb_frames = []
for col, arr in list(text_embs.items()) + list(image_embs.items()):
    if len(arr) != len(df):
        # Fail loudly: a silent misalignment would attach features to wrong rows.
        raise ValueError(
            f"Embedding '{col}' has {len(arr)} rows but the table has {len(df)}"
        )
    emb_frames.append(
        pd.DataFrame(
            # Lossless upcast so serialisation does not depend on the embedding dtype.
            np.asarray(arr, dtype=np.float64),
            columns=[f'{col}_emb_{j}' for j in range(arr.shape[1])],
        )
    )

# --- Numeric + metadata ---
# reset_index so the positional rows line up with the embedding frames.
meta = df[num_cols + ['Ground_Truth_Genre', 'Release_Year']].reset_index(drop=True)

# Column order: text embeddings, image embeddings, numeric features, metadata.
out_df = pd.concat(emb_frames + [meta], axis=1)

# --- Save ---
out_df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Done! Saved to {OUTPUT_CSV}")

Building records: 100%|██████████| 13277/13277 [00:37<00:00, 357.17it/s]


✅ Done! Saved to new_processed.csv
