In [2]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; force_remount=True requests a
# remount even when a mount is already present at this path.
mount_point = '/content/drive'
print("--> Attempting to force remount Google Drive...")
drive.mount(mount_point, force_remount=True)
print("--> Google Drive mounted successfully!")

--> Attempting to force remount Google Drive...
Mounted at /content/drive
--> Google Drive mounted successfully!


In [9]:
# Block to re-run: Paste your CORRECT path and execute.

import pandas as pd
import numpy as np
import os
import re
import gc
from scipy.sparse import hstack, csr_matrix, save_npz
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from io import BytesIO
import requests
from tqdm import tqdm

# --- STEP 0: DEFINE ALL FUNCTIONS AND MODELS ---
# Progress banner for the setup part of this cell: the helper functions, the
# TF-IDF vectorizer, the ResNet-50 feature extractor and the image transform
# are all created below before any data is read.
print("Step 0: Defining all necessary functions and loading models...")
def extract_pack_quantity(text):
    """Extract the numeric quantity that follows a ``Value:`` label in ``text``.

    Args:
        text: Catalog text to search. Non-string input (e.g. ``None`` or NaN
            from a missing cell) is tolerated.

    Returns:
        float: The first well-formed decimal number found after ``Value:``,
        or ``1.0`` when ``text`` is not a string or no such number exists.
    """
    if not isinstance(text, str):
        return 1.0
    # Only accept well-formed decimals ("12", "12.", "2.5", ".5"). The previous
    # pattern ``[\d.]+`` also captured strings such as "." or "1.2.3", which
    # made float() raise ValueError and abort the whole DataFrame.apply call.
    match = re.search(r'Value:\s*(\d+(?:\.\d*)?|\.\d+)', text)
    return float(match.group(1)) if match else 1.0

def get_image_features(image_path, model):
    """Compute a feature vector for the image stored at ``image_path``.

    The image is opened, converted to RGB, passed through the module-level
    ``preprocess`` transform, given a leading batch dimension and fed to
    ``model`` with gradient tracking disabled.

    Args:
        image_path: Path of the image file to read.
        model: Callable applied to the single-image batch.

    Returns:
        numpy.ndarray: The squeezed model output, or ``np.zeros(2048)`` if any
        step raises (missing file, unreadable image, model error, ...).
        NOTE(review): 2048 presumably matches the model's output width - confirm.
    """
    try:
        rgb_image = Image.open(image_path).convert('RGB')
        single_batch = preprocess(rgb_image).unsqueeze(0)
        with torch.no_grad():
            embedding = model(single_batch)
        return embedding.squeeze().numpy()
    except Exception:
        # Best-effort: any failure yields an all-zero placeholder vector.
        return np.zeros(2048)

# TF-IDF over unigrams and bigrams, capped at 20,000 features, English stop words removed.
vectorizer = TfidfVectorizer(
    max_features=20000,
    stop_words='english',
    ngram_range=(1, 2),
)

# ResNet-50 with its last child module dropped, so the network is used as a
# feature extractor rather than a classifier; switched to eval mode for inference.
model = torch.nn.Sequential(
    *list(models.resnet50(weights=models.ResNet50_Weights.DEFAULT).children())[:-1]
)
model.eval()

# Image pipeline: resize, centre-crop to 224, convert to tensor, then normalise per channel.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# --- STEP 1: PROCESS IN CHUNKS AND SAVE TO GOOGLE DRIVE ---
# Reads TRAIN_FILE in CHUNK_SIZE-row pieces. For each piece it builds TF-IDF
# text features, the pack-quantity feature and image features, stacks them
# into one sparse matrix and saves it (with the log-price labels) as
# features_chunk_<i>.npz in FEATURES_DIR. Re-running the cell resumes after
# the chunks that are already saved.
print("\nStep 1: Starting the chunking process...")
CHUNK_SIZE = 2500
# !!! PASTE THE PATH YOU COPIED FROM THE FILE EXPLORER HERE !!!
TRAIN_FILE = '/content/drive/MyDrive/ML DATASET/student_resource/dataset/train.csv'
IMAGE_DIR = '/content/drive/MyDrive/ML_Competition/full_train_images'
FEATURES_DIR = '/content/drive/MyDrive/ML_Competition/processed_features'

os.makedirs(FEATURES_DIR, exist_ok=True)
os.makedirs(IMAGE_DIR, exist_ok=True)

try:
    # Count only saved feature chunks, so unrelated files in the directory
    # cannot shift the resume offset.
    # NOTE: resuming assumes chunks 0..existing_chunks-1 are all present.
    existing_chunks = sum(
        1 for name in os.listdir(FEATURES_DIR)
        if name.startswith("features_chunk_") and name.endswith(".npz")
    )
    if existing_chunks > 0:
        # Fixed: this branch previously referenced the undefined name
        # CHUNKS_SIZE, so every resumed run failed with a NameError.
        skip_rows = existing_chunks * CHUNK_SIZE
        print(f"--> RESUMING from after chunk {existing_chunks}. Skipping {skip_rows} rows.")
        print("--> Fitting vectorizer on first chunk to ensure consistency...")
        # Refit on the same rows a fresh run fits on (the first CHUNK_SIZE rows),
        # so the vocabulary matches the chunks that were already saved.
        temp_df = pd.read_csv(TRAIN_FILE, nrows=CHUNK_SIZE)
        temp_df['catalog_content'] = temp_df['catalog_content'].fillna('')
        vectorizer.fit(temp_df['catalog_content'])
        del temp_df

        # Row 0 is the header; skip data rows 1..skip_rows that are already processed.
        chunk_iterator = pd.read_csv(TRAIN_FILE, chunksize=CHUNK_SIZE, skiprows=range(1, skip_rows + 1))
    else:
        chunk_iterator = pd.read_csv(TRAIN_FILE, chunksize=CHUNK_SIZE)


    for i, df_chunk in enumerate(chunk_iterator, start=existing_chunks):
        print(f"\n--- Processing Chunk {i+1} ---")

        if i == 0 and existing_chunks == 0:
            print("--> Fitting TF-IDF on the first chunk...")
            df_chunk['catalog_content'] = df_chunk['catalog_content'].fillna('')
            vectorizer.fit(df_chunk['catalog_content'])

        # Rows whose price cannot be parsed are dropped; the target is log1p(price).
        df_chunk['price'] = pd.to_numeric(df_chunk['price'], errors='coerce')
        df_chunk.dropna(subset=['price'], inplace=True)
        df_chunk['log_price'] = np.log1p(df_chunk['price'])
        df_chunk['item_pack_quantity'] = df_chunk['catalog_content'].apply(extract_pack_quantity)
        df_chunk['catalog_content'] = df_chunk['catalog_content'].fillna('')
        df_chunk['item_pack_quantity'] = df_chunk['item_pack_quantity'].fillna(1)
        text_features = vectorizer.transform(df_chunk['catalog_content'])
        engineered_features = csr_matrix(df_chunk[['item_pack_quantity']].values)

        print(f"--> Downloading/Verifying {len(df_chunk)} images for chunk {i+1}...")
        failed_downloads = 0
        for sample_id, image_link in tqdm(zip(df_chunk['sample_id'], df_chunk['image_link']),
                                          total=df_chunk.shape[0]):
            image_path = os.path.join(IMAGE_DIR, f"{sample_id}.jpg")
            if os.path.exists(image_path):
                continue  # already downloaded on a previous run
            try:
                response = requests.get(image_link, timeout=20)
                if response.status_code == 200:
                    Image.open(BytesIO(response.content)).convert('RGB').save(image_path)
                else:
                    failed_downloads += 1
            except Exception:
                # Downloads stay best-effort: one bad link must not stop the run,
                # but the failure is counted instead of being silently ignored.
                failed_downloads += 1
        if failed_downloads:
            print(f"--> WARNING: {failed_downloads} image(s) could not be downloaded; "
                  "zero vectors will be used for them.")

        print(f"--> Extracting image features for chunk {i+1}...")
        image_features_list = [get_image_features(os.path.join(IMAGE_DIR, f"{sid}.jpg"), model) for sid in tqdm(df_chunk['sample_id'])]
        image_features_chunk = np.array(image_features_list)
        # Column order of the saved matrix: TF-IDF | pack quantity | image features.
        X_chunk = hstack([text_features, engineered_features, csr_matrix(image_features_chunk)])
        y_chunk = df_chunk['log_price']

        # The sparse matrix is stored as its raw components (data/indices/indptr/shape)
        # next to the labels, all in one compressed .npz file.
        chunk_filename = os.path.join(FEATURES_DIR, f"features_chunk_{i}.npz")
        np.savez_compressed(chunk_filename, data=X_chunk.data, indices=X_chunk.indices,
                 indptr=X_chunk.indptr, shape=X_chunk.shape, labels=y_chunk.values)
        print(f"--> SUCCESS: Saved chunk {i+1} to {chunk_filename}")

        # Release per-chunk objects before the next chunk is loaded.
        del df_chunk, text_features, engineered_features, image_features_list, image_features_chunk, X_chunk, y_chunk
        gc.collect()

    print("\n\n*** FEATURE EXTRACTION COMPLETE! ***")

except FileNotFoundError:
    print(f"CRITICAL ERROR: The file '{TRAIN_FILE}' was not found. Please double-check the path.")
except Exception as e:
    print(f"AN UNEXPECTED ERROR OCCURRED: {e}")

Step 0: Defining all necessary functions and loading models...

Step 1: Starting the chunking process...

--- Processing Chunk 1 ---
--> Fitting TF-IDF on the first chunk...
--> Downloading/Verifying 2500 images for chunk 1...


100%|██████████| 2500/2500 [05:21<00:00,  7.77it/s]


--> Extracting image features for chunk 1...


100%|██████████| 2500/2500 [11:54<00:00,  3.50it/s]


--> SUCCESS: Saved chunk 1 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_0.npz

--- Processing Chunk 2 ---
--> Downloading/Verifying 2500 images for chunk 2...


100%|██████████| 2500/2500 [05:17<00:00,  7.86it/s]


--> Extracting image features for chunk 2...


100%|██████████| 2500/2500 [11:58<00:00,  3.48it/s]


--> SUCCESS: Saved chunk 2 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_1.npz

--- Processing Chunk 3 ---
--> Downloading/Verifying 2500 images for chunk 3...


100%|██████████| 2500/2500 [05:05<00:00,  8.18it/s]


--> Extracting image features for chunk 3...


100%|██████████| 2500/2500 [12:28<00:00,  3.34it/s]


--> SUCCESS: Saved chunk 3 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_2.npz

--- Processing Chunk 4 ---
--> Downloading/Verifying 2500 images for chunk 4...


100%|██████████| 2500/2500 [04:56<00:00,  8.43it/s]


--> Extracting image features for chunk 4...


100%|██████████| 2500/2500 [12:01<00:00,  3.47it/s]


--> SUCCESS: Saved chunk 4 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_3.npz

--- Processing Chunk 5 ---
--> Downloading/Verifying 2500 images for chunk 5...


100%|██████████| 2500/2500 [05:03<00:00,  8.23it/s]


--> Extracting image features for chunk 5...


100%|██████████| 2500/2500 [12:27<00:00,  3.35it/s]


--> SUCCESS: Saved chunk 5 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_4.npz

--- Processing Chunk 6 ---
--> Downloading/Verifying 2500 images for chunk 6...


100%|██████████| 2500/2500 [05:04<00:00,  8.20it/s]


--> Extracting image features for chunk 6...


100%|██████████| 2500/2500 [12:07<00:00,  3.44it/s]


--> SUCCESS: Saved chunk 6 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_5.npz

--- Processing Chunk 7 ---
--> Downloading/Verifying 2500 images for chunk 7...


100%|██████████| 2500/2500 [05:04<00:00,  8.21it/s]


--> Extracting image features for chunk 7...


100%|██████████| 2500/2500 [12:00<00:00,  3.47it/s]


--> SUCCESS: Saved chunk 7 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_6.npz

--- Processing Chunk 8 ---
--> Downloading/Verifying 2500 images for chunk 8...


100%|██████████| 2500/2500 [05:00<00:00,  8.33it/s]


--> Extracting image features for chunk 8...


100%|██████████| 2500/2500 [11:54<00:00,  3.50it/s]


--> SUCCESS: Saved chunk 8 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_7.npz

--- Processing Chunk 9 ---
--> Downloading/Verifying 2500 images for chunk 9...


100%|██████████| 2500/2500 [05:09<00:00,  8.07it/s]


--> Extracting image features for chunk 9...


100%|██████████| 2500/2500 [12:05<00:00,  3.45it/s]


--> SUCCESS: Saved chunk 9 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_8.npz

--- Processing Chunk 10 ---
--> Downloading/Verifying 2500 images for chunk 10...


100%|██████████| 2500/2500 [05:01<00:00,  8.28it/s]


--> Extracting image features for chunk 10...


100%|██████████| 2500/2500 [12:03<00:00,  3.45it/s]


--> SUCCESS: Saved chunk 10 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_9.npz

--- Processing Chunk 11 ---
--> Downloading/Verifying 2500 images for chunk 11...


100%|██████████| 2500/2500 [05:13<00:00,  7.97it/s]


--> Extracting image features for chunk 11...


100%|██████████| 2500/2500 [12:01<00:00,  3.47it/s]


--> SUCCESS: Saved chunk 11 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_10.npz

--- Processing Chunk 12 ---
--> Downloading/Verifying 2500 images for chunk 12...


100%|██████████| 2500/2500 [05:08<00:00,  8.10it/s]


--> Extracting image features for chunk 12...


100%|██████████| 2500/2500 [12:01<00:00,  3.46it/s]


--> SUCCESS: Saved chunk 12 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_11.npz

--- Processing Chunk 13 ---
--> Downloading/Verifying 2500 images for chunk 13...


100%|██████████| 2500/2500 [05:38<00:00,  7.39it/s]


--> Extracting image features for chunk 13...


100%|██████████| 2500/2500 [12:26<00:00,  3.35it/s]


--> SUCCESS: Saved chunk 13 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_12.npz

--- Processing Chunk 14 ---
--> Downloading/Verifying 2500 images for chunk 14...


100%|██████████| 2500/2500 [05:26<00:00,  7.65it/s]


--> Extracting image features for chunk 14...


100%|██████████| 2500/2500 [12:03<00:00,  3.46it/s]


--> SUCCESS: Saved chunk 14 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_13.npz

--- Processing Chunk 15 ---
--> Downloading/Verifying 2500 images for chunk 15...


100%|██████████| 2500/2500 [05:15<00:00,  7.94it/s]


--> Extracting image features for chunk 15...


100%|██████████| 2500/2500 [11:54<00:00,  3.50it/s]


--> SUCCESS: Saved chunk 15 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_14.npz

--- Processing Chunk 16 ---
--> Downloading/Verifying 2500 images for chunk 16...


100%|██████████| 2500/2500 [04:57<00:00,  8.39it/s]


--> Extracting image features for chunk 16...


100%|██████████| 2500/2500 [12:00<00:00,  3.47it/s]


--> SUCCESS: Saved chunk 16 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_15.npz

--- Processing Chunk 17 ---
--> Downloading/Verifying 2500 images for chunk 17...


100%|██████████| 2500/2500 [04:49<00:00,  8.62it/s]


--> Extracting image features for chunk 17...


100%|██████████| 2500/2500 [12:08<00:00,  3.43it/s]


--> SUCCESS: Saved chunk 17 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_16.npz

--- Processing Chunk 18 ---
--> Downloading/Verifying 2500 images for chunk 18...


100%|██████████| 2500/2500 [05:15<00:00,  7.93it/s]


--> Extracting image features for chunk 18...


100%|██████████| 2500/2500 [12:11<00:00,  3.42it/s]


--> SUCCESS: Saved chunk 18 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_17.npz

--- Processing Chunk 19 ---
--> Downloading/Verifying 2500 images for chunk 19...


100%|██████████| 2500/2500 [05:17<00:00,  7.88it/s]


--> Extracting image features for chunk 19...


100%|██████████| 2500/2500 [11:59<00:00,  3.48it/s]


--> SUCCESS: Saved chunk 19 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_18.npz

--- Processing Chunk 20 ---
--> Downloading/Verifying 2500 images for chunk 20...


100%|██████████| 2500/2500 [05:15<00:00,  7.93it/s]


--> Extracting image features for chunk 20...


100%|██████████| 2500/2500 [12:03<00:00,  3.46it/s]


--> SUCCESS: Saved chunk 20 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_19.npz

--- Processing Chunk 21 ---
--> Downloading/Verifying 2500 images for chunk 21...


100%|██████████| 2500/2500 [05:10<00:00,  8.05it/s]


--> Extracting image features for chunk 21...


100%|██████████| 2500/2500 [12:08<00:00,  3.43it/s]


--> SUCCESS: Saved chunk 21 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_20.npz

--- Processing Chunk 22 ---
--> Downloading/Verifying 2500 images for chunk 22...


100%|██████████| 2500/2500 [05:05<00:00,  8.18it/s]


--> Extracting image features for chunk 22...


100%|██████████| 2500/2500 [12:37<00:00,  3.30it/s]


--> SUCCESS: Saved chunk 22 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_21.npz

--- Processing Chunk 23 ---
--> Downloading/Verifying 2500 images for chunk 23...


100%|██████████| 2500/2500 [05:14<00:00,  7.94it/s]


--> Extracting image features for chunk 23...


100%|██████████| 2500/2500 [11:36<00:00,  3.59it/s]


--> SUCCESS: Saved chunk 23 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_22.npz

--- Processing Chunk 24 ---
--> Downloading/Verifying 2500 images for chunk 24...


100%|██████████| 2500/2500 [04:52<00:00,  8.53it/s]


--> Extracting image features for chunk 24...


100%|██████████| 2500/2500 [11:41<00:00,  3.56it/s]


--> SUCCESS: Saved chunk 24 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_23.npz

--- Processing Chunk 25 ---
--> Downloading/Verifying 2500 images for chunk 25...


100%|██████████| 2500/2500 [05:04<00:00,  8.21it/s]


--> Extracting image features for chunk 25...


100%|██████████| 2500/2500 [11:43<00:00,  3.55it/s]


--> SUCCESS: Saved chunk 25 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_24.npz

--- Processing Chunk 26 ---
--> Downloading/Verifying 2500 images for chunk 26...


100%|██████████| 2500/2500 [05:09<00:00,  8.08it/s]


--> Extracting image features for chunk 26...


100%|██████████| 2500/2500 [11:40<00:00,  3.57it/s]


--> SUCCESS: Saved chunk 26 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_25.npz

--- Processing Chunk 27 ---
--> Downloading/Verifying 2500 images for chunk 27...


100%|██████████| 2500/2500 [05:24<00:00,  7.71it/s]


--> Extracting image features for chunk 27...


100%|██████████| 2500/2500 [11:41<00:00,  3.56it/s]


--> SUCCESS: Saved chunk 27 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_26.npz

--- Processing Chunk 28 ---
--> Downloading/Verifying 2500 images for chunk 28...


100%|██████████| 2500/2500 [05:11<00:00,  8.04it/s]


--> Extracting image features for chunk 28...


100%|██████████| 2500/2500 [12:02<00:00,  3.46it/s]


--> SUCCESS: Saved chunk 28 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_27.npz

--- Processing Chunk 29 ---
--> Downloading/Verifying 2500 images for chunk 29...


100%|██████████| 2500/2500 [05:33<00:00,  7.49it/s]


--> Extracting image features for chunk 29...


100%|██████████| 2500/2500 [11:50<00:00,  3.52it/s]


--> SUCCESS: Saved chunk 29 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_28.npz

--- Processing Chunk 30 ---
--> Downloading/Verifying 2500 images for chunk 30...


100%|██████████| 2500/2500 [05:20<00:00,  7.81it/s]


--> Extracting image features for chunk 30...


100%|██████████| 2500/2500 [12:16<00:00,  3.39it/s]


--> SUCCESS: Saved chunk 30 to /content/drive/MyDrive/ML_Competition/processed_features/features_chunk_29.npz


*** FEATURE EXTRACTION COMPLETE! ***
