In [1]:
# Block 1: Mount Drive and Load All Processed Features

from google.colab import drive
from scipy.sparse import vstack, load_npz, csr_matrix
import numpy as np
import glob
import os
from tqdm import tqdm

# --- Mount Google Drive ---
print("--> Mounting Google Drive...")
drive.mount('/content/drive')
print("--> Google Drive mounted successfully!")


# --- Load All Saved Feature Chunks from Google Drive ---
# Every *.npz file in FEATURES_DIR stores one CSR matrix as its raw pieces
# ('data', 'indices', 'indptr', 'shape') plus a 'labels' array for its rows.
FEATURES_DIR = '/content/drive/MyDrive/ML_Competition/processed_features'
feature_files = sorted(glob.glob(os.path.join(FEATURES_DIR, "*.npz")))
all_X, all_y = [], []

if feature_files:
    print(f"\n--> Loading {len(feature_files)} saved feature chunks from Google Drive...")
    for chunk_path in tqdm(feature_files):
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code; only load archives you created yourself.
        with np.load(chunk_path, allow_pickle=True) as archive:
            csr_parts = (archive['data'], archive['indices'], archive['indptr'])
            all_X.append(csr_matrix(csr_parts, shape=archive['shape']))
            all_y.append(archive['labels'])

    # --- Stack the chunks row-wise into one dataset ---
    X_final_features = vstack(all_X)
    y_final = np.concatenate(all_y)

    print("\n--> All features loaded and combined successfully!")
    print(f"--> Final feature matrix shape: {X_final_features.shape}")
    print(f"--> Final labels array shape: {y_final.shape}")
else:
    # Nothing is defined in this case; the training cell checks for
    # X_final_features before it runs.
    print("CRITICAL ERROR: No feature files found. Check the FEATURES_DIR path.")

--> Mounting Google Drive...
Mounted at /content/drive
--> Google Drive mounted successfully!

--> Loading 30 saved feature chunks from Google Drive...


100%|██████████| 30/30 [00:43<00:00,  1.44s/it]



--> All features loaded and combined successfully!
--> Final feature matrix shape: (75000, 22049)
--> Final labels array shape: (75000,)


In [2]:
# Block 2: Train Your Final Model with Early Stopping

import lightgbm as lgb
from sklearn.model_selection import train_test_split
import joblib

if 'X_final_features' in locals():
    # --- Single train/validation split ---
    # 20% of the rows are held out; they are used only as the evaluation set
    # that drives early stopping.
    X_train, X_val, y_train, y_val = train_test_split(
        X_final_features, y_final, test_size=0.2, random_state=42
    )

    # --- Fixed LightGBM hyper-parameters (no tuning is done here) ---
    params = dict(
        objective='regression_l1',  # L1 (MAE) loss
        metric='rmse',
        n_estimators=2000,          # upper bound on the number of trees
        learning_rate=0.02,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        lambda_l1=0.1,
        lambda_l2=0.1,
        num_leaves=40,
        verbose=-1,
        n_jobs=-1,
        seed=42,
        boosting_type='gbdt',
    )

    print("\n--> Training a single, powerful model with early stopping...")
    # Training halts once the validation score has not improved for 100 rounds.
    stop_when_stalled = lgb.early_stopping(stopping_rounds=100, verbose=True)
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[stop_when_stalled],
    )

    # --- Persist the fitted model to Drive right away ---
    MODEL_SAVE_PATH = '/content/drive/MyDrive/ML_Competition/final_model_emergency.joblib'
    joblib.dump(model, MODEL_SAVE_PATH)

    print("\n\n*** YOUR COMPETITION MODEL IS TRAINED AND SAVED! ***")
    print(f"--> Final model saved to: {MODEL_SAVE_PATH}")
else:
    print("Please run Block 1 first to load the data.")


--> Training a single, powerful model with early stopping...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[2000]	valid_0's rmse: 0.67971


*** YOUR COMPETITION MODEL IS TRAINED AND SAVED! ***
--> Final model saved to: /content/drive/MyDrive/ML_Competition/final_model_emergency.joblib


In [4]:
# Block 3 (Revised): The Final Submission Pipeline

import pandas as pd
import numpy as np
import os
import re
import gc
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from io import BytesIO
import requests
from tqdm import tqdm
import joblib

# --- STEP 0: RE-DEFINE ALL NECESSARY FUNCTIONS AND OBJECTS ---
# The helper functions, the TF-IDF vectorizer and the image model are all
# rebuilt inside this cell instead of being reused from earlier cells, so the
# cell does not depend on objects left over from a previous session.

print("--> Step 0: Defining all necessary functions and loading models...")

def extract_pack_quantity(text):
    """Return the number that follows the first ``Value:`` tag in ``text``.

    Args:
        text: Catalog text to search. Anything that is not a ``str``
            (e.g. ``None`` or a NaN float from pandas) is treated as missing.

    Returns:
        float: The parsed number, or ``1.0`` when ``text`` is not a string,
        contains no ``Value:`` tag, or the token after the tag is not a
        valid number.
    """
    if not isinstance(text, str):
        return 1.0
    match = re.search(r'Value:\s*([\d.]+)', text)
    if match is None:
        return 1.0
    try:
        return float(match.group(1))
    except ValueError:
        # The character class also matches tokens such as "." or "1.2.3" that
        # float() rejects; use the default instead of aborting the whole apply().
        return 1.0

def get_image_features(image_path, model):
    """Run one image file through ``model`` and return a flat feature vector.

    Relies on the module-level ``preprocess`` transform being defined before
    the first call.

    Args:
        image_path: Path of the image file to read.
        model: Callable applied to a single-image batch tensor; its output is
            squeezed and converted to a NumPy array.

    Returns:
        numpy.ndarray: The squeezed model output. If anything fails (missing
        or unreadable file, preprocessing or model error) a zero vector of
        length 2048 is returned instead, so one bad image never stops a run.
    """
    try:
        # Close the file handle explicitly; convert() returns an independent
        # in-memory copy, so the RGB image stays usable after the file closes.
        with Image.open(image_path) as img:
            img_t = preprocess(img.convert('RGB'))
        batch_t = torch.unsqueeze(img_t, 0)  # add the batch dimension
        with torch.no_grad():
            features = model(batch_t)
        return features.squeeze().numpy()
    except Exception:
        # float32 keeps the fallback from upcasting the stacked feature matrix
        # to float64 when it is mixed with real feature vectors.
        # NOTE(review): assumes the model output is float32 — confirm.
        return np.zeros(2048, dtype=np.float32)

# --- Re-create and fit the Vectorizer ---
# The TF-IDF vocabulary must match the one the saved model was trained with,
# so the vectorizer is re-fitted on the first 2500 rows of the training file.
# NOTE(review): this only reproduces the training vocabulary if the original
# feature pipeline fitted on exactly these rows with these settings — confirm.
print("--> Fitting the TF-IDF vectorizer on a sample of the training data...")
TRAIN_FILE = '/content/drive/MyDrive/ML DATASET/student_resource/dataset/train.csv'
vectorizer = TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1, 2))
temp_df = pd.read_csv(TRAIN_FILE, nrows=2500) # Using the same chunk size
temp_df['catalog_content'] = temp_df['catalog_content'].fillna('')
vectorizer.fit(temp_df['catalog_content'])
del temp_df
print("--> Vectorizer is ready.")


# --- Re-load the ResNet model ---
# The last child module of the ResNet-50 is dropped, so calling `model`
# returns the output of the layer before it instead of the final layer's output.
print("--> Loading the pre-trained ResNet-50 model...")
model = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
model = torch.nn.Sequential(*(list(model.children())[:-1]))
model.eval()
preprocess = transforms.Compose([
    transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


# --- STEP 1: LOAD YOUR TRAINED MODEL ---
print("\n--> Step 1: Loading your saved final model from Google Drive...")
MODEL_PATH = '/content/drive/MyDrive/ML_Competition/final_model_emergency.joblib'
final_model = joblib.load(MODEL_PATH)
print("--> Model loaded successfully!")


# --- STEP 2: PROCESS THE TEST DATA ---
# !!! UPDATE THIS PATH IF YOUR TEST FILE IS IN A DIFFERENT LOCATION !!!
TEST_FILE = '/content/drive/MyDrive/ML DATASET/student_resource/dataset/test.csv'
TEST_IMAGE_DIR = '/content/drive/MyDrive/ML_Competition/full_test_images'
os.makedirs(TEST_IMAGE_DIR, exist_ok=True)

df_test = pd.read_csv(TEST_FILE)
print(f"\n--> Step 2: Processing {len(df_test)} rows from the test set...")

# A) Text and Engineered Features
# The pack quantity is extracted before fillna(''); extract_pack_quantity
# handles non-string (missing) entries itself.
df_test['item_pack_quantity'] = df_test['catalog_content'].apply(extract_pack_quantity)
df_test['catalog_content'] = df_test['catalog_content'].fillna('')
test_text_features = vectorizer.transform(df_test['catalog_content'])
test_engineered_features = csr_matrix(df_test[['item_pack_quantity']].values)

# Fail fast: check the column count against the trained model BEFORE the
# multi-hour image download/extraction instead of discovering a mismatch at
# predict time.
IMAGE_FEATURE_DIM = 2048  # length of the vector get_image_features falls back to
expected_feature_count = (
    test_text_features.shape[1] + test_engineered_features.shape[1] + IMAGE_FEATURE_DIM
)
trained_feature_count = getattr(final_model, 'n_features_in_', None)
if trained_feature_count is not None and trained_feature_count != expected_feature_count:
    raise ValueError(
        f"Feature count mismatch: the saved model expects {trained_feature_count} columns "
        f"but this pipeline will produce {expected_feature_count}."
    )

# B) Image Features
print(f"--> Downloading/Verifying {len(df_test)} TEST images...")
failed_downloads = 0
for sample_id, image_link in tqdm(zip(df_test['sample_id'], df_test['image_link']),
                                  total=df_test.shape[0]):
    image_path = os.path.join(TEST_IMAGE_DIR, f"{sample_id}.jpg")
    if os.path.exists(image_path):
        continue  # already downloaded by an earlier run
    try:
        response = requests.get(image_link, timeout=20)
        if response.status_code != 200:
            failed_downloads += 1
            continue
        Image.open(BytesIO(response.content)).convert('RGB').save(image_path)
    except Exception:
        # Best effort: one bad link must not stop the run, but count it so the
        # failure is visible, and remove any partially written file so a later
        # run does not mistake it for a finished download.
        failed_downloads += 1
        if os.path.exists(image_path):
            os.remove(image_path)

if failed_downloads:
    print(f"--> WARNING: {failed_downloads} image(s) could not be downloaded; "
          "their image features will be all zeros.")

print(f"--> Extracting features from TEST images...")
test_image_features_list = [get_image_features(os.path.join(TEST_IMAGE_DIR, f"{sid}.jpg"), model) for sid in tqdm(df_test['sample_id'])]
test_image_features = np.array(test_image_features_list)

# C) Combine all test features
# Column order: text, engineered, image.
# NOTE(review): must match the column order used to build the training
# features — confirm against the feature-generation notebook.
X_test_final = hstack([test_text_features, test_engineered_features, csr_matrix(test_image_features)])
print(f"--> Test data processed. Final feature shape: {X_test_final.shape}")


# --- STEP 3: MAKE FINAL PREDICTIONS ---
print("\n--> Step 3: Making final predictions...")
final_log_predictions = final_model.predict(X_test_final)
# expm1 undoes a log1p transform.
# NOTE(review): assumes the training labels were log1p(price) — confirm.
final_predictions = np.expm1(final_log_predictions)
# Floor at 0.01 so every price is strictly positive; a `< 0` test alone would
# leave exact zeros (and smaller positive values) below the replacement value.
final_predictions = np.maximum(final_predictions, 0.01)


# --- STEP 4: CREATE AND SAVE THE SUBMISSION FILE ---
print("\n--> Step 4: Creating the submission file...")
submission_df = pd.DataFrame({'sample_id': df_test['sample_id'], 'price': final_predictions})
SUBMISSION_PATH = '/content/drive/MyDrive/ML_Competition/submission_final.csv'
submission_df.to_csv(SUBMISSION_PATH, index=False)


print("\n\n--- ALL DONE! YOUR SUBMISSION IS READY! ---")
print(f"Submission file saved to your Google Drive at: {SUBMISSION_PATH}")
print("You can now download this file and submit it to the competition.")
print("\nSubmission Preview:")
print(submission_df.head())

--> Step 0: Defining all necessary functions and loading models...
--> Fitting the TF-IDF vectorizer on a sample of the training data...
--> Vectorizer is ready.
--> Loading the pre-trained ResNet-50 model...
Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth


100%|██████████| 97.8M/97.8M [00:01<00:00, 99.6MB/s]



--> Step 1: Loading your saved final model from Google Drive...
--> Model loaded successfully!

--> Step 2: Processing 75000 rows from the test set...
--> Downloading/Verifying 75000 TEST images...


100%|██████████| 75000/75000 [3:07:36<00:00,  6.66it/s]


--> Extracting features from TEST images...


  3%|▎         | 2262/75000 [17:13<9:13:48,  2.19it/s]


KeyboardInterrupt: 

In [5]:
# Block 1 (Emergency): Load Features and Train a TEXT-ONLY Model

from google.colab import drive
from scipy.sparse import vstack, load_npz, csr_matrix
import numpy as np
import glob
import os
from tqdm import tqdm
import lightgbm as lgb
import joblib

# --- Mount Google Drive ---
print("--> Mounting Google Drive...")
drive.mount('/content/drive')

# --- Load Feature Chunks and Keep ONLY Text Features ---
FEATURES_DIR = '/content/drive/MyDrive/ML_Competition/processed_features'
# Number of trailing image-feature columns that are cut off from every chunk.
IMAGE_FEATURE_DIM = 2048
all_X_text_only = []
all_y = []
feature_files = sorted(glob.glob(os.path.join(FEATURES_DIR, "*.npz")))

# Stop with a clear message instead of failing later inside vstack() on an
# empty list.
if not feature_files:
    raise FileNotFoundError(
        f"No .npz feature files found in {FEATURES_DIR}. Check the FEATURES_DIR path."
    )

print(f"\n--> Loading {len(feature_files)} chunks and extracting TEXT features only...")
for filename in tqdm(feature_files):
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code; only load archives you created yourself.
    with np.load(filename, allow_pickle=True) as loaded:
        X_chunk = csr_matrix((loaded['data'], loaded['indices'], loaded['indptr']), shape=loaded['shape'])
        # THIS IS THE KEY: slice off the last IMAGE_FEATURE_DIM columns, which
        # hold the image features; everything before them is kept.
        num_text_features = X_chunk.shape[1] - IMAGE_FEATURE_DIM
        if num_text_features <= 0:
            # A zero or negative stop index would silently select the wrong columns.
            raise ValueError(
                f"{filename} has only {X_chunk.shape[1]} columns; expected more than "
                f"{IMAGE_FEATURE_DIM}."
            )
        all_X_text_only.append(X_chunk[:, :num_text_features])
        all_y.append(loaded['labels'])

# --- Combine all text-only chunks ---
X_text_final = vstack(all_X_text_only)
y_text_final = np.concatenate(all_y)
print("\n--> All TEXT features loaded and combined successfully!")
print(f"--> Final text-only feature matrix shape: {X_text_final.shape}")

# --- Train a fast, new model on this data ---
params = {
    'objective': 'regression_l1', 'metric': 'rmse', 'n_estimators': 2000,
    'learning_rate': 0.02, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
    'bagging_freq': 1, 'lambda_l1': 0.1, 'lambda_l2': 0.1, 'num_leaves': 40,
    'verbose': -1, 'n_jobs': -1, 'seed': 42, 'boosting_type': 'gbdt',
}

print("\n--> Training a new TEXT-ONLY model...")
text_only_model = lgb.LGBMRegressor(**params)
# No validation split or early stopping here: all rows are used for fitting
# and all n_estimators trees are built.
text_only_model.fit(X_text_final, y_text_final)

# --- Save this new model ---
MODEL_SAVE_PATH = '/content/drive/MyDrive/ML_Competition/text_only_model_final.joblib'
joblib.dump(text_only_model, MODEL_SAVE_PATH)

print("\n\n*** TEXT-ONLY MODEL IS TRAINED AND SAVED! ***")
print(f"--> Model saved to: {MODEL_SAVE_PATH}")

--> Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

--> Loading 30 chunks and extracting TEXT features only...


100%|██████████| 30/30 [00:47<00:00,  1.57s/it]



--> All TEXT features loaded and combined successfully!
--> Final text-only feature matrix shape: (75000, 20001)

--> Training a new TEXT-ONLY model...


*** TEXT-ONLY MODEL IS TRAINED AND SAVED! ***
--> Model saved to: /content/drive/MyDrive/ML_Competition/text_only_model_final.joblib


In [6]:
# Block 2 (Emergency): Generate TEXT-ONLY Submission

import pandas as pd
import numpy as np
import os
import re
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import joblib

# --- Redefine functions and objects needed for text processing ---
def extract_pack_quantity(text):
    """Return the number that follows the first ``Value:`` tag in ``text``.

    Args:
        text: Catalog text to search. Anything that is not a ``str``
            (e.g. ``None`` or a NaN float from pandas) is treated as missing.

    Returns:
        float: The parsed number, or ``1.0`` when ``text`` is not a string,
        contains no ``Value:`` tag, or the token after the tag is not a
        valid number.
    """
    if not isinstance(text, str):
        return 1.0
    match = re.search(r'Value:\s*([\d.]+)', text)
    if match is None:
        return 1.0
    try:
        return float(match.group(1))
    except ValueError:
        # The character class also matches tokens such as "." or "1.2.3" that
        # float() rejects; use the default instead of aborting the whole apply().
        return 1.0

# Re-fit the TF-IDF vectorizer on the first 2500 training rows.
# NOTE(review): this only reproduces the training vocabulary if the original
# feature pipeline fitted on exactly these rows with these settings — confirm.
TRAIN_FILE = '/content/drive/MyDrive/ML DATASET/student_resource/dataset/train.csv'
vectorizer = TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1, 2))
temp_df = pd.read_csv(TRAIN_FILE, nrows=2500)
temp_df['catalog_content'] = temp_df['catalog_content'].fillna('')
vectorizer.fit(temp_df['catalog_content'])
del temp_df

# --- Load your TEXT-ONLY trained model ---
print("--> Loading your saved text-only model...")
MODEL_PATH = '/content/drive/MyDrive/ML_Competition/text_only_model_final.joblib'
text_only_model = joblib.load(MODEL_PATH)

# --- Process Test Data (TEXT ONLY) ---
TEST_FILE = '/content/drive/MyDrive/ML DATASET/student_resource/dataset/test.csv'
df_test = pd.read_csv(TEST_FILE)
print(f"\n--> Applying TEXT-ONLY feature engineering to {len(df_test)} test samples...")
# The pack quantity is extracted before fillna(''); extract_pack_quantity
# handles non-string (missing) entries itself.
df_test['item_pack_quantity'] = df_test['catalog_content'].apply(extract_pack_quantity)
df_test['catalog_content'] = df_test['catalog_content'].fillna('')
test_text_features = vectorizer.transform(df_test['catalog_content'])
test_engineered_features = csr_matrix(df_test[['item_pack_quantity']].values)

# Column order: TF-IDF columns first, then the single pack-quantity column.
X_test_text_final = hstack([test_text_features, test_engineered_features])

# --- Make Final Predictions ---
print("\n--> Making final predictions with text-only model...")
final_log_predictions = text_only_model.predict(X_test_text_final)
# expm1 undoes a log1p transform.
# NOTE(review): assumes the training labels were log1p(price) — confirm.
final_predictions = np.expm1(final_log_predictions)
# Floor at 0.01 so every price is strictly positive; a `< 0` test alone would
# leave exact zeros (and smaller positive values) below the replacement value.
final_predictions = np.maximum(final_predictions, 0.01)

# --- Create and Save the Submission File ---
submission_df = pd.DataFrame({'sample_id': df_test['sample_id'], 'price': final_predictions})
SUBMISSION_PATH = '/content/drive/MyDrive/ML_Competition/submission_EMERGENCY_TEXT_ONLY.csv'
submission_df.to_csv(SUBMISSION_PATH, index=False)

print("\n\n--- SUBMISSION READY! ---")
print(f"Submission file saved to: {SUBMISSION_PATH}")
print("Download this from your Drive and submit it NOW.")
print("\nSubmission Preview:")
print(submission_df.head())

--> Loading your saved text-only model...

--> Applying TEXT-ONLY feature engineering to 75000 test samples...

--> Making final predictions with text-only model...






--- SUBMISSION READY! ---
Submission file saved to: /content/drive/MyDrive/ML_Competition/submission_EMERGENCY_TEXT_ONLY.csv
Download this from your Drive and submit it NOW.

Submission Preview:
   sample_id      price
0     100179  13.698351
1     245611  15.389302
2     146263  17.813196
3      95658  10.845611
4      36806  27.073519
