In [None]:
# FINAL SPRINT SCRIPT (CORRECTED v2): Run this entire block now.

import pandas as pd
import numpy as np
import os
import re
import gc
from scipy.sparse import hstack, csr_matrix, vstack  # <<< THE MISSING vstack IMPORT IS ADDED HERE
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
import joblib
import glob

# --- STEP 0: Mount Drive and Define Functions ---
# Colab-only step: mounts Google Drive at /content/drive so that the
# '/content/drive/MyDrive/...' paths used further down resolve.
# When the drive is already mounted, drive.mount only reports that fact
# (see the first line of the captured output).
from google.colab import drive
drive.mount('/content/drive')

def extract_pack_quantity(text):
    """Extract the number that follows the first ``Value:`` tag in ``text``.

    Args:
        text: Catalog text to scan. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as having no quantity.

    Returns:
        float: The number captured after ``Value:``. Falls back to ``1.0``
        when ``text`` is not a string, when no ``Value:`` tag is present, or
        when the captured token is not a valid number.
    """
    if not isinstance(text, str):
        return 1.0
    match = re.search(r'Value:\s*([\d.]+)', text)
    if match is None:
        return 1.0
    try:
        return float(match.group(1))
    except ValueError:
        # The character class [\d.]+ also accepts tokens such as "." or
        # "1.2.3" that float() rejects. Use the same default as "no match"
        # instead of letting one malformed row abort a whole DataFrame.apply.
        return 1.0

# --- STEP 1: Load Your FULL Training Data and Add New Features ---
print("--> Step 1: Loading all pre-processed TEXT features...")
FEATURES_DIR = '/content/drive/MyDrive/ML_Competition/processed_features'
TRAIN_FILE = '/content/drive/MyDrive/ML DATASET/student_resource/dataset/train.csv'
# Number of trailing image-feature columns stored in every chunk; they are
# sliced off so that only the text-derived columns remain.
NUM_IMAGE_FEATURES = 2048


def _natural_sort_key(path):
    """Sort key that orders digit runs in a file name numerically.

    Plain lexicographic sorting puts 'chunk_10.npz' before 'chunk_2.npz'.
    Splitting on a capturing ``(\\d+)`` group always yields alternating
    text / digit tokens, so odd positions are converted to ``int``.
    """
    tokens = re.split(r'(\d+)', os.path.basename(path))
    return [int(tok) if pos % 2 else tok for pos, tok in enumerate(tokens)]


all_X_text_only = []
all_y = []
# The chunk rows are later paired positionally with the rows of train.csv,
# so the chunks must be concatenated in the order they were written.
# NOTE(review): presumably the chunks were written in ascending numeric
# order following train.csv row order -- confirm against the script that
# produced them.
feature_files = sorted(
    glob.glob(os.path.join(FEATURES_DIR, "*.npz")),
    key=_natural_sort_key,
)
if not feature_files:
    raise FileNotFoundError(f"No .npz feature chunks found in {FEATURES_DIR}")

for filename in tqdm(feature_files):
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
    # which can execute arbitrary code. Only load chunk files you created.
    with np.load(filename, allow_pickle=True) as loaded:
        X_chunk = csr_matrix((loaded['data'], loaded['indices'], loaded['indptr']), shape=loaded['shape'])
        # THIS IS THE KEY: We slice the matrix to remove the last 2048 image feature columns
        num_text_features = X_chunk.shape[1] - NUM_IMAGE_FEATURES
        all_X_text_only.append(X_chunk[:, :num_text_features])
        all_y.append(loaded['labels'])

# --- Combine all text-only chunks ---
X_text_full = vstack(all_X_text_only)
y_text_full = np.concatenate(all_y)

# --- Add NEW, FAST features ---
print("\n--> Adding new text-based features...")
df_train_full = pd.read_csv(TRAIN_FILE)
if X_text_full.shape[0] != len(df_train_full):
    # Fail with a clear message instead of an opaque hstack dimension error.
    raise ValueError(
        f"Row mismatch: feature chunks have {X_text_full.shape[0]} rows "
        f"but {TRAIN_FILE} has {len(df_train_full)} rows"
    )
# Treat missing text as empty so the train features are computed exactly like
# the test features (which fill missing catalog_content with '' first).
df_train_full['catalog_content'] = df_train_full['catalog_content'].fillna('')
df_train_full['text_length'] = df_train_full['catalog_content'].str.len().fillna(0)
df_train_full['punct_count'] = df_train_full['catalog_content'].str.count(r'[!?.,\'\"]').fillna(0)
# Combine old and new features
new_features = csr_matrix(df_train_full[['text_length', 'punct_count']].values)
# Ask for CSR directly: hstack returns COO by default, which is not row-sliceable.
X_text_enhanced = hstack([X_text_full, new_features], format='csr')
print(f"--> Enhanced feature matrix shape: {X_text_enhanced.shape}")

# --- STEP 2: Run a HYPER-FAST Tune ---
print("\n--> Step 2: Running a very fast hyperparameter search...")

# Sampling distributions for the three tuned LightGBM hyperparameters.
param_dist = dict(
    n_estimators=randint(500, 1500),
    learning_rate=uniform(0.01, 0.04),
    num_leaves=randint(31, 80),
)

# Base estimator; the search clones it for every candidate/fold.
lgbm = lgb.LGBMRegressor(n_jobs=-1, random_state=42)

# 5 sampled candidates x 3 folds = 15 cross-validation fits. With the default
# refit setting the best candidate is then fitted once more on all rows.
# The captured log shows roughly 11-19 minutes per CV fit on this data.
random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    verbose=2,
)
random_search.fit(X_text_enhanced, y_text_full)

print(f"\n--> Best parameters found: {random_search.best_params_}")

# --- STEP 3: Train Final Model and Generate Submission ---
print("\n--> Step 3: Training final model and creating submission...")
# RandomizedSearchCV refits the best candidate on the full training data when
# refit is enabled (its default). That estimator is a clone of the base
# LGBMRegressor(random_state=42, n_jobs=-1) with best_params_ applied, i.e.
# the same model this step used to train a second time. Reuse it and only
# retrain when the search was run without refit.
if getattr(random_search, 'refit', False) and hasattr(random_search, 'best_estimator_'):
    final_text_model = random_search.best_estimator_
else:
    final_text_model = lgb.LGBMRegressor(**random_search.best_params_, random_state=42, n_jobs=-1)
    final_text_model.fit(X_text_enhanced, y_text_full)

# --- Process Test Data (Fast, Text-Only) ---
TEST_FILE = '/content/drive/MyDrive/ML DATASET/student_resource/dataset/test.csv'
df_test = pd.read_csv(TEST_FILE)
df_test['catalog_content'] = df_test['catalog_content'].fillna('')
# Re-fit vectorizer
# NOTE(review): the training TF-IDF columns come from pre-computed .npz
# chunks; this re-fit only lines up with them if the chunks were built with
# these exact vectorizer settings on this same training text -- confirm.
vectorizer = TfidfVectorizer(max_features=20000, stop_words='english', ngram_range=(1, 2))
# fillna('') because TfidfVectorizer rejects NaN documents.
vectorizer.fit(df_train_full['catalog_content'].fillna(''))
# Transform test text
test_text_features = vectorizer.transform(df_test['catalog_content'])
# Add new features to test data
df_test['text_length'] = df_test['catalog_content'].str.len().fillna(0)
df_test['punct_count'] = df_test['catalog_content'].str.count(r'[!?.,\'\"]')
test_new_features = csr_matrix(df_test[['text_length', 'punct_count']].values)
# Get IPQ
df_test['item_pack_quantity'] = df_test['catalog_content'].apply(extract_pack_quantity)
test_engineered_features = csr_matrix(df_test[['item_pack_quantity']].values)
# Combine all test features
# Column order here is [TF-IDF | item_pack_quantity | text_length, punct_count].
# NOTE(review): assumes the text part of the training chunks is laid out as
# TF-IDF followed by item_pack_quantity -- TODO confirm.
X_test_final = hstack(
    [test_text_features, test_engineered_features, test_new_features],
    format='csr',
)
if X_test_final.shape[1] != X_text_enhanced.shape[1]:
    # Catch a train/test column mismatch before calling predict.
    raise ValueError(
        f"Test matrix has {X_test_final.shape[1]} columns but the model was "
        f"trained on {X_text_enhanced.shape[1]}"
    )

# --- Predict and Save ---
# The model output is passed through expm1 to obtain prices.
model_output = final_text_model.predict(X_test_final)
final_predictions = np.expm1(model_output)
# Any negative price is replaced with 0.01.
negative_price_mask = final_predictions < 0
final_predictions[negative_price_mask] = 0.01

SUBMISSION_PATH = '/content/drive/MyDrive/ML_Competition/submission_FINAL_SPRINT.csv'
submission_df = df_test[['sample_id']].assign(price=final_predictions)
submission_df.to_csv(SUBMISSION_PATH, index=False)

# Final status report, followed by a preview of the first rows.
for _status_line in (
    "\n\n--- FINAL SUBMISSION IS READY! ---",
    f"File saved to: {SUBMISSION_PATH}",
    "Submit this file now. This is your best shot.",
    "\nPreview:",
):
    print(_status_line)
print(submission_df.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
--> Step 1: Loading all pre-processed TEXT features...


100%|██████████| 30/30 [00:19<00:00,  1.57it/s]



--> Adding new text-based features...
--> Enhanced feature matrix shape: (75000, 20003)

--> Step 2: Running a very fast hyperparameter search...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 18.826840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 873395
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 14985
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.0249816047538945, n_estimators=1360, num_leaves=45; total time=18.5min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 16.687739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 879654
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15356
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.0249816047538945, n_estimators=1360, num_leaves=45; total time=18.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 16.585733 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 881587
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15334
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.0249816047538945, n_estimators=1360, num_leaves=45; total time=18.8min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 15.991452 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 873395
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 14985
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.03927975767245621, n_estimators=1200, num_leaves=51; total time=17.2min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 16.008143 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 879654
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15356
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.03927975767245621, n_estimators=1200, num_leaves=51; total time=17.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 16.020382 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 881587
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15334
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.03927975767245621, n_estimators=1200, num_leaves=51; total time=18.1min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 16.458927 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 873395
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 14985
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.01624074561769746, n_estimators=966, num_leaves=53; total time=16.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 16.116205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 879654
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15356
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.01624074561769746, n_estimators=966, num_leaves=53; total time=16.8min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 16.595977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 881587
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15334
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.01624074561769746, n_estimators=966, num_leaves=53; total time=16.9min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 15.687070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 873395
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 14985
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.012323344486727979, n_estimators=587, num_leaves=66; total time=12.5min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 17.540897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 879654
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15356
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.012323344486727979, n_estimators=587, num_leaves=66; total time=12.6min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 18.441141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 881587
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15334
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.012323344486727979, n_estimators=587, num_leaves=66; total time=12.7min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 15.463268 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 873395
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 14985
[LightGBM] [Info] Start training from score 2.735708




[CV] END learning_rate=0.015714672716877633, n_estimators=630, num_leaves=52; total time=11.3min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 16.874845 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 879654
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15356
[LightGBM] [Info] Start training from score 2.739072




[CV] END learning_rate=0.015714672716877633, n_estimators=630, num_leaves=52; total time=11.4min
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 17.030473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 881587
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 15334
[LightGBM] [Info] Start training from score 2.742872




[CV] END learning_rate=0.015714672716877633, n_estimators=630, num_leaves=52; total time=11.3min
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 31.035246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1158608
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 16678
[LightGBM] [Info] Start training from score 2.739217

--> Best parameters found: {'learning_rate': np.float64(0.03927975767245621), 'n_estimators': 1200, 'num_leaves': 51}

--> Step 3: Training final model and creating submission...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 29.482708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1158608
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 16678
[LightGBM] [In





--- FINAL SUBMISSION IS READY! ---
File saved to: /content/drive/MyDrive/ML_Competition/submission_FINAL_SPRINT.csv
Submit this file now. This is your best shot.

Preview:
   sample_id      price
0     100179  14.113335
1     245611  14.733978
2     146263  13.011010
3      95658  10.297788
4      36806  10.607842
