In [1]:
# --- STAGE 6: THE V4 COMEBACK MODEL (PCA + MULTIMODAL) ---
# =========================================================
# This is the final assembly script. It uses the pre-computed feature files.

# --- Step 1: Connect to Drive and Set Paths ---
from google.colab import drive
import os
# Mount Google Drive so the project folder is reachable from the Colab filesystem.
drive.mount('/content/drive')

# Every relative path used later in this cell is resolved against this folder.
project_path = '/content/drive/MyDrive/Amazon_ML_Challenge'
os.chdir(project_path)

# Report the directory the process actually ended up in.
current_dir = os.getcwd()
print(f"Working directory set to: {current_dir}")
print("-" * 50)


# --- Step 2: Import All Necessary Libraries ---
from sklearn.decomposition import PCA
from scipy.sparse import hstack
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import re
# Status line followed by a 50-dash separator, emitted with a single print call.
print("All libraries for V4 model are ready.", "-" * 50, sep="\n")


# --- Step 3: Define Helper Functions ---
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error for log1p-scaled values.

    Both inputs are mapped back to the original scale with ``np.expm1``
    before the error is computed, so they are expected to be on the
    ``log1p`` scale (as the targets in this notebook are).

    Args:
        y_true: Array-like of log1p-transformed ground-truth values.
        y_pred: Array-like of log1p-transformed predictions, broadcastable
            against ``y_true``.

    Returns:
        The mean of ``|pred - true| / ((|true| + |pred|) / 2)`` as a fraction
        (not multiplied by 100). Pairs where both values are zero contribute
        an error of 0.
    """
    # Work on plain float arrays so the arithmetic is strictly positional,
    # even when a pandas Series with a shuffled index is passed in.
    y_true_unlogged = np.expm1(np.asarray(y_true, dtype=float))
    y_pred_unlogged = np.expm1(np.asarray(y_pred, dtype=float))
    numerator = np.abs(y_pred_unlogged - y_true_unlogged)
    denominator = (np.abs(y_true_unlogged) + np.abs(y_pred_unlogged)) / 2
    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 they were initialised with, so no 0/0 is ever evaluated.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio)

def feature_engineer(df):
    """Add hand-crafted text features derived from ``catalog_content``.

    Two columns are written to ``df``:

    * ``text_length`` - number of characters in ``catalog_content``
      (0 when the entry is missing).
    * ``ipq`` - the first integer captured by the quantity patterns below
      (e.g. "pack of 6" -> 6), or 1 when nothing matches or the entry is
      not a string.

    Args:
        df: DataFrame with a ``catalog_content`` column.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Patterns are tried in this order; the first one that matches wins.
    ipq_patterns = [
        re.compile(pattern)
        for pattern in (
            r'pack of (\d+)',
            r'(\d+)\s*count',
            r'quantity\s*[:]*\s*(\d+)',
            r'(\d+)\s*pack',
            r'set of (\d+)',
            r'(\d+)\s*pk',
        )
    ]

    def extract_ipq(text):
        """Return the quantity found in ``text``, defaulting to 1."""
        if not isinstance(text, str):
            return 1
        lowered = text.lower()
        for pattern in ipq_patterns:
            match = pattern.search(lowered)
            if match:
                return int(match.group(1))
        return 1

    # .str.len() yields NaN for missing entries; use 0 instead so that
    # text_length stays numeric and agrees with extract_ipq, which already
    # tolerates non-string input.
    df['text_length'] = df['catalog_content'].str.len().fillna(0).astype(int)
    df['ipq'] = df['catalog_content'].apply(extract_ipq)
    return df
# Status line followed by a 50-dash separator, emitted with a single print call.
print("Helper functions are defined.", "-" * 50, sep="\n")


# --- Step 4: Load All Data and Features from Drive ---
try:
    print("Loading all data sources...")
    train_df = pd.read_csv('dataset/train.csv')
    test_df = pd.read_csv('dataset/test.csv')
    # Missing descriptions become empty strings so the text steps never see NaN.
    train_df['catalog_content'] = train_df['catalog_content'].fillna('')
    test_df['catalog_content'] = test_df['catalog_content'].fillna('')

    # Load our "golden files"
    train_image_features = np.load('train_image_features.npy')
    test_image_features = np.load('test_image_features.npy')
except FileNotFoundError as e:
    print(f"ERROR: Could not find feature files. Please ensure they are in your Drive. Error: {e}")
    # Every later step needs these objects. Stop here instead of continuing
    # and failing further down on names that were never defined.
    raise

# The image features are stacked column-wise next to the text features later
# on, so each feature array needs exactly one row per CSV row.
if len(train_df) != len(train_image_features) or len(test_df) != len(test_image_features):
    raise ValueError(
        "Row count mismatch between CSV data and image features: "
        f"train {len(train_df)} vs {len(train_image_features)}, "
        f"test {len(test_df)} vs {len(test_image_features)}"
    )
print("Successfully loaded all text data and both train/test image features.")
print("-" * 50)


# --- Step 5: Apply PCA to Image Features ---
print("Step 5: Applying PCA to clean and compress image features...")
# We choose 128 components. This is a hyperparameter we could tune later.
N_COMPONENTS = 128
pca = PCA(n_components=N_COMPONENTS, random_state=42)

# Fit PCA on the training images only, then apply the same projection to the
# test images so both sets share one component basis.
# NOTE(review): the recorded run emitted a warning from PCA's
# explained_variance_ratio_ computation - possibly a numeric issue in the
# stored features (e.g. dtype or constant columns); worth confirming.
train_image_features_pca = pca.fit_transform(train_image_features)
test_image_features_pca = pca.transform(test_image_features)

# Report the real input width instead of assuming a fixed size.
print(f"PCA complete. Image features reduced from {train_image_features.shape[1]} to {N_COMPONENTS} dimensions.")
print("Shape of new train PCA features:", train_image_features_pca.shape)
print("Shape of new test PCA features:", test_image_features_pca.shape)
print("-" * 50)


# --- Step 6: Create Final Combined Datasets ---
print("Step 6: Creating final datasets for training and testing...")

# feature_engineer receives a copy of each frame; the names are then rebound
# to the frames that carry the extra columns.
train_df = feature_engineer(train_df.copy())
test_df = feature_engineer(test_df.copy())

# TF-IDF representation of the catalog text. The vocabulary is fitted on the
# training rows and reused unchanged for the test rows.
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
train_text_features = tfidf.fit_transform(train_df['catalog_content'])
test_text_features = tfidf.transform(test_df['catalog_content'])

# The two engineered numeric columns, as plain arrays.
numeric_columns = ['text_length', 'ipq']
train_numerical_features = train_df[numeric_columns].values
test_numerical_features = test_df[numeric_columns].values

# Column layout of both matrices: TF-IDF text, PCA image components, numeric columns.
train_blocks = [train_text_features, train_image_features_pca, train_numerical_features]
test_blocks = [test_text_features, test_image_features_pca, test_numerical_features]
X_train_final = hstack(train_blocks).tocsr()
X_test_final = hstack(test_blocks).tocsr()

# The regression target is the price on the log1p scale.
y_train_final = np.log1p(train_df['price'])
print("Final combined datasets created successfully.")
print("-" * 50)

# --- Step 7: Train and Validate the V4 Model ---
print("Step 7: Training and validating the V4 PCA model...")

# Hold out 20% of the training rows to score the model.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_final,
    y_train_final,
    test_size=0.2,
    random_state=42,
)

# Basic tuning: 500 trees with a 0.05 learning rate.
lgbm_params = {'random_state': 42, 'n_estimators': 500, 'learning_rate': 0.05}
lgbm_final = lgb.LGBMRegressor(**lgbm_params)
lgbm_final.fit(X_train_split, y_train_split)

# Both arguments are on the log1p scale; smape undoes the transform itself.
val_preds = lgbm_final.predict(X_val_split)
final_smape = smape(y_val_split, val_preds)

# The V2 and V3 figures are fixed reference values from earlier experiments.
report_lines = [
    "\n--- V4 MODEL VALIDATION COMPLETE ---",
    "Previous Best Score (V2 Text Only): 0.5579",
    "Score for V3 (Text + Raw Images): 0.5624",
    f"NEW V4 Score (Text + PCA Images): {final_smape:.4f}",
    "-" * 50,
]
print("\n".join(report_lines))

# --- Step 8: Generate Final Submission ---
print("\nStep 8: Retraining on full data and generating final submission file...")
# We retrain the model on ALL available training data for the best performance
lgbm_final.fit(X_train_final, y_train_final)

# The model predicts on the log1p scale; map back to prices and clamp any
# negative result to 0.
test_predictions_log = lgbm_final.predict(X_test_final)
test_predictions = np.clip(np.expm1(test_predictions_log), 0, None)

# exist_ok=True makes directory creation a single call that is safe to re-run.
os.makedirs('submissions', exist_ok=True)
submission_df = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': test_predictions})
submission_df.to_csv('submissions/pca_model_submission.csv', index=False)
print("SUCCESS! New submission file 'pca_model_submission.csv' is ready in your Drive!")


Mounted at /content/drive
Working directory set to: /content/drive/.shortcut-targets-by-id/1avIasCHqris4iK4ri6WUTjhVPbSpzGPR/Amazon_ML_Challenge
--------------------------------------------------
All libraries for V4 model are ready.
--------------------------------------------------
Helper functions are defined.
--------------------------------------------------
Loading all data sources...
Successfully loaded all text data and both train/test image features.
--------------------------------------------------
Step 5: Applying PCA to clean and compress image features...


  self.explained_variance_ratio_ = self.explained_variance_ / total_var


PCA complete. Image features reduced from 2048 to 128 dimensions.
Shape of new train PCA features: (75000, 128)
Shape of new test PCA features: (75000, 128)
--------------------------------------------------
Step 6: Creating final datasets for training and testing...
Final combined datasets created successfully.
--------------------------------------------------
Step 7: Training and validating the V4 PCA model...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 10.454779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 569322
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 8805
[LightGBM] [Info] Start training from score 2.740904





--- V4 MODEL VALIDATION COMPLETE ---
Previous Best Score (V2 Text Only): 0.5579
Score for V3 (Text + Raw Images): 0.5624
NEW V4 Score (Text + PCA Images): 0.5492
--------------------------------------------------

Step 8: Retraining on full data and generating final submission file...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 13.104758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 651340
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 9273
[LightGBM] [Info] Start training from score 2.739217




SUCCESS! New submission file 'pca_model_submission.csv' is ready in your Drive!
