In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

In [None]:
import tensorflow as tf
# Restrict TensorFlow to the first GPU when one exists. Indexing the device
# list unconditionally raises IndexError on CPU-only machines, so guard it.
_gpu_devices = tf.config.list_physical_devices('GPU')
if _gpu_devices:
    try:
        tf.config.set_visible_devices(_gpu_devices[0], 'GPU')
    except RuntimeError as err:
        # Raised when the TensorFlow runtime is already initialized (e.g. this
        # cell is re-run in a live kernel); visible devices cannot be changed then.
        print(f"Could not change visible devices: {err}")
else:
    print("No GPU detected; TensorFlow will run on CPU.")
import gdown
import os

In [None]:
# Google Drive file IDs of the feature CSVs used by this notebook.
TRAIN_IMG_FILE_ID = '1yVXguacT7OFT9wtRSW2sP0wSPlrQpFqm'
TRAIN_TXT_FILE_ID = '1Ge21if-qDkyugYOPKSV5-4jN1AATwWw4'
TEST_IMG_FILE_ID = '1j3-BietmC0xB0mXjbGqHXU0nviVITmHr'
# NOTE(review): this is the same ID as TRAIN_TXT_FILE_ID, which looks like a
# copy/paste placeholder. It is intentionally NOT downloaded below; replace it
# with the real test text-feature file ID before enabling a test-side merge.
TEST_TXT_FILE_ID = '1Ge21if-qDkyugYOPKSV5-4jN1AATwWw4'


def download_from_drive(file_id, output_path):
    """Download one Google Drive file to ``output_path`` and return that path."""
    url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(url, output_path, quiet=False)
    return output_path


print("Downloading datasets from Google Drive...")

# Train dataset
train_img_output = download_from_drive(TRAIN_IMG_FILE_ID, 'train_img_dataset.csv')
train_txt_output = download_from_drive(TRAIN_TXT_FILE_ID, 'train_txt_dataset.csv')

df_train_img = pd.read_csv(train_img_output)
df_train_txt = pd.read_csv(train_txt_output)

# The two frames are joined positionally (axis=1), so they must describe the
# same rows in the same order; otherwise features would be silently mismatched.
if len(df_train_img) != len(df_train_txt):
    raise ValueError(
        f"Row count mismatch: image features have {len(df_train_img)} rows, "
        f"text features have {len(df_train_txt)} rows"
    )
if 'sample_id' in df_train_img.columns and 'sample_id' in df_train_txt.columns:
    if not np.array_equal(df_train_img['sample_id'].to_numpy(),
                          df_train_txt['sample_id'].to_numpy()):
        raise ValueError("sample_id order differs between image and text feature files")

# Merge horizontally and keep the first occurrence of any duplicated column.
# The concatenation is built once and reused for the duplicate mask.
combined_train = pd.concat([df_train_img, df_train_txt], axis=1)
final_train = combined_train.loc[:, ~combined_train.columns.duplicated()]

print(f"Merged shape: {final_train.shape}")
final_train.to_csv('final_train_dataset.csv', index=False)

# Test dataset: only the image-feature CSV is downloaded. No test text-feature
# file is fetched or merged here (see the note on TEST_TXT_FILE_ID above).
test_img_output = download_from_drive(TEST_IMG_FILE_ID, 'test_img_dataset.csv')

print("Download completed!")



In [None]:
# Read the merged training table and the test table back from disk.
train_csv_path = "final_train_dataset.csv"
test_csv_path = "test_img_dataset.csv"

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

# Report the size of each table; inspect `.columns.tolist()` on either frame
# when the column layout needs to be verified.
for split_label, frame in (("Train", df_train), ("Test", df_test)):
    print(f"{split_label} shape: {frame.shape}")

In [None]:
# Text preprocessing function
def preprocess_text(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Missing values are filled as follows: ``item_name``, ``bullet_points`` and
    ``brand_name`` with "No Data", ``unit`` with "Ounce", and ``value`` with
    the median of ``value`` in the frame being processed.

    ``item_name`` and ``bullet_points`` are then normalized: carriage returns
    and newlines become spaces, runs of four spaces become one space, double
    quotes are dropped, text is lower-cased, the characters ``?:!.,;`` are
    removed, and finally the substring ``'s`` is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``item_name``, ``bullet_points``,
        ``brand_name``, ``unit`` and ``value``.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    df = df.copy()

    # Fill missing values
    df['item_name'] = df['item_name'].fillna("No Data")
    df['bullet_points'] = df['bullet_points'].fillna("No Data")
    df['brand_name'] = df['brand_name'].fillna("No Data")
    df['unit'] = df['unit'].fillna("Ounce")
    df['value'] = df['value'].fillna(df['value'].median())

    # Clean text
    punctuation_signs = list("?:!.,;")
    # Applied in this order, before lower-casing.
    literal_replacements = (("\r", " "), ("\n", " "), ("    ", " "), ('"', ''))

    for col in ['item_name', 'bullet_points']:
        cleaned = df[col]
        # regex=False keeps every pattern a literal string. Without it the
        # handling of characters such as "?" and "." depends on the pandas
        # version's default for `regex`.
        for old, new in literal_replacements:
            cleaned = cleaned.str.replace(old, new, regex=False)
        cleaned = cleaned.str.lower()

        for punct_sign in punctuation_signs:
            cleaned = cleaned.str.replace(punct_sign, '', regex=False)
        df[col] = cleaned.str.replace("'s", "", regex=False)

    return df

# Run the train and test tables through the same cleaning routine.
cleaned_frames = [preprocess_text(frame) for frame in (df_train, df_test)]
df_train, df_test = cleaned_frames

print("Text preprocessing completed!")

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 10% of the labelled
# rows as a validation set (fixed seed so the split is repeatable).
y_temp = df_train['price']
X_temp = df_train.drop(columns=['price'])

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1, random_state=42
)

split_sizes = (
    ("Training", X_train.shape[0]),
    ("Validation", X_val.shape[0]),
    ("Test", df_test.shape[0]),
)
for split_name, n_rows in split_sizes:
    print(f"{split_name} set: {n_rows} samples")

In [None]:
# Pick out the embedding columns by name prefix: columns starting with 'dim_'
# are used as image embeddings, columns starting with 'embedding_' as text
# embeddings.
img_embedding_cols = list(filter(lambda name: name.startswith('dim_'), X_train.columns))
txt_embedding_cols = list(filter(lambda name: name.startswith('embedding_'), X_train.columns))

for embedding_cols in (img_embedding_cols, txt_embedding_cols):
    print(f"Found {len(embedding_cols)} embedding columns")
print("First 5 img embedding columns:", img_embedding_cols[:5])
print("First 5 text embedding columns:", txt_embedding_cols[:5])

# Pull the embedding blocks out as plain arrays for the train and validation
# splits. Test-split embeddings are not extracted in this cell.
X_train_img_embeddings = X_train.loc[:, img_embedding_cols].to_numpy()
X_val_img_embeddings = X_val.loc[:, img_embedding_cols].to_numpy()
X_train_txt_embeddings = X_train.loc[:, txt_embedding_cols].to_numpy()
X_val_txt_embeddings = X_val.loc[:, txt_embedding_cols].to_numpy()

embedding_blocks = (
    ("Train img", X_train_img_embeddings),
    ("Val img", X_val_img_embeddings),
    ("Train txt", X_train_txt_embeddings),
    ("Val txt", X_val_txt_embeddings),
)
for block_label, block in embedding_blocks:
    print(f"{block_label} embeddings shape: {block.shape}")

In [None]:
# Build one text document per row: item name followed by the bullet points.
X_train_text, X_val_text, X_test_text = [
    frame['item_name'] + ' ' + frame['bullet_points']
    for frame in (X_train, X_val, df_test)
]

# TF-IDF over unigrams and bigrams; the vocabulary is learned from the
# training split only and then applied to the validation split.
tfidf_params = dict(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_features=2000,
)
tfidf = TfidfVectorizer(**tfidf_params)
print("Fitting TF-IDF...")

X_train_tfidf = tfidf.fit_transform(X_train_text)
X_val_tfidf = tfidf.transform(X_val_text)
# The test documents (X_test_text) are not vectorized in this cell.

for split_label, matrix in (("train", X_train_tfidf), ("val", X_val_tfidf)):
    print(f"TF-IDF {split_label} shape: {matrix.shape}")

In [None]:
# Dimensionality reduction: one TruncatedSVD per feature family (TF-IDF,
# image embeddings, text embeddings). Each reducer is fitted on the training
# split and then applied to the validation split.
svd_tfidf = TruncatedSVD(n_components=1000, random_state=42)
svd_img_embed = TruncatedSVD(n_components=500, random_state=42)
svd_txt_embed = TruncatedSVD(n_components=300, random_state=42)
print("Applying dimensionality reduction...")

# TF-IDF
X_train_tfidf_reduced = svd_tfidf.fit_transform(X_train_tfidf)
X_val_tfidf_reduced = svd_tfidf.transform(X_val_tfidf)

# Image embeddings
X_train_img_embeddings_reduced = svd_img_embed.fit_transform(X_train_img_embeddings)
X_val_img_embeddings_reduced = svd_img_embed.transform(X_val_img_embeddings)

# Text embeddings
X_train_txt_embeddings_reduced = svd_txt_embed.fit_transform(X_train_txt_embeddings)
X_val_txt_embeddings_reduced = svd_txt_embed.transform(X_val_txt_embeddings)

# Shapes after reduction, family by family (train first, then validation).
reduced_families = (
    ("TF-IDF", X_train_tfidf_reduced, X_val_tfidf_reduced),
    ("Img Embeddings", X_train_img_embeddings_reduced, X_val_img_embeddings_reduced),
    ("Txt Embeddings", X_train_txt_embeddings_reduced, X_val_txt_embeddings_reduced),
)
for family_label, train_matrix, val_matrix in reduced_families:
    print(f"Reduced {family_label} train shape: {train_matrix.shape}")
    print(f"Reduced {family_label} val shape: {val_matrix.shape}")

# Total explained variance, printed in the order TF-IDF, image, text.
for reducer in (svd_tfidf, svd_img_embed, svd_txt_embed):
    print(f"Explained variance ratio: {reducer.explained_variance_ratio_.sum():.4f}")

In [None]:
# The only raw numeric feature is 'value'; the double brackets keep it as a
# 2-D column so it can be stacked next to the other feature blocks.
X_train_numerical = X_train[['value']].values
X_val_numerical = X_val[['value']].values

print("Combining all features...")
# Column order of the final matrix:
# reduced TF-IDF | value | reduced image embeddings | reduced text embeddings
train_feature_blocks = [
    X_train_tfidf_reduced,
    X_train_numerical,
    X_train_img_embeddings_reduced,
    X_train_txt_embeddings_reduced,
]
val_feature_blocks = [
    X_val_tfidf_reduced,
    X_val_numerical,
    X_val_img_embeddings_reduced,
    X_val_txt_embeddings_reduced,
]

X_train_combined = np.concatenate(train_feature_blocks, axis=1)
X_val_combined = np.concatenate(val_feature_blocks, axis=1)
# Test features are not assembled in this cell.

for split_label, matrix in (("train", X_train_combined), ("val", X_val_combined)):
    print(f"Combined {split_label} shape: {matrix.shape}")

In [None]:
# Train a LightGBM regressor on the combined features and score it on the
# validation split.
from xgboost import XGBRegressor
from sklearn.linear_model import BayesianRidge
from lightgbm import LGBMRegressor
# XGBRegressor, BayesianRidge and LinearRegression are alternative estimators
# that are imported but not used below.
print("Training LightGBM model...")
model = LGBMRegressor(n_estimators=500, random_state=42)
model.fit(X_train_combined, y_train)

print("Model training completed!")

# Make predictions on validation set
y_val_pred = model.predict(X_val_combined)

# Calculate metrics
mse = mean_squared_error(y_val, y_val_pred)
mae = mean_absolute_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)

print(f"Validation Metrics:")
print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")

In [None]:
# Create validation results dataframe (one row per validation sample)
val_results = pd.DataFrame({
    'sample_id': X_val['sample_id'],
    'actual_price': y_val,
    'predicted_price': y_val_pred
})

# Save validation results relative to the working directory, like every other
# output of this notebook. A hardcoded '/kaggle/working/' prefix fails on any
# machine where that directory does not exist.
val_results_path = 'validation_predictions.csv'
val_results.to_csv(val_results_path, index=False)
print(f"Validation predictions saved to '{val_results_path}'")
print(f"Validation results shape: {val_results.shape}")
print(val_results.head())

In [None]:
# Save the model and every fitted preprocessing object needed to rebuild the
# feature matrix at inference time: the TF-IDF vectorizer and all three SVD
# reducers (TF-IDF, image embeddings, text embeddings).
# The model file name is kept as-is so existing consumers of that path still
# find it.
artifacts = {
    'linear_regression_combined_model.pkl': model,
    'tfidf_vectorizer.pkl': tfidf,
    'svd_tfidf_reducer.pkl': svd_tfidf,
    'svd_img_embed_reducer.pkl': svd_img_embed,
    'svd_txt_embed_reducer.pkl': svd_txt_embed,
}
for file_name, fitted_object in artifacts.items():
    joblib.dump(fitted_object, file_name)

print("Model and preprocessing objects saved!")
print("Files saved:")
# Print exactly the files written above so the log cannot drift from reality.
for file_name in artifacts:
    print(f"- {file_name}")

In [None]:
# Make predictions on test set
print("Making predictions on test set...")

# The test feature matrix is built here, using the transformers that were
# fitted on the training split, because no earlier cell constructs it.
required_test_cols = (
    ['sample_id', 'item_name', 'bullet_points', 'value']
    + list(img_embedding_cols)
    + list(txt_embedding_cols)
)
missing_test_cols = [col for col in required_test_cols if col not in df_test.columns]
if missing_test_cols:
    # Fail with an actionable message instead of a bare KeyError deep inside pandas.
    raise KeyError(
        f"df_test is missing {len(missing_test_cols)} column(s) required for "
        f"prediction (first few: {missing_test_cols[:5]}). The test table must "
        f"contain the same image and text embedding columns as the training table."
    )

X_test_text = df_test['item_name'] + ' ' + df_test['bullet_points']
X_test_tfidf_reduced = svd_tfidf.transform(tfidf.transform(X_test_text))
X_test_numerical = df_test[['value']].values
X_test_img_embeddings_reduced = svd_img_embed.transform(df_test[img_embedding_cols].values)
X_test_txt_embeddings_reduced = svd_txt_embed.transform(df_test[txt_embedding_cols].values)

# Column order must match the training matrix:
# reduced TF-IDF | value | reduced image embeddings | reduced text embeddings
X_test_combined = np.hstack([
    X_test_tfidf_reduced,
    X_test_numerical,
    X_test_img_embeddings_reduced,
    X_test_txt_embeddings_reduced
])
print(f"Combined test shape: {X_test_combined.shape}")

y_test_pred = model.predict(X_test_combined)

# Create submission file
submission_df = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'pred_price': y_test_pred
})

# Save submission file
submission_df.to_csv('submission_final.csv', index=False)
print("Test predictions saved to 'submission_final.csv'")
print(f"Submission file shape: {submission_df.shape}")
print(submission_df.head())

In [None]:
# Final recap of dataset sizes, feature width, validation scores and outputs.
print("=" * 50)
print("RESULTS SUMMARY")
print("=" * 50)
print(f"Training samples: {X_train.shape[0]}")
print(f"Validation samples: {X_val.shape[0]}")
print(f"Test samples: {df_test.shape[0]}")
print(f"Final feature dimensions: {X_train_combined.shape[1]}")
print(f"Validation R² Score: {r2:.4f}")
print(f"Validation MAE: {mae:.4f}")
print("\nFiles created:")
print("- validation_predictions.csv (actual vs predicted for validation set)")
print("- submission_final.csv (test predictions for submission)")
print("- linear_regression_combined_model.pkl")
print("- tfidf_vectorizer.pkl")
# The TF-IDF reducer is written as 'svd_tfidf_reducer.pkl'; no cell writes a
# file called 'svd_reducer.pkl'.
print("- svd_tfidf_reducer.pkl")