In [None]:
# EDA: Smart Product Pricing Challenge
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Locate the dataset directory.
# When executed as a script, the root is the parent of this file's directory;
# in a notebook kernel (where __file__ is undefined) it is the working directory.
if '__file__' in globals():
    REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
else:
    REPO_ROOT = os.path.abspath(os.getcwd())

ENV_DATA_DIR = os.environ.get('DATA_DIR')
SR_DATA_DIR = os.path.join(REPO_ROOT, 'student_resource', 'dataset')
FALLBACK_DATA_DIR = os.path.join(REPO_ROOT, 'dataset')

# Priority: $DATA_DIR (only if it is an existing directory),
# then <root>/student_resource/dataset, otherwise <root>/dataset.
DATA_DIR = FALLBACK_DATA_DIR
if ENV_DATA_DIR and os.path.isdir(ENV_DATA_DIR):
    DATA_DIR = ENV_DATA_DIR
elif os.path.isdir(SR_DATA_DIR):
    DATA_DIR = SR_DATA_DIR

train_path = os.path.join(DATA_DIR, 'train.csv')
print('Using DATA_DIR =', DATA_DIR)

# Load the training set and preview the first rows.
df = pd.read_csv(train_path)
df.head()


In [None]:
# Dataset overview: dimensions, then per-column dtypes and non-null counts
print(df.shape)
df.info()

# Peek at the first three rows of the columns used in this notebook
df.loc[:, ['sample_id', 'catalog_content', 'image_link', 'price']].head(3)


In [None]:
# Fraction of missing values per column, most-missing columns first
missing = (
    df.isna()
    .mean()
    .sort_values(ascending=False)
)
(missing.head(10), missing.describe())


In [None]:
# Price distribution: raw scale (left) and natural-log scale (right).
fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Use the same NaN-free series for both panels so they describe the same rows.
price = df['price'].dropna()

sns.histplot(price, bins=100, ax=ax[0])
ax[0].set_title('Price distribution')
ax[0].set_xlabel('price')
ax[0].set_ylabel('count')

# Clip to 0.01 before taking the log so zero or negative prices do not
# produce -inf / NaN values.
sns.histplot(np.log(price.clip(lower=0.01)), bins=100, ax=ax[1])
ax[1].set_title('Log price distribution')
# np.log keeps the series name 'price', so label the axis explicitly to make
# clear this panel is on a log scale.
ax[1].set_xlabel('log(price)')
ax[1].set_ylabel('count')

plt.tight_layout()
plt.show()


In [None]:
# Simple IPQ extraction from text
import re

def extract_ipq(text: str) -> float:
    """Extract a pack/count quantity (IPQ, presumably "item pack quantity") from text.

    Patterns are tried in priority order and the first one that matches
    anywhere in ``text`` wins (matching is case-insensitive). Recognised
    forms include "Pack of 6", "6 pack", "(6 pcs)", "6 ct", "x6" / "x 6"
    and "6-count".

    Args:
        text: Free text to scan. Non-string values (for example NaN from a
            missing cell) are accepted and yield NaN.

    Returns:
        The matched quantity as a float, or ``np.nan`` when no pattern matches.
    """
    if not isinstance(text, str):
        return np.nan
    patterns = [
        r"pack of\s*(\d+)",
        r"(\d+)\s*pack",
        r"\b(\d+)\s*(?:pcs|pieces|counts?|ct)\b",
        # The multiplier "x" must not be the tail of a word; without the
        # look-behind, "Max 100" or "Box 12" would be read as quantities.
        # A preceding digit is still allowed, so "2x6" keeps matching.
        r"(?<![A-Za-z])x\s*(\d+)\b",
        r"\b(\d+)[- ]?count\b",
    ]
    for p in patterns:
        m = re.search(p, text, flags=re.IGNORECASE)
        if m:
            # The captured group is digits only, so float() cannot fail here.
            return float(m.group(1))
    return np.nan

# Preview the extracted quantity next to a 60-character snippet of each of the
# first ten catalog entries.
sample_texts = df['catalog_content'].head(10).tolist()
[
    # Missing entries are not strings (NaN), so only slice real text;
    # extract_ipq itself already tolerates non-string input.
    (t[:60] + '...' if isinstance(t, str) else t, extract_ipq(t))
    for t in sample_texts
]
