# 00 – Load Customer Vectors

Prepare shared data files for clustering (Phase 3).

## 1. Prerequisites
- Phase 2 has produced `outputs_phase2/customer_vectors.parquet`.
- That file has: `user_id`, embedding columns `v0`…`v127`, and behavioral columns (`prior_order_count`, `avg_basket_size`, `reorder_ratio`, `mean_days_between`).

In [None]:
# 1) install gdown (only first time)
!pip install -q gdown

# 2) download the zip from your public Drive link
FILE_ID = "1Zwcuk5k6AuPB1yly6IbSbZL2chYWxGVS"
OUTPUT_ZIP = "phase2_outputs.zip"

!gdown --id $FILE_ID -O $OUTPUT_ZIP

# 3) unzip into a project folder
!mkdir -p /content/project
!unzip -o $OUTPUT_ZIP -d /content/project


Downloading...
From (original): https://drive.google.com/uc?id=1Zwcuk5k6AuPB1yly6IbSbZL2chYWxGVS
From (redirected): https://drive.google.com/uc?id=1Zwcuk5k6AuPB1yly6IbSbZL2chYWxGVS&confirm=t&uuid=6b549f14-8bc6-4e3d-bd2c-fb85c1a05ef4
To: /content/phase2_outputs.zip
100% 1.32G/1.32G [00:21<00:00, 61.8MB/s]
Archive:  phase2_outputs.zip
  inflating: /content/project/outputs_phase2/holdout_next_basket_long.parquet  
  inflating: /content/project/outputs_phase2/plots/umap_customers_prior_order_count.png  
  inflating: /content/project/outputs_phase2/umap_customers_2d.csv  
  inflating: /content/project/outputs_phase2/holdout_next_basket_wide.csv  
  inflating: /content/project/outputs_phase2/holdout_next_basket.jsonl  
 extracting: /content/project/outputs_phase2/baskets_prior.jsonl  
  inflating: /content/project/outputs_phase2/product_vectors.parquet  
  inflating: /content/project/outputs_phase2/product2vec.model  
  inflating: /content/project/outputs_phase2/customer_vectors.parquet  
  

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np

# All notebook paths hang off the folder the Phase 2 zip was extracted into.
PROJECT_ROOT = Path('/content/project')

# Input: customer table written by Phase 2.
PHASE2_FILE = PROJECT_ROOT / 'outputs_phase2' / 'customer_vectors.parquet'

# Output: shared folder for the arrays handed to the clustering notebooks.
WORKSPACE_DIR = PROJECT_ROOT / 'workspace'
WORKSPACE_DIR.mkdir(parents=True, exist_ok=True)  # safe to re-run

# Echo both paths as the cell result.
PHASE2_FILE, WORKSPACE_DIR

(PosixPath('/content/project/outputs_phase2/customer_vectors.parquet'),
 PosixPath('/content/project/workspace'))

## 2. Load Phase 2 data

In [None]:
# Fail fast with a clear message when the Phase 2 artifact is absent,
# instead of letting the parquet reader raise a less specific error.
phase2_present = PHASE2_FILE.exists()
if not phase2_present:
    raise FileNotFoundError(f"Missing {PHASE2_FILE}, run Phase 2 first.")

# Read the customer table and show its shape plus the first rows.
cust = pd.read_parquet(PHASE2_FILE)
print('Loaded:', cust.shape)
cust.head()

Loaded: (206207, 133)


Unnamed: 0,user_id,v0,v1,v2,v3,v4,v5,v6,v7,v8,...,v122,v123,v124,v125,v126,v127,prior_order_count,avg_basket_size,reorder_ratio,mean_days_between
0,1,0.301228,0.036102,0.00407,-0.386831,-0.422758,0.191923,0.016456,0.241248,-0.457355,...,-0.128784,-0.259829,0.127665,0.435856,-0.122861,-0.123196,10,5.9,0.694915,19.555555
1,2,0.283765,-0.144311,-0.206828,0.020989,-0.116576,0.181473,-0.069774,0.125167,-0.016939,...,0.151149,-0.351373,0.070755,0.067892,0.054679,0.130672,14,13.928571,0.476923,15.230769
2,3,0.167002,-0.210173,-0.233505,0.082396,-0.093267,0.102814,-0.08075,0.090369,-0.130221,...,0.10084,-0.323671,-0.045346,0.012843,0.057618,0.124582,12,7.333333,0.625,12.090909
3,4,0.063215,-0.04633,-0.056892,-0.025792,-0.143455,0.207918,-0.132056,0.109053,-0.301736,...,0.04695,-0.206911,0.352095,0.142234,-0.008649,-0.109761,5,3.6,0.055556,13.75
4,5,0.219009,-0.211876,-0.248602,0.052808,-0.061111,0.151793,-0.018911,0.030394,-0.055601,...,-0.014088,-0.272834,0.115603,-0.00637,-0.028278,0.05001,4,9.25,0.378378,13.333333


## 3. Select embedding and behavioral columns

In [None]:
import re

# Embedding columns are named v0, v1, ...  Match the complete `v<digits>` name
# so an unrelated column that merely starts with "v" is never mistaken for an
# embedding dimension. Non-string labels are skipped rather than raising.
# The DataFrame's own column order is preserved.
_EMB_COL_PATTERN = re.compile(r'v[0-9]+')
emb_cols = [
    c for c in cust.columns
    if isinstance(c, str) and _EMB_COL_PATTERN.fullmatch(c)
]

# Behavioral features are optional: keep only those present in the table.
beh_default = ['prior_order_count', 'avg_basket_size', 'reorder_ratio', 'mean_days_between']
beh_cols = [c for c in beh_default if c in cust.columns]

print('Embedding cols:', len(emb_cols))
print('Behavioral cols:', beh_cols)

Embedding cols: 128
Behavioral cols: ['prior_order_count', 'avg_basket_size', 'reorder_ratio', 'mean_days_between']


## 4. Save numpy arrays to `workspace/`

In [None]:
# Embedding-only matrix, one row per customer, stored as float32.
X_emb = cust[emb_cols].to_numpy(dtype=np.float32)

# Embeddings followed by behavioral features; when no behavioral columns
# were found this is an independent copy of the embedding matrix.
X_all = (
    cust[emb_cols + beh_cols].to_numpy(dtype=np.float32)
    if beh_cols
    else X_emb.copy()
)

# Write both matrices into the shared workspace folder.
for npy_name, matrix in (('X_emb.npy', X_emb), ('X_all.npy', X_all)):
    np.save(WORKSPACE_DIR / npy_name, matrix)
print('saved X_emb.npy and X_all.npy')

saved X_emb.npy and X_all.npy


## 5. Save user_id mapping

In [None]:
# Keep user_id in the same row order as the saved arrays so that row i of
# X_emb / X_all can be mapped back to its customer.
users_path = WORKSPACE_DIR / 'users.parquet'
users = cust.loc[:, ['user_id']].copy()
users.to_parquet(users_path, index=False)
print('saved users.parquet')

saved users.parquet


In [None]:
import shutil

# Bundle the workspace folder into /content/workspace.zip. Entries are stored
# relative to /content, i.e. under project/workspace/ inside the archive.
shutil.make_archive(
    base_name="/content/workspace",
    format="zip",
    root_dir="/content",
    base_dir="project/workspace",
)

'/content/workspace.zip'

## 6. Summary
- Input: `outputs_phase2/customer_vectors.parquet`
- Output: `workspace/X_emb.npy`, `workspace/X_all.npy`, `workspace/users.parquet`
- Hand these files to teammates so they can run KMeans / Hierarchical / UMAP+HDBSCAN independently.