In [1]:
# Download the file from Google Drive
!gdown --id 1w0De0D9R8aQ55ifKg_V1CmW_CVuuVATr

Downloading...
From (original): https://drive.google.com/uc?id=1w0De0D9R8aQ55ifKg_V1CmW_CVuuVATr
From (redirected): https://drive.google.com/uc?id=1w0De0D9R8aQ55ifKg_V1CmW_CVuuVATr&confirm=t&uuid=5c9b9d3b-068e-4e76-9d5a-e13501cda66b
To: /content/workspace.zip
100% 195M/195M [00:02<00:00, 71.4MB/s]


In [2]:
!unzip /content/workspace.zip

Archive:  /content/workspace.zip
   creating: workspace/
  inflating: workspace/users.parquet  
  inflating: workspace/X_all.npy     
  inflating: workspace/X_emb.npy     


In [5]:
# ======================================================================
# 00_make_train_test_split.ipynb
# Author: Debjit
#
# Goal:
#   Create a single, shared train/test split for all models.
#   This script:
#     - Loads X_all.npy and users.parquet
#     - Creates train/test indices with a fixed random_state
#     - Saves indices and optional split files into /content/workspace
#
# After running this, all teammates should reuse train_idx.npy/test_idx.npy.
# ======================================================================

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# --- CONFIG: adjust if your path is slightly different ----------------
# Root folder holding the inputs; use Path("workspace") instead when running
# from the project root.
WORKSPACE_DIR = Path("/content/workspace")

# Input files read by this cell.
X_ALL_PATH = WORKSPACE_DIR.joinpath("X_all.npy")
USERS_PATH = WORKSPACE_DIR.joinpath("users.parquet")

print("Loading data from:")
print("  ", X_ALL_PATH)
print("  ", USERS_PATH)

# Feature matrix, expected shape (n_users, n_features).
X_all = np.load(X_ALL_PATH)
# User table (original note: columns are user_id, ...).
users = pd.read_parquet(USERS_PATH)

print("Shapes:")
print("  X_all :", X_all.shape)
print("  users :", users.shape)

# The split below works on row positions, so both inputs must have the
# same number of rows.
n_users = len(X_all)
assert users.shape[0] == n_users, "X_all and users row count must match!"

# --- Create the shared split -------------------------------------------
# IMPORTANT: both constants are fixed so the split is reproducible for everyone.
RANDOM_STATE = 42
TRAIN_SIZE = 0.8   # 80% train, 20% test (change if you want)

# Split row positions rather than the data itself, so the same indices can be
# applied to X_all and to the users table.
all_idx = np.arange(n_users)
train_idx, test_idx = train_test_split(
    all_idx,
    shuffle=True,
    random_state=RANDOM_STATE,
    train_size=TRAIN_SIZE,
)

print(f"\nTrain size: {len(train_idx)}")
print(f"Test size : {len(test_idx)}")

# --- Save indices so everyone can reuse them ---------------------------
# These files are shared with the whole team, so verify the split is a true
# partition of the row positions (no overlap, no missing or duplicated row)
# before persisting it.
assert np.array_equal(
    np.sort(np.concatenate([train_idx, test_idx])),
    np.arange(n_users),
), "train/test indices must be disjoint and cover every row exactly once"

TRAIN_IDX_PATH = WORKSPACE_DIR / "train_idx.npy"
TEST_IDX_PATH  = WORKSPACE_DIR / "test_idx.npy"

np.save(TRAIN_IDX_PATH, train_idx)
np.save(TEST_IDX_PATH, test_idx)

# Plain string: the original used an f-string with no placeholders.
print("\nSaved index files:")
print("  ", TRAIN_IDX_PATH)
print("  ", TEST_IDX_PATH)

# --- (Optional) also save split X_all and users ------------------------
# Destinations for the materialised train/test pieces.
X_ALL_TRAIN_PATH = WORKSPACE_DIR.joinpath("X_all_train.npy")
X_ALL_TEST_PATH = WORKSPACE_DIR.joinpath("X_all_test.npy")
USERS_TRAIN_PATH = WORKSPACE_DIR.joinpath("users_train.parquet")
USERS_TEST_PATH = WORKSPACE_DIR.joinpath("users_test.parquet")

# Select the rows of each split; the user tables get a fresh 0..n-1 index.
X_all_train, X_all_test = X_all[train_idx], X_all[test_idx]
users_train = (
    users
    .iloc[train_idx]
    .reset_index(drop=True)
)
users_test = (
    users
    .iloc[test_idx]
    .reset_index(drop=True)
)

# Write everything only after all four pieces have been built.
np.save(X_ALL_TRAIN_PATH, X_all_train)
np.save(X_ALL_TEST_PATH, X_all_test)
users_train.to_parquet(USERS_TRAIN_PATH, index=False)
users_test.to_parquet(USERS_TEST_PATH, index=False)

print("\nAlso saved split data:")
print("  ", X_ALL_TRAIN_PATH)
print("  ", X_ALL_TEST_PATH)
print("  ", USERS_TRAIN_PATH)
print("  ", USERS_TEST_PATH)

print("\n Global train/test split created. Share these files with your team.")


Loading data from:
   /content/workspace/X_all.npy
   /content/workspace/users.parquet
Shapes:
  X_all : (206207, 132)
  users : (206207, 1)

Train size: 164965
Test size : 41242

Saved index files:
   /content/workspace/train_idx.npy
   /content/workspace/test_idx.npy

Also saved split data:
   /content/workspace/X_all_train.npy
   /content/workspace/X_all_test.npy
   /content/workspace/users_train.parquet
   /content/workspace/users_test.parquet

 Global train/test split created. Share these files with your team.


In [4]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Reload the raw inputs together with the saved train/test indices and rebuild
# the split from them.
WORKSPACE_DIR = Path("/content/workspace")

X_all = np.load(WORKSPACE_DIR / "X_all.npy")
users = pd.read_parquet(WORKSPACE_DIR / "users.parquet")

train_idx = np.load(WORKSPACE_DIR / "train_idx.npy")
test_idx  = np.load(WORKSPACE_DIR / "test_idx.npy")

# Guard against stale or mismatched files: the index files are only meaningful
# for the exact X_all/users they were generated from. Without these checks an
# index file from a different data version could silently yield a wrong or
# overlapping split.
assert len(users) == len(X_all), "X_all and users row count must match!"
assert np.array_equal(
    np.sort(np.concatenate([train_idx, test_idx])),
    np.arange(len(X_all)),
), "train_idx/test_idx must be disjoint and cover every row of X_all exactly once"

X_train = X_all[train_idx]
X_test  = X_all[test_idx]

# optional: users_train/users_test if needed
users_train = users.iloc[train_idx].reset_index(drop=True)
users_test  = users.iloc[test_idx].reset_index(drop=True)

print(X_train.shape, X_test.shape, users_train.shape, users_test.shape)


(164965, 132) (41242, 132) (164965, 1) (41242, 1)


In [6]:
! zip -r workspace_updated.zip workspace

  adding: workspace/ (stored 0%)
  adding: workspace/train_idx.npy (deflated 62%)
  adding: workspace/X_emb.npy (deflated 9%)
  adding: workspace/users_train.parquet (deflated 19%)
  adding: workspace/users.parquet (deflated 75%)
  adding: workspace/X_all.npy (deflated 10%)
  adding: workspace/test_idx.npy (deflated 62%)
  adding: workspace/users_test.parquet (deflated 19%)
  adding: workspace/X_all_train.npy (deflated 7%)
  adding: workspace/X_all_test.npy (deflated 7%)
