In [1]:
# ----------------------------------------------
# ✅ Step 1: Upload manually downloaded CSVs to S3
# ----------------------------------------------

import os
import sys

# s3_uploader lives in ./scripts and is imported by its bare module name,
# so that folder must be on sys.path before the import below runs.
scripts_path = os.path.abspath("scripts")
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

import s3_uploader

# Source folder, destination bucket/prefix and AWS profile for the upload,
# gathered in one place so they are easy to spot and change.
upload_settings = {
    "local_folder": "data",
    "bucket_name": "bigdatabi",
    "s3_prefix": "raw",
    "profile_name": "GenAI_Permission-688567268018",
}

# Upload the local CSVs from ./data to S3
s3_uploader.upload_folder_to_s3(**upload_settings)


⬆️ Uploading products.csv to s3://bigdatabi/raw/products.csv
⬆️ Uploading orders.csv to s3://bigdatabi/raw/orders.csv
⬆️ Uploading order_products__train.csv to s3://bigdatabi/raw/order_products__train.csv
⬆️ Uploading departments.csv to s3://bigdatabi/raw/departments.csv
⬆️ Uploading aisles.csv to s3://bigdatabi/raw/aisles.csv
⬆️ Uploading order_products__prior.csv to s3://bigdatabi/raw/order_products__prior.csv
✅ All files uploaded to S3.


In [1]:
# ---------------------------------------------------
# ✅ Step 2: Read Instacart CSVs from S3 and inspect
# ---------------------------------------------------

from scripts.s3_reader import read_csv_from_s3

bucket = "bigdatabi"
profile = "GenAI_Permission-688567268018"
prefix = "raw"

# Fetch each dataset in turn; the object key is "<prefix>/<dataset>.csv".
# The tuple order below must match the order of the names being unpacked.
orders, prior, train, products, aisles, departments = [
    read_csv_from_s3(bucket, f"{prefix}/{dataset}.csv", profile)
    for dataset in (
        "orders",
        "order_products__prior",
        "order_products__train",
        "products",
        "aisles",
        "departments",
    )
]

# Report the row count of the main file as a quick load check
print(f"✅ Rows in order_products__prior.csv: {prior.shape[0]:,}")


✅ Rows in order_products__prior.csv: 32,434,489


In [4]:
!pip install xgboost scikit-learn


Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0


In [2]:
import importlib

import scripts.data_preprocessor as dp

# Re-execute the module so edits to scripts/data_preprocessor.py take effect
# without restarting the kernel; reload() hands back the refreshed module.
dp = importlib.reload(dp)

# Build the ML table from the frames loaded in Step 2
# (aisles and departments are not passed in).
df_ml = dp.prepare_features(orders, prior, train, products)


🔗 Merging prior orders with order info...
🧠 Creating user-product features from prior orders...
🏷️ Generating 0/1 labels from train users...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reordered_items['reordered'] = 1


✅ Final ML dataset: 8,474,661 rows, 9 columns
🔢 Label distribution:
reordered
0    7645837
1     828824
Name: count, dtype: int64


In [3]:
# ---------------------------------------------------------
# ✅ Step 4: Train and Evaluate XGBoost Model on Labeled Data
# ---------------------------------------------------------

# 📦 Import the model training function
from scripts.train_model import train_and_save_model

# 🧠 Train the model using the labeled df_ml dataset (~8.4M rows)
# ✨ The recorded run prints accuracy and a per-class precision/recall/F1 report
# 💾 and reports saving the trained model to models/xgb_model.pkl
# NOTE(review): in the recorded output the labels are roughly 9:1 negative and
# recall on class 1 is only 0.06, so the 0.90 accuracy mostly reflects the
# majority class. Consider class weighting or a tuned decision threshold in
# scripts/train_model.py before relying on this model.
train_and_save_model(df_ml)


🚀 Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



✅ Accuracy: 0.9040
📊 Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95   1529168
           1       0.59      0.06      0.11    165765

    accuracy                           0.90   1694933
   macro avg       0.75      0.53      0.53   1694933
weighted avg       0.88      0.90      0.87   1694933

💾 Model saved to models/xgb_model.pkl


In [4]:
!pip install kafka-python pandas joblib xgboost




In [5]:
!brew services start zookeeper
!brew services start kafka


Service `zookeeper` already started, use `brew services restart zookeeper` to restart.
Service `kafka` already started, use `brew services restart kafka` to restart.


In [6]:
# Write the whole df_ml table (every column, without the index) to a local CSV.
# NOTE(review): no column subset is selected here, so this presumably includes
# the `reordered` label as well. Confirm whether the downstream consumer expects
# only the prediction-time feature columns.
df_ml.to_csv("data/features_merged.csv", index=False)


In [None]:
import importlib

import scripts.kafka_producer as kp

# Re-execute the module so edits to scripts/kafka_producer.py take effect
# without restarting the kernel; reload() hands back the refreshed module.
kp = importlib.reload(kp)

# Local CSV inputs handed to the producer.
source_files = {
    "prior_csv": "data/order_products__prior.csv",
    "orders_csv": "data/orders.csv",
}

# Start streaming; the recorded run shows it publishing to the
# "reorder-stream" topic and the cell never finished (In [None]).
kp.stream_user_product_pairs(**source_files)


📦 Loading prior and order data...
📡 Streaming to Kafka topic: reorder-stream...
