# Memory-Efficient ML Pipeline for Farm To Feed Dataset (Redo)

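This notebook implements a memory-efficient machine learning pipeline that predicts customer purchasing behavior over 1-week and 2-week windows, using pandas for data handling, explicit garbage collection (`gc`) to keep memory usage low, and LightGBM for modeling. It includes FileUpload widgets for easy file handling in Colab; when no file is uploaded, the CSVs are read from the working directory instead.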

In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from ipywidgets import FileUpload
import io

In [4]:
# File Upload Widgets
# One single-file CSV picker per input table; each button is labelled with the
# file name it expects. The four names are read again by the data-loading cell.
upload_train, upload_test, upload_customer, upload_sku = (
    FileUpload(accept='.csv', multiple=False, description=label)
    for label in ('Train.csv', 'Test.csv', 'customer_data.csv', 'sku_data.csv')
)

# Render the four pickers as this cell's output.
display(upload_train, upload_test, upload_customer, upload_sku)

FileUpload(value={}, accept='.csv', description='Train.csv')

FileUpload(value={}, accept='.csv', description='Test.csv')

FileUpload(value={}, accept='.csv', description='customer_data.csv')

FileUpload(value={}, accept='.csv', description='sku_data.csv')

In [None]:
# Step 1: Efficient Data Loading & Downcasting
def _read_csv_from_upload(widget, fallback_path):
    """Load a CSV from a FileUpload widget, or from disk if nothing was uploaded.

    Parameters
    ----------
    widget : ipywidgets.FileUpload
        The upload widget to read from.
    fallback_path : str
        CSV path used when the widget holds no uploaded file.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    uploads = widget.value
    if not uploads:
        return pd.read_csv(fallback_path)
    # ipywidgets 7 exposes `value` as a dict keyed by file name, while
    # ipywidgets 8 exposes a tuple of per-file entries. Both kinds of entry
    # provide the raw bytes under 'content'.
    entries = list(uploads.values()) if isinstance(uploads, dict) else list(uploads)
    return pd.read_csv(io.BytesIO(entries[0]['content']))


# NOTE: the upload_* widgets must already exist, so run the
# "File Upload Widgets" cell before this one.
print("Loading data...")
train = _read_csv_from_upload(upload_train, 'Train.csv')
test = _read_csv_from_upload(upload_test, 'Test.csv')
customer = _read_csv_from_upload(upload_customer, 'customer_data.csv')
sku = _read_csv_from_upload(upload_sku, 'sku_data.csv')

def downcast(df):
    """Shrink numeric columns of ``df`` in place to smaller dtypes.

    float64 columns become float32. Each int64 column becomes the narrowest
    of int8 / int16 / int32 whose signed range contains both the column's
    minimum and maximum; a column that does not fit in int32 stays int64,
    so no value is ever wrapped or truncated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    for col in df.select_dtypes(include=['int64']).columns:
        if df[col].empty:
            # No values to measure; leave the dtype alone.
            continue
        lo, hi = df[col].min(), df[col].max()
        # Check the real signed limits of each candidate. Comparing only the
        # maximum against 2**8 / 2**16 would accept values such as 200 for
        # int8 (range -128..127) and silently wrap them.
        for candidate in (np.int8, np.int16, np.int32):
            limits = np.iinfo(candidate)
            if limits.min <= lo and hi <= limits.max:
                df[col] = df[col].astype(candidate)
                break
    return df

# Shrink the numeric dtypes of all four tables.
train, test, customer, sku = (
    downcast(frame) for frame in (train, test, customer, sku)
)

# Parse the date columns into datetimes.
for frame, date_col in (
    (train, 'week_start'),
    (test, 'week_start'),
    (customer, 'customer_created_at'),
):
    frame[date_col] = pd.to_datetime(frame[date_col])

gc.collect()

Loading data...


NameError: name 'upload_train' is not defined

In [None]:
# Merge additional data
# Left joins keep every train/test row. Columns from a lookup table whose
# names collide with existing ones receive the table-specific suffix.
lookups = (
    (customer, 'customer_id', '_cust'),
    (sku, 'product_unit_variant_id', '_sku'),
)
for table, key, suffix in lookups:
    train = train.merge(table, on=key, how='left', suffixes=('', suffix))
    test = test.merge(table, on=key, how='left', suffixes=('', suffix))

gc.collect()

In [None]:
# Step 2: Smart Grid Creation & Feature Engineering
# Stack the train and test history into one frame so that lag features for
# test rows can look back into the training weeks.
pair_keys = ['customer_id', 'product_unit_variant_id']

# ignore_index=True gives the stacked frame unique row labels. Without it the
# labels of train and test are simply appended and can repeat, and the rolling
# results below could then not be aligned back onto full_df.
full_df = pd.concat(
    [train[pair_keys + ['week_start', 'qty_this_week']],
     # Test has no observed quantity; 0.0 is only a placeholder.
     test[pair_keys + ['week_start']].assign(qty_this_week=0.0)],
    ignore_index=True,
)
full_df = downcast(full_df)
full_df = full_df.sort_values(pair_keys + ['week_start'])

print("Creating features...")
# NOTE(review): shifts are by row position within a (customer, product) pair.
# They equal "N weeks ago" only if every pair has one row per consecutive
# week — confirm against the data.
for lag in (1, 2, 3):
    full_df[f'lag{lag}_qty'] = full_df.groupby(pair_keys)['qty_this_week'].shift(lag)

# Rolling statistics are taken over the lag-1 series, i.e. the four rows
# *before* the current one. Including the current row would feed the real
# current quantity into train rows but the 0.0 placeholder into test rows.
prev_window = full_df.groupby(pair_keys)['lag1_qty'].rolling(4)
# groupby(...).rolling(...) prepends both group keys to the result index.
# Drop both levels so only the original row label remains and the assignment
# aligns row by row.
full_df['rolling_mean_4'] = prev_window.mean().reset_index(level=[0, 1], drop=True)
full_df['rolling_max_4'] = prev_window.max().reset_index(level=[0, 1], drop=True)

# Rows without enough history have no lag/rolling value; treat that as zero.
engineered_cols = ['lag1_qty', 'lag2_qty', 'lag3_qty', 'rolling_mean_4', 'rolling_max_4']
full_df[engineered_cols] = full_df[engineered_cols].fillna(0)

gc.collect()

In [None]:
# Merge features back
feature_cols = ['lag1_qty', 'lag2_qty', 'lag3_qty', 'rolling_mean_4', 'rolling_max_4']

# Attach the engineered columns to train and test by (customer, product, week).
join_keys = ['customer_id', 'product_unit_variant_id', 'week_start']
engineered = full_df[join_keys + feature_cols]
train = train.merge(engineered, on=join_keys, how='left')
test = test.merge(engineered, on=join_keys, how='left')

# The stacked history is no longer needed; release it.
del full_df, engineered
gc.collect()

In [None]:
# Step 3: Target Generation
print("Creating targets...")
pair_keys = ['customer_id', 'product_unit_variant_id']
train = train.sort_values(pair_keys + ['week_start'])

# NOTE(review): targets are the quantity 1 and 2 *rows* ahead within each
# (customer, product) pair. That is "1/2 weeks ahead" only if every pair has
# one row per consecutive week — confirm against the data.
qty_by_pair = train.groupby(pair_keys)['qty_this_week']
future_qty = {horizon: qty_by_pair.shift(-horizon) for horizon in (1, 2)}

# Number of later rows each row has inside its pair (0 for the last row).
rows_ahead = train.groupby(pair_keys).cumcount(ascending=False)

for horizon, qty_ahead in future_qty.items():
    train[f'target_purchase_{horizon}w'] = (qty_ahead > 0).astype(int)
    train[f'target_qty_{horizon}w'] = qty_ahead.fillna(0)

# Keep only rows whose 1- and 2-step-ahead quantities were actually observed.
# For the last rows of a pair the shift is NaN, which the expressions above
# turn into a label of 0. A dropna on the int target column can never remove
# those rows, so they are filtered explicitly here. .copy() detaches the
# result so later column assignments do not hit a view of the old frame.
train = train.loc[rows_ahead >= 2].copy()

gc.collect()

In [None]:
# Encode categoricals
cat_cols = ['customer_category', 'customer_status', 'grade_name', 'unit_name']
le = LabelEncoder()
for col in cat_cols:
    train_values, test_values = train[col], test[col]
    # These columns come from left merges, so unmatched rows can hold NaN.
    # LabelEncoder cannot order a mix of strings and NaN, so in that case
    # give missing values an explicit label and make everything a string.
    # Columns without missing values are encoded exactly as before.
    if train_values.isna().any() or test_values.isna().any():
        train_values = train_values.fillna('missing').astype(str)
        test_values = test_values.fillna('missing').astype(str)
    # Fit on train and test together so both share one code table and no
    # test value is unseen at transform time.
    le.fit(pd.concat([train_values, test_values]))
    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)

# Model inputs: price, the encoded categoricals, and the engineered history features.
features = ['selling_price', 'customer_category', 'customer_status', 'grade_name', 'unit_name'] + feature_cols

In [None]:
# Step 4: Modeling
print("Training models...")

# Time-based hold-out: the two latest week_start values in train form the
# validation set, everything earlier is used for fitting.
weeks = sorted(train['week_start'].unique())
val_weeks = weeks[-2:]
val_mask = train['week_start'].isin(val_weeks)

# One model per target, stored under the target's column name.
targets = ['target_purchase_1w', 'target_qty_1w', 'target_purchase_2w', 'target_qty_2w']
models = {}
for target in targets:
    print(f"Training {target}...")
    X, y = train[features], train[target]
    X_train, y_train = X[~val_mask], y[~val_mask]
    X_val, y_val = X[val_mask], y[val_mask]

    if 'purchase' not in target:
        # Quantity targets: regression, scored by mean absolute error.
        model = lgb.LGBMRegressor(n_estimators=1000, early_stopping_rounds=50, verbose=-1)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='mae')
        pred_val = model.predict(X_val)
        mae = mean_absolute_error(y_val, pred_val)
        print(f"MAE for {target}: {mae}")
    else:
        # Purchase targets: binary classification, scored by ROC AUC on the
        # predicted probability of the positive class.
        model = lgb.LGBMClassifier(n_estimators=1000, early_stopping_rounds=50, verbose=-1)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='auc')
        pred_val = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, pred_val)
        print(f"AUC for {target}: {auc}")

    models[target] = model
    gc.collect()

In [None]:
# Step 5: Submission
print("Generating predictions...")
# Slice the feature matrix once and reuse it for all four models.
X_test = test[features]

# Purchase columns hold the predicted probability of the positive class.
test['Target_purchase_next_1w'] = models['target_purchase_1w'].predict_proba(X_test)[:, 1]
test['Target_purchase_next_2w'] = models['target_purchase_2w'].predict_proba(X_test)[:, 1]

# A regressor can output values below zero, but a purchased quantity cannot
# be negative, so floor the quantity predictions at 0.
test['Target_qty_next_1w'] = np.clip(models['target_qty_1w'].predict(X_test), 0, None)
test['Target_qty_next_2w'] = np.clip(models['target_qty_2w'].predict(X_test), 0, None)

del X_test

# Column order of the submission file: ID, then 1-week and 2-week predictions.
submission = test[['ID', 'Target_purchase_next_1w', 'Target_qty_next_1w', 'Target_purchase_next_2w', 'Target_qty_next_2w']]
submission.to_csv('submission.csv', index=False)
print("Submission saved.")