# Memory-Efficient ML Pipeline for Farm To Feed Dataset (Colab-friendly)

This notebook is a Colab-ready version of pipeline_redo.ipynb. It downloads the required CSVs from the repository when possible and falls back to interactive upload. It also installs LightGBM and contains the same memory-efficient pipeline with downcasting and LightGBM models.


In [None]:
# Install dependencies
!pip install -q lightgbm
# requests is in the runtime by default; ipywidgets not required in Colab


In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import os, requests, io


In [None]:
# Data-loading: attempt to download from the repo at the provided commit, otherwise fall back to manual upload
print('Attempting to download CSVs from GitHub raw...')
base = 'https://raw.githubusercontent.com/clementina-tom/Feed-to-farm-competition/6fdcfec5fb497555503a589050ef8876818297dc/'
files = ['Train.csv', 'Test.csv', 'customer_data.csv', 'sku_data.csv']
for fname in files:
    url = base + fname
    out = fname
    try:
        r = requests.get(url, timeout=15)
        r.raise_for_status()
        # Context manager guarantees the handle is flushed and closed before
        # pandas reads the file further down.
        with open(out, 'wb') as fh:
            fh.write(r.content)
        print('Downloaded', out)
    except (requests.RequestException, OSError) as e:
        # Best effort: a failed download is reported and handled by the
        # upload fallback below.
        print('Could not download', out, '-', e)

# If any files missing, fall back to interactive upload in Colab
missing = [f for f in files if not os.path.exists(f)]
if missing:
    print('Missing files:', missing)
    try:
        from google.colab import files as colab_files
        uploaded = colab_files.upload()
        for k, v in uploaded.items():
            with open(k, 'wb') as fh:
                fh.write(v)
            print('Uploaded', k)
    except Exception as e:
        # Deliberately broad: outside Colab the import fails, and the upload
        # widget can fail in several ways; either way we just report it.
        print('Colab upload unavailable or skipped:', e)

# Fail early with an explicit list instead of a bare pandas error on the first absent file.
still_missing = [f for f in files if not os.path.exists(f)]
if still_missing:
    raise FileNotFoundError(f'Required CSV files not found: {still_missing}')

# Read CSVs into dataframes
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
customer = pd.read_csv('customer_data.csv')
sku = pd.read_csv('sku_data.csv')
print('Shapes:', train.shape, test.shape, customer.shape, sku.shape)


In [None]:
# Downcast helper and preprocessing
def downcast(df):
    """Shrink numeric columns of ``df`` to the smallest dtype that holds their values.

    * ``float64`` columns are converted to ``float32``.
    * ``int64`` columns are converted to the first of ``int8`` / ``int16`` /
      ``int32`` whose range contains BOTH the column minimum and maximum.
      Columns that do not fit in ``int32`` are left as ``int64``, and columns
      whose min/max is NaN (e.g. empty columns) are left untouched.

    The frame is modified in place and also returned, so both
    ``downcast(df)`` and ``df = downcast(df)`` work.
    """
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')
    for col in df.select_dtypes(include=['int64']).columns:
        min_val = df[col].min()
        max_val = df[col].max()
        if pd.isna(min_val) or pd.isna(max_val):
            continue
        # Checking only the max would let large negative values wrap around
        # silently when cast to a narrower signed type.
        for dtype in ('int8', 'int16', 'int32'):
            bounds = np.iinfo(dtype)
            if bounds.min <= min_val and max_val <= bounds.max:
                df[col] = df[col].astype(dtype)
                break
    return df

# Shrink the numeric columns of all four frames, in the same order as they were loaded.
train, test, customer, sku = (downcast(frame) for frame in (train, test, customer, sku))

# Parse the date columns: week_start on the transactional frames,
# customer_created_at on the customer metadata.
for frame in (train, test):
    frame['week_start'] = pd.to_datetime(frame['week_start'])
customer['customer_created_at'] = pd.to_datetime(customer['customer_created_at'])

gc.collect()
print('Downcasting and datetime conversion done')


In [None]:
# Merge metadata
# Each entry: (lookup frame, join key, suffix applied to clashing lookup columns).
metadata_joins = (
    (customer, 'customer_id', '_cust'),
    (sku, 'product_unit_variant_id', '_sku'),
)
for lookup, key, suffix in metadata_joins:
    # Left joins keep every transactional row even when the lookup has no match.
    train = train.merge(lookup, on=key, how='left', suffixes=('', suffix))
    test = test.merge(lookup, on=key, how='left', suffixes=('', suffix))
gc.collect()
print('Merged customer and sku metadata')


In [None]:
# Feature engineering using a memory-friendly approach
# Train and test rows are stacked so that test weeks can see the train history
# of the same (customer, product) pair. Test rows carry a placeholder quantity of 0.0.
# NOTE(review): the rolling windows include the current row, so for test rows the
# placeholder 0.0 is part of the window — confirm this is intended.
key_cols = ['customer_id', 'product_unit_variant_id']
full_df = pd.concat(
    [train[key_cols + ['week_start', 'qty_this_week']],
     test[key_cols + ['week_start']].assign(qty_this_week=0.0)],
    # Train and test both carry their own row labels; without a fresh index the
    # stacked frame has duplicate labels and index-aligned assignment breaks.
    ignore_index=True,
)
full_df = downcast(full_df)
full_df = full_df.sort_values(key_cols + ['week_start'])

print('Creating lag & rolling features...')
# Build the grouping once and reuse it for every feature.
qty_by_pair = full_df.groupby(key_cols)['qty_this_week']
for lag in (1, 2, 3):
    full_df[f'lag{lag}_qty'] = qty_by_pair.shift(lag)

# groupby(...).rolling(...) prepends one index level per group key. Both key
# levels must be dropped so the result is indexed by the original row labels
# and aligns with full_df on assignment.
full_df['rolling_mean_4'] = qty_by_pair.rolling(4).mean().reset_index(level=[0, 1], drop=True)
full_df['rolling_max_4'] = qty_by_pair.rolling(4).max().reset_index(level=[0, 1], drop=True)

# Rows without enough history (start of each series) get 0 instead of NaN.
full_df[['lag1_qty', 'lag2_qty', 'lag3_qty', 'rolling_mean_4', 'rolling_max_4']] = full_df[['lag1_qty', 'lag2_qty', 'lag3_qty', 'rolling_mean_4', 'rolling_max_4']].fillna(0)
gc.collect()


In [None]:
# Merge features back into train/test
feature_cols = ['lag1_qty', 'lag2_qty', 'lag3_qty', 'rolling_mean_4', 'rolling_max_4']
merge_keys = ['customer_id', 'product_unit_variant_id', 'week_start']

# Only the join keys and the engineered columns are carried over.
engineered = full_df[merge_keys + feature_cols]
train = train.merge(engineered, on=merge_keys, how='left')
test = test.merge(engineered, on=merge_keys, how='left')

# Release the stacked frame (and its slice) now that both merges are done.
del full_df, engineered
gc.collect()
print('Features merged')


In [None]:
# Target generation
# For each (customer, product) series, the targets are the quantity recorded in
# the next row (1w) and the row after that (2w), plus a purchased/not-purchased flag.
# NOTE(review): shift(-k) takes the k-th following ROW of the series; this equals
# "k weeks ahead" only if every series has one row per consecutive week — confirm.
print('Creating targets...')
group_keys = ['customer_id', 'product_unit_variant_id']
train = train.sort_values(group_keys + ['week_start'])
qty_by_pair = train.groupby(group_keys)['qty_this_week']
next_qty_1w = qty_by_pair.shift(-1)
next_qty_2w = qty_by_pair.shift(-2)

train['target_purchase_1w'] = (next_qty_1w > 0).astype(int)
train['target_qty_1w'] = next_qty_1w.fillna(0)
train['target_purchase_2w'] = (next_qty_2w > 0).astype(int)
train['target_qty_2w'] = next_qty_2w.fillna(0)

# Drop rows whose 1-week-ahead quantity is unknown (the tail of each series).
# This has to be decided on the raw shifted values: `(NaN > 0).astype(int)` is 0,
# so a dropna() on the integer target column never removes anything and those
# rows would be trained on as fake "no purchase" examples.
# NOTE(review): rows with a known 1w but unknown 2w target still carry 0 for the
# 2w targets — consider filtering them when fitting the 2w models.
train = train[next_qty_1w.notna()].copy()
gc.collect()
print('Targets created')


In [None]:
# Encode categorical columns robustly
# Each column is label-encoded with an encoder fitted on train and test together,
# so every category seen in test has a code. Missing values become the string 'nan'
# via astype(str) and are encoded like any other category.
cat_cols = ['customer_category', 'customer_status', 'grade_name', 'unit_name']
encoded_cols = []
for col in cat_cols:
    if col not in train.columns or col not in test.columns:
        print(f'Skipping {col}: not present in both train and test')
        continue
    le = LabelEncoder()
    le.fit(pd.concat([train[col].astype(str), test[col].astype(str)]))
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))
    encoded_cols.append(col)

# Only list categorical columns that were actually encoded; a skipped column would
# otherwise raise a KeyError (or feed raw strings to the model) downstream.
features = ['selling_price'] + encoded_cols + feature_cols
print('Categorical encoding done')


In [None]:
# Modeling
# One LightGBM model per target: classifiers for the purchase flags, regressors for
# the quantities. The most recent weeks of train are held out for early stopping
# and for the reported validation metric.
print('Training models...')
weeks = sorted(train['week_start'].unique())
# Hold out up to two weeks, but always leave at least one week to train on
# (with exactly two weeks, holding out both would leave an empty training set).
n_val_weeks = min(2, len(weeks) - 1)
if n_val_weeks < 1:
    raise ValueError('Need at least two distinct weeks in train to hold out a validation set')
val_weeks = weeks[-n_val_weeks:]
val_mask = train['week_start'].isin(val_weeks)

# The feature matrix and its split are identical for every target, so build them once.
X = train[features].fillna(0)
X_train, X_val = X[~val_mask], X[val_mask]

models = {}
targets = ['target_purchase_1w', 'target_qty_1w', 'target_purchase_2w', 'target_qty_2w']
for target in targets:
    print(f'Training {target}...')
    y = train[target]
    y_train, y_val = y[~val_mask], y[val_mask]
    if 'purchase' in target:
        model = lgb.LGBMClassifier(n_estimators=1000, early_stopping_rounds=50, verbose=-1)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='auc')
        pred_val = model.predict_proba(X_val)[:, 1]
        # roc_auc_score raises when only one class is present in the labels.
        if y_val.nunique() > 1:
            auc = roc_auc_score(y_val, pred_val)
            print(f'AUC for {target}: {auc}')
        else:
            print(f'AUC for {target}: undefined (validation labels contain a single class)')
    else:
        model = lgb.LGBMRegressor(n_estimators=1000, early_stopping_rounds=50, verbose=-1)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], eval_metric='mae')
        pred_val = model.predict(X_val)
        mae = mean_absolute_error(y_val, pred_val)
        print(f'MAE for {target}: {mae}')
    models[target] = model
    gc.collect()
print('Modeling complete')


In [None]:
# Generate predictions and save submission.csv
print('Generating predictions...')
test_features = test[features].fillna(0)

# (submission column, key of the fitted model in `models`), in output order.
prediction_plan = [
    ('Target_purchase_next_1w', 'target_purchase_1w'),
    ('Target_qty_next_1w', 'target_qty_1w'),
    ('Target_purchase_next_2w', 'target_purchase_2w'),
    ('Target_qty_next_2w', 'target_qty_2w'),
]
for out_col, model_key in prediction_plan:
    fitted = models[model_key]
    if 'purchase' in model_key:
        # Classifiers: keep the probability of the positive class.
        test[out_col] = fitted.predict_proba(test_features)[:, 1]
    else:
        # Regressors: raw predicted quantity.
        test[out_col] = fitted.predict(test_features)

submission = test[['ID'] + [out_col for out_col, _ in prediction_plan]]
submission.to_csv('submission.csv', index=False)
print('submission.csv saved')
