<a href="https://colab.research.google.com/github/fkhandley/msds6925/blob/main/MSDS6825_practicum_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import MinMaxScaler
import time

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
dtype_dict = {
    'order_id': 'string',
    'user_id': 'string',
    'payment_method': 'category',
    'user_zipcode_current': 'category',
    'gender': 'category',
    'product_category': 'category',
    'strain_type': 'category'
}

In [49]:
orders_load = pd.read_csv('/content/drive/MyDrive/orders.csv', dtype=dtype_dict)

In [50]:
orders_load['delivery_datetime'] = pd.to_datetime(orders_load['delivery_datetime'])

In [51]:
user_orders = orders_load.groupby('user_id').size().reset_index(name='order_count')
print(f"Original number of unique users: {len(user_orders)}")
user_orders5 = user_orders[user_orders['order_count'] >= 5]
print(f"Users with 5+ orders: {len(user_orders5)}")
users5 = user_orders5['user_id'].unique()
user_size = round(len(users5)*0.01,0)
print(f"Taking sample of {user_size} users")
user_sample = np.random.choice(users5, size=int(user_size), replace=False)
orders_load = orders_load[orders_load['user_id'].isin(user_sample)]
final_orders_per_user = orders_load.groupby('user_id').size()
print("\nFinal orders per user:")
print(final_orders_per_user.describe())

Original number of unique users: 884901
Users with 5+ orders: 265985
Taking sample of 2660.0 users

Final orders per user:
count    2660.000000
mean       20.513158
std        26.585777
min         5.000000
25%         7.000000
50%        11.000000
75%        23.000000
max       319.000000
dtype: float64


In [52]:
freq_df = orders_load[['order_id','user_id','delivery_datetime']].copy()
freq_df.drop_duplicates('order_id', inplace=True)
freq_df.sort_values('delivery_datetime', inplace=True)

# Create the shifted dates within each user group
freq_df['last_delivery_date'] = freq_df.groupby('user_id')['delivery_datetime'].shift(1)

# Calculate the days between orders
freq_df['days_since_order'] = (freq_df['delivery_datetime'] - freq_df['last_delivery_date']).dt.days

In [53]:
avg_freq_user = freq_df.groupby('user_id')['days_since_order'].mean().reset_index()
avg_freq_user.rename(columns={'days_since_order':'avg_freq'}, inplace=True)
orders_load = orders_load.merge(freq_df[['order_id','days_since_order']], on='order_id', how='left').merge(avg_freq_user[['user_id','avg_freq']], on='user_id', how='left')

In [54]:
del freq_df, avg_freq_user

In [55]:
orders_load.head()

Unnamed: 0,order_id,user_id,delivery_datetime,wait_time,promo_credit,order_price_total,promo_code,payment_method,days_since_order,avg_freq
0,12985250,6820,2024-02-17 02:44:36.270,75.62,30.0,143.0,friyay 02162024,Pin Debit,46.0,68.346154
1,8331652,543096,2021-01-15 20:25:46.120,78.51,2.55,99.52,lit15,ACH,183.0,166.285714
2,7735286,703170,2020-09-30 19:04:41.300,28.12,0.0,116.0,,Cash,53.0,87.631579
3,6531074,671495,2020-02-09 19:06:34.763,9.74,6.25,13.0,power2920,POB,31.0,35.25
4,12619333,671495,2023-10-30 00:02:39.087,39.57,10.5,60.88,spooky,ACH,4.0,35.25


# Key input: days of orders lookback

In [56]:
end_date = orders_load['delivery_datetime'].max()
#start_date = end_date - timedelta(days=4500000)

#orders_reduced = orders_load[orders_load['delivery_datetime']>=start_date]
orders_reduced = orders_load.copy()

In [57]:
del orders_load

In [58]:
users_load = pd.read_csv('/content/drive/MyDrive/users.csv', dtype=dtype_dict)

In [59]:
orders = orders_reduced.merge(users_load, on='user_id', how='left')

In [60]:
del users_load, orders_reduced

In [61]:
items_load = pd.read_csv('/content/drive/MyDrive/items.csv', dtype=dtype_dict)
items_load.drop('user_id', inplace=True, axis=1)

In [62]:
category_map = {'Accessories':'other',
                'Drops':'other',
                'Concentrates':'other',
                'Topicals':'other',
                'Prerolls':'prerolls',
                'Vaporizers':'vaporizers',
                'Flowers':'flower',
                'Edibles':'edibles'}

items_load['product_category'] = items_load['product_category'].map(category_map)

In [63]:
items_groupby = items_load.groupby(['order_id','product_category']).agg({'quantity':'sum'}).reset_index()

reshaped_items = (items_groupby.pivot(index='order_id',columns='product_category',values=['quantity']).reset_index())

In [64]:
reshaped_items.columns = reshaped_items.columns.droplevel(0)
reshaped_items = reshaped_items.reset_index()
reshaped_items = reshaped_items.rename(columns={'': 'order_id'})


reshaped_items.head()

product_category,index,order_id,edibles,flower,other,prerolls,vaporizers
0,0,10000000,3.0,,,,
1,1,10000001,,,,1.0,1.0
2,2,10000002,2.0,,,2.0,1.0
3,3,10000003,2.0,2.0,,,
4,4,10000004,,2.0,1.0,,


In [65]:
reshaped = reshaped_items[['order_id','edibles','flower','prerolls','vaporizers']]
reshaped.head()

product_category,order_id,edibles,flower,prerolls,vaporizers
0,10000000,3.0,,,
1,10000001,,,1.0,1.0
2,10000002,2.0,,2.0,1.0
3,10000003,2.0,2.0,,
4,10000004,,2.0,,


In [66]:
orders = orders.merge(reshaped, on='order_id', how='left')

In [67]:
orders = orders.drop('promo_code', axis=1)
orders['account_created_at'] = pd.to_datetime(orders['account_created_at'])
orders['birthdate'] = pd.to_datetime(orders['birthdate'])

In [68]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54565 entries, 0 to 54564
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   order_id              54565 non-null  string        
 1   user_id               54565 non-null  string        
 2   delivery_datetime     54565 non-null  datetime64[ns]
 3   wait_time             54552 non-null  float64       
 4   promo_credit          54565 non-null  float64       
 5   order_price_total     54565 non-null  float64       
 6   payment_method        54565 non-null  category      
 7   days_since_order      51905 non-null  float64       
 8   avg_freq              54565 non-null  float64       
 9   user_zipcode_current  54565 non-null  category      
 10  account_created_at    54565 non-null  datetime64[ns]
 11  birthdate             53979 non-null  datetime64[ns]
 12  gender                40664 non-null  category      
 13  edibles         

In [69]:
del items_load, items_groupby, reshaped_items, reshaped

In [70]:
orders['age_at_purchase'] = round(orders.delivery_datetime.dt.year - orders.birthdate.dt.year)

orders['account_age_at_purchase'] =  (orders.delivery_datetime.dt.year - orders.account_created_at.dt.year)*12+orders.delivery_datetime.dt.month - orders.account_created_at.dt.month

In [71]:
orders['weekday'] = orders.delivery_datetime.dt.dayofweek
orders['hour'] = orders.delivery_datetime.dt.hour
orders['month'] = orders.delivery_datetime.dt.month

In [72]:
payment_map = {'POB':'card',
               'ACH':'ach',
               'Cash':'cash',
               'Pin Debit':'card',
               'Online Debit':'card',
               'Account Credit':'promotion'}

orders.payment_method = orders.payment_method.map(payment_map)

In [73]:
orders['is_payment_ach'] = (orders['payment_method'] == 'ach').astype(int)
orders['is_payment_card'] = (orders['payment_method'] == 'card').astype(int)
orders['is_payment_cash'] = (orders['payment_method'] == 'cash').astype(int)
orders['is_payment_promotion'] = (orders['payment_method'] == 'promotion').astype(int)

In [74]:
orders['is_male'] = (orders['gender'] == 'Male').astype(int)
orders['is_female'] = (orders['gender'] == 'Female').astype(int)

In [75]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54565 entries, 0 to 54564
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   order_id                 54565 non-null  string        
 1   user_id                  54565 non-null  string        
 2   delivery_datetime        54565 non-null  datetime64[ns]
 3   wait_time                54552 non-null  float64       
 4   promo_credit             54565 non-null  float64       
 5   order_price_total        54565 non-null  float64       
 6   payment_method           54565 non-null  object        
 7   days_since_order         51905 non-null  float64       
 8   avg_freq                 54565 non-null  float64       
 9   user_zipcode_current     54565 non-null  category      
 10  account_created_at       54565 non-null  datetime64[ns]
 11  birthdate                53979 non-null  datetime64[ns]
 12  gender                   40664 n

In [76]:
fill_values = {
    'wait_time': orders['wait_time'].mean(),
    'days_since_order': orders['days_since_order'].mean(),
    'avg_freq': orders['avg_freq'].mean(),
    'edibles': 0,
    'flower': 0,
    'prerolls': 0,
    'vaporizers': 0,
    'age_at_purchase': orders['age_at_purchase'].mean()
    }

orders = orders.fillna(fill_values)

In [77]:
orders_final = orders.drop(['order_id','account_created_at','birthdate','gender','payment_method','wait_time'], axis=1).copy()

In [78]:
orders_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54565 entries, 0 to 54564
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   user_id                  54565 non-null  string        
 1   delivery_datetime        54565 non-null  datetime64[ns]
 2   promo_credit             54565 non-null  float64       
 3   order_price_total        54565 non-null  float64       
 4   days_since_order         54565 non-null  float64       
 5   avg_freq                 54565 non-null  float64       
 6   user_zipcode_current     54565 non-null  category      
 7   edibles                  54565 non-null  float64       
 8   flower                   54565 non-null  float64       
 9   prerolls                 54565 non-null  float64       
 10  vaporizers               54565 non-null  float64       
 11  age_at_purchase          54565 non-null  float64       
 12  account_age_at_purchase  54565 n

In [79]:
user_count = orders_final.user_id.nunique()
user_count

2660

# Deep Learning START

In [80]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Configure GPU memory growth
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled")

        # Set mixed precision policy
        tf.keras.mixed_precision.set_global_policy('mixed_float16')
        print("Mixed precision policy set to float16")
    except RuntimeError as e:
        print(e)

Num GPUs Available:  1
GPU memory growth enabled
Mixed precision policy set to float16


In [81]:
def create_sequences(user_id, df, sequence_length=180):

    user_id = str(user_id)
    user_orders = df[df['user_id'].astype(str) == user_id].sort_values('delivery_datetime')

    features = ['promo_credit', 'order_price_total',
                'days_since_order', 'avg_freq', 'edibles', 'flower',
                'prerolls', 'vaporizers', 'age_at_purchase',
                'account_age_at_purchase', 'weekday', 'hour', 'month',
                'is_payment_ach', 'is_payment_card', 'is_payment_cash',
                'is_payment_promotion', 'is_male', 'is_female']

    sequences = user_orders[features].values

    X = []
    y = []

    if len(sequences) <= sequence_length:
        return np.array([]), np.array([])

    for i in range(len(sequences) - sequence_length):
        X.append(sequences[i:i+sequence_length])
        # Predict if order will occur in next 7 days (1) or not (0)
        next_order_time = user_orders.iloc[i+sequence_length]['delivery_datetime']
        week_after = user_orders.iloc[i+sequence_length-1]['delivery_datetime'] + timedelta(days=7)
        y.append(1 if next_order_time <= week_after else 0)

    if len(X) == 0:
        return np.array([]), np.array([])

    return np.array(X), np.array(y)

In [82]:
def create_batch_sequences(df, batch_size=1000, sequence_length=180):
    print(f"Starting function with df shape: {df.shape}")
    print(f"Memory usage of input df: {df.memory_usage().sum() / 1024 / 1024:.2f} MB")

    df = df.sort_values('delivery_datetime')

    features = ['promo_credit', 'order_price_total',
                'days_since_order', 'avg_freq', 'edibles', 'flower',
                'prerolls', 'vaporizers', 'age_at_purchase',
                'account_age_at_purchase', 'weekday', 'hour', 'month',
                'is_payment_ach', 'is_payment_card', 'is_payment_cash',
                'is_payment_promotion', 'is_male', 'is_female']

    end_dates = pd.date_range(
        start=df['delivery_datetime'].min() + pd.Timedelta(days=sequence_length),
        end=df['delivery_datetime'].max(),
        freq='7D'
    )
    print(f"Created {len(end_dates)} end dates")

    X_batches = []
    y_batches = []

    for i, end_date in enumerate(end_dates):
        batch_start_time = time.time()
        print(f"\nProcessing batch {i+1}/{len(end_dates)}")

        start_date = end_date - pd.Timedelta(days=sequence_length)
        print(f"Date range: {start_date} to {end_date}")

        # Filter data for this time window
        batch_data = df[
            (df['delivery_datetime'] > start_date) &
            (df['delivery_datetime'] <= end_date)
        ].copy()  # Create copy to avoid SettingWithCopyWarning

        print(f"Batch data shape: {batch_data.shape}")
        if len(batch_data) == 0:
            print("Empty batch, skipping...")
            continue

        batch_users = batch_data['user_id'].unique()
        print(f"Found {len(batch_users)} unique users in batch")

        # Pre-allocate lists with estimated size
        batch_X = []
        batch_y = []

        # Process users in smaller chunks
        chunk_size = 100
        for j in range(0, len(batch_users), chunk_size):
            chunk_start_time = time.time()
            user_chunk = batch_users[j:j + chunk_size]
            print(f"Processing users {j}-{j+len(user_chunk)}/{len(batch_users)}")

            for user_id in user_chunk:
                user_orders = batch_data[batch_data['user_id'] == user_id]

                if len(user_orders) <= sequence_length:
                    continue

                # Extract feature sequences
                sequences = user_orders[features].values

                # Create sequences for this user
                for k in range(len(sequences) - sequence_length):
                    seq = sequences[k:k+sequence_length]
                    next_order_time = user_orders.iloc[k+sequence_length]['delivery_datetime']
                    week_after = user_orders.iloc[k+sequence_length-1]['delivery_datetime'] + timedelta(days=7)
                    y_value = 1 if next_order_time <= week_after else 0

                    batch_X.append(seq)
                    batch_y.append(y_value)

            chunk_time = time.time() - chunk_start_time
            print(f"Chunk processing time: {chunk_time:.2f} seconds")
            print(f"Current batch_X length: {len(batch_X)}")

        if batch_X:
            X_batches.append(np.array(batch_X))
            y_batches.append(np.array(batch_y))
            print(f"Batch shape: {X_batches[-1].shape}")

        batch_time = time.time() - batch_start_time
        print(f"Total batch processing time: {batch_time:.2f} seconds")

    return X_batches, y_batches

In [83]:
def create_model(sequence_length, n_features):
    # Use CuDNNLSTM if GPU is available
    lstm_layer = layers.LSTM if tf.test.is_built_with_cuda() else layers.LSTM

    model = models.Sequential([
        lstm_layer(64, return_sequences=True,
                  input_shape=(sequence_length, n_features),
                  dtype='float16', # Use float16 for mixed precision
                   unit_forget_bias=True,
                   implementation=2),
        layers.Dropout(0.2),
        lstm_layer(32,
                   dtype = 'float16',
                   unit_forget_bias=True,
                   implementation=2),
        layers.Dropout(0.2),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])

    # Use mixed precision optimizer
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)

    model.compile(optimizer=optimizer,
                 loss='binary_crossentropy',
                 metrics=['accuracy', tf.keras.metrics.AUC()])

    return model

In [84]:
def train_model_with_validation(orders_df, batch_size=1000, epochs=10):
    print("Starting training preparation...")
    # Sort data by datetime
    orders_df = orders_df.sort_values('delivery_datetime')

    # Split into train and validation
    train_cutoff = orders_df['delivery_datetime'].max() - timedelta(days=14)
    train_df = orders_df[orders_df['delivery_datetime'] <= train_cutoff]
    val_df = orders_df[orders_df['delivery_datetime'] > train_cutoff]

    print("Creating training sequences...")
    # Create sequences
    X_train_batches, y_train_batches = create_batch_sequences(train_df, batch_size)
    print("Creating validation sequences...")
    X_val_batches, y_val_batches = create_batch_sequences(val_df, batch_size)

    if not X_train_batches:
        raise ValueError("No training sequences were created. Check your data and sequence creation logic.")

    # Get dimensions for model creation
    sequence_length = X_train_batches[0].shape[1]
    n_features = X_train_batches[0].shape[2]

    print(f"Sequence length: {sequence_length}, Number of features: {n_features}")

    # Create and train model
    print("Creating model...")
    model = create_model(sequence_length, n_features)
    history = {'loss': [], 'val_loss': [], 'accuracy': [], 'val_accuracy': [],
               'auc': [], 'val_auc': []}

    for epoch in range(epochs):
        print(f'Epoch {epoch+1}/{epochs}')
        epoch_history = {'loss': [], 'accuracy': [], 'auc': []}

        # Train on batches
        for i in range(len(X_train_batches)):
            batch_history = model.train_on_batch(X_train_batches[i],
                                               y_train_batches[i],
                                               return_dict=True)
            for key in epoch_history.keys():
                epoch_history[key].append(batch_history[key])
            print(f"Batch {i+1}/{len(X_train_batches)} - loss: {batch_history['loss']:.4f}")

        # Calculate validation metrics
        val_metrics = {'val_loss': [], 'val_accuracy': [], 'val_auc': []}
        for i in range(len(X_val_batches)):
            val_batch_metrics = model.test_on_batch(X_val_batches[i],
                                                  y_val_batches[i],
                                                  return_dict=True)
            for key in val_metrics.keys():
                val_metrics[key].append(val_batch_metrics[key[4:]])  # Remove 'val_' prefix

        # Update history with epoch averages
        for key in epoch_history.keys():
            history[key].append(np.mean(epoch_history[key]))
        for key in val_metrics.keys():
            history[key].append(np.mean(val_metrics[key]))

        # Print epoch metrics
        print(f'loss: {history["loss"][-1]:.4f} - accuracy: {history["accuracy"][-1]:.4f} - '
              f'auc: {history["auc"][-1]:.4f} - val_loss: {history["val_loss"][-1]:.4f} - '
              f'val_accuracy: {history["val_accuracy"][-1]:.4f} - val_auc: {history["val_auc"][-1]:.4f}')

    return model, history

In [87]:
model, history = train_model_with_validation(orders_final, batch_size=1000, epochs=10)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing users 500-600/1392
Chunk processing time: 0.15 seconds
Current batch_X length: 0
Processing users 600-700/1392
Chunk processing time: 0.15 seconds
Current batch_X length: 0
Processing users 700-800/1392
Chunk processing time: 0.15 seconds
Current batch_X length: 0
Processing users 800-900/1392
Chunk processing time: 0.15 seconds
Current batch_X length: 0
Processing users 900-1000/1392
Chunk processing time: 0.15 seconds
Current batch_X length: 0
Processing users 1000-1100/1392
Chunk processing time: 0.15 seconds
Current batch_X length: 0
Processing users 1100-1200/1392
Chunk processing time: 0.15 seconds
Current batch_X length: 0
Processing users 1200-1300/1392
Chunk processing time: 0.15 seconds
Current batch_X length: 0
Processing users 1300-1392/1392
Chunk processing time: 0.14 seconds
Current batch_X length: 0
Total batch processing time: 2.10 seconds

Processing batch 118/237
Date range: 2022-03-30 00:27:2

ValueError: No training sequences were created. Check your data and sequence creation logic.

In [None]:
def plot_training_history(history):
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

    # Plot loss
    ax1.plot(history['loss'], label='Training Loss')
    ax1.plot(history['val_loss'], label='Validation Loss')
    ax1.set_title('Model Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()

    # Plot accuracy
    ax2.plot(history['accuracy'], label='Training Accuracy')
    ax2.plot(history['val_accuracy'], label='Validation Accuracy')
    ax2.set_title('Model Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.legend()

    # Plot AUC
    ax3.plot(history['auc'], label='Training AUC')
    ax3.plot(history['val_auc'], label='Validation AUC')
    ax3.set_title('Model AUC')
    ax3.set_xlabel('Epoch')
    ax3.set_ylabel('AUC')
    ax3.legend()

    plt.tight_layout()
    plt.show()

In [None]:
plot_training_history(history)

Time Series Analysis with a pre build Keras model