# Importing Necessary Libraries

In [2]:
import pandas as pd
import numpy as np
import os
import pickle
from collections import defaultdict

# Data Processing

In [3]:
def preprocess_dataset(dataset_name, base_path, output_dir):
    print(f"\n=== Processing {dataset_name.upper()} Dataset ===")
    sessions, item_counts = load_and_create_sessions(dataset_name, base_path)
    indexed_sessions, item_mapping = filter_and_index_sessions(sessions, item_counts)
    train_X, train_Y, test_X, test_Y, train_sessions = split_and_format_sessions(indexed_sessions)
    save_data(output_dir, train_X, train_Y, test_X, test_Y, train_sessions, item_mapping)

## Create Sessions

In [4]:
def load_and_create_sessions(dataset_name, base_path, session_timeout_ms=30 * 60 * 1000):
    if dataset_name == 'diginetica':
        df = pd.read_csv(os.path.join(base_path, 'train-item-views.csv'), sep=';')
        df.rename(columns={'sessionId': 'visitorid', 'itemId': 'itemid', 'timeframe': 'timestamp'}, inplace=True)

    elif dataset_name == 'retailrocket':
        df = pd.read_csv(os.path.join(base_path, 'events.csv'), header=None,
                         names=['timestamp', 'visitorid', 'event', 'itemid', 'transactionid'])
        df = df[1:]  # skip header row
        df = df[df['event'] == 'view']

    else:
        raise ValueError(f"Unsupported dataset: {dataset_name}")

    df['timestamp'] = df['timestamp'].astype(int)
    df['visitorid'] = df['visitorid'].astype(str)
    df = df.sort_values(['visitorid', 'timestamp'])

    # 30-minute session timeout
    session_data = []
    item_counts = defaultdict(int)

    print(f"Creating sessions for {dataset_name} using 30-minute timeout...")
    for visitor_id, group in df.groupby('visitorid'):
        group = group.sort_values('timestamp')
        last_time = group.iloc[0]['timestamp']
        current_session = []

        for _, row in group.iterrows():
            if row['timestamp'] - last_time > session_timeout_ms:
                if len(current_session) > 1:
                    session_data.append(current_session)
                    for item in current_session:
                        item_counts[item] += 1
                current_session = []
            current_session.append(int(row['itemid']))
            last_time = row['timestamp']

        if len(current_session) > 1:
            session_data.append(current_session)
            for item in current_session:
                item_counts[item] += 1

    print(f"Initial sessions formed: {len(session_data)}")
    return session_data, item_counts

## Filtering

In [5]:
def filter_and_index_sessions(session_data, item_counts, min_item_freq=6):
    items_to_keep = {item for item, count in item_counts.items() if count >= min_item_freq}

    filtered_sessions = []
    for session in session_data:
        filtered = [item for item in session if item in items_to_keep]
        if len(filtered) > 1:
            filtered_sessions.append(filtered)

    item_mapping = {item: idx + 1 for idx, item in enumerate(sorted(items_to_keep))}
    indexed_sessions = [[item_mapping[item] for item in session] for session in filtered_sessions]

    print(f"Sessions after filtering: {len(filtered_sessions)}")
    print(f"Unique items kept: {len(item_mapping)}")
    return indexed_sessions, item_mapping

## Splitting

In [6]:
def split_and_format_sessions(indexed_sessions):
    np.random.seed(42)
    indices = np.arange(len(indexed_sessions))
    np.random.shuffle(indices)
    train_cutoff = int(0.8 * len(indices))
    train_indices = indices[:train_cutoff]
    test_indices = indices[train_cutoff:]

    train_sessions = [indexed_sessions[i] for i in train_indices]
    test_sessions = [indexed_sessions[i] for i in test_indices]

    def create_xy(sessions):
        X, Y = [], []
        for session in sessions:
            for i in range(1, len(session)):
                X.append(session[:i])
                Y.append(session[i])
        return X, Y

    train_X, train_Y = create_xy(train_sessions)
    test_X, test_Y = create_xy(test_sessions)

    return train_X, train_Y, test_X, test_Y, train_sessions

## Save the data

In [7]:
def save_data(output_dir, train_X, train_Y, test_X, test_Y, train_sessions, item_mapping):
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, 'train.txt'), 'wb') as f:
        pickle.dump((train_X, train_Y), f)
    with open(os.path.join(output_dir, 'test.txt'), 'wb') as f:
        pickle.dump((test_X, test_Y), f)
    with open(os.path.join(output_dir, 'all_train_seq.txt'), 'wb') as f:
        pickle.dump(train_sessions, f)

    print(f"Train samples: {len(train_X)} | Test samples: {len(test_X)} | Items: {len(item_mapping)}")
    print(f"Data saved to {output_dir}")

# Run the pipeline

In [8]:
preprocess_dataset('diginetica', '/kaggle/input/diginetica-dataset/', './datasets/diginetica/')


=== Processing DIGINETICA Dataset ===
Creating sessions for diginetica using 30-minute timeout...
Initial sessions formed: 219630
Sessions after filtering: 201580
Unique items kept: 37866
Train samples: 611113 | Test samples: 152576 | Items: 37866
Data saved to ./datasets/diginetica/


In [9]:
preprocess_dataset('retailrocket', '/kaggle/input/ecommerce-dataset/', './datasets/retailrocket/')


=== Processing RETAILROCKET Dataset ===


  df = pd.read_csv(os.path.join(base_path, 'events.csv'), header=None,


Creating sessions for retailrocket using 30-minute timeout...
Initial sessions formed: 367652
Sessions after filtering: 297244
Unique items kept: 42964
Train samples: 604751 | Test samples: 149162 | Items: 42964
Data saved to ./datasets/retailrocket/
