
## graph based session intelligence

In [1]:
from genericpath import exists
import kagglehub

if not exists("data/.kaggle/events.csv"):
    path = kagglehub.dataset_download("retailrocket/ecommerce-dataset")
else:
    path = "data/.kaggle/events.csv"
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/nahomnadew/.cache/kagglehub/datasets/retailrocket/ecommerce-dataset/versions/2


In [2]:
#initial parameters

DATA_PATH = "data/kaggle/events.csv"  
SAMPLE_MAX_SESSIONS = 200000  
MIN_SESSION_LENGTH = 2  
NEO4J_URI = "bolt://localhost:7687"
NEO4J_AUTH = ("neo4j", "nahi1420")  
BATCH_SIZE = 5000  


In [3]:
#importing necessary libraries
import os
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from tqdm import tqdm
from neo4j import GraphDatabase
from sklearn.model_selection import train_test_split

print("pandas", pd.__version__)

pandas 2.3.3


In [4]:
# Load dataset 
assert os.path.exists(DATA_PATH), f"Data file not found: {DATA_PATH}"
print("Loading, this may take a while for the full RetailRocket dataset...")

# The RetailRocket 'events.csv' has columns similar to: sessionId, itemId, eventType, timestamp
usecols = None  
df = pd.read_csv(DATA_PATH, parse_dates=['timestamp'], low_memory=False)
print("Loaded rows:", len(df))
print("Columns:", df.columns.tolist())
df.head()

Loading, this may take a while for the full RetailRocket dataset...


  df = pd.read_csv(DATA_PATH, parse_dates=['timestamp'], low_memory=False)


Loaded rows: 2756101
Columns: ['timestamp', 'visitorid', 'event', 'itemid', 'transactionid']


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [None]:
# --- 1. Initial Column Renaming and Setup ---


cols = [c.lower() for c in df.columns]
colmap = {}

# Map visitorid to user_id, NOT session_id
if 'visitorid' in cols:
    colmap[[c for c in df.columns if c.lower()=='visitorid'][0]] = 'user_id' 
if 'itemid' in cols:
    colmap[[c for c in df.columns if c.lower()=='itemid'][0]] = 'event_id'
if 'event' in cols:
    colmap[[c for c in df.columns if c.lower()=='event'][0]] = 'event_type'
if 'timestamp' in cols:
    colmap[[c for c in df.columns if c.lower()=='timestamp'][0]] = 'timestamp'

df = df.rename(columns=colmap)
print('Renamed columns mapping:', colmap)
assert 'user_id' in df.columns and 'event_id' in df.columns and 'timestamp' in df.columns

# Convert timestamp to datetime and sort by user and time
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')
df = df.sort_values(['user_id','timestamp'])


# --- 2. The CRITICAL Sessionization Step ---

df['time_diff'] = df.groupby('user_id')['timestamp'].diff()

MAX_SESSION_GAP = pd.Timedelta('30 minutes')


df['new_session'] = (df['time_diff'].isnull()) | (df['time_diff'] > MAX_SESSION_GAP)

# Create the session_id by cumulatively summing the 'new_session' flag and grouping by user
df['session_id'] = df.groupby('user_id')['new_session'].cumsum().astype(str) + '_' + df['user_id'].astype(str)

# Add step_index per session (order by timestamp)
df['step_index'] = df.groupby('session_id').cumcount() + 1
df = df.drop(columns=['time_diff', 'new_session']) # Cleanup intermediate columns



# Filter out sessions that are too short (length < MIN_SESSION_LENGTH)
session_lengths = df.groupby('session_id').size()
valid_sessions = session_lengths[session_lengths >= MIN_SESSION_LENGTH].index
df = df[df['session_id'].isin(valid_sessions)].copy()

print("\n--- Sessionization Complete ---")
print(f"Total unique VISITOR IDs: {df['user_id'].nunique()}")
print(f"Total unique SESSIONS (after 30-min gap rule): {df['session_id'].nunique()}")
print(f"After filtering short sessions (<{MIN_SESSION_LENGTH}): {len(df)} rows")

Renamed columns mapping: {'visitorid': 'session_id', 'itemid': 'event_id', 'event': 'event_type', 'timestamp': 'timestamp'}
After filtering short sessions: 406020 unique sessions, 1754541 rows


In [6]:
# pick the first N sessions (keeps temporal order inside sessions)
if SAMPLE_MAX_SESSIONS is not None:
    unique_sess = df['session_id'].drop_duplicates().iloc[:SAMPLE_MAX_SESSIONS]
    df = df[df['session_id'].isin(unique_sess)].copy()
    print("Sampled sessions:", df['session_id'].nunique(), "rows:", len(df))

Sampled sessions: 200000 rows: 869349


In [7]:
# Save a sampled CSV for quick checks 
sample_out = "data/sampled_sessions.csv"
df.to_csv(sample_out, index=False)
print("Saved sample to", sample_out)

Saved sample to data/sampled_sessions.csv


In [8]:
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')

  df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms')


In [9]:
#Check Data Range and Determine Cutoff 
print("\n--- Data Range Check ---")
min_ts = df['timestamp'].min()
max_ts = df['timestamp'].max()
print(f"First event timestamp: {min_ts}")
print(f"Last event timestamp: {max_ts}")




--- Data Range Check ---
First event timestamp: 2015-05-03 03:00:04.384000
Last event timestamp: 2015-09-18 02:57:40.810000


In [10]:
total_duration = df['timestamp'].max() - df['timestamp'].min()
SEVEN_DAYS_DELTA = pd.Timedelta(days=7)
SPLIT_CUTOFF_DATE = df['timestamp'].max() - SEVEN_DAYS_DELTA

df_train = df[df['timestamp'] < SPLIT_CUTOFF_DATE].copy()
df_test = df[df['timestamp'] >= SPLIT_CUTOFF_DATE].copy()


print(f"Split Cutoff Date: {SPLIT_CUTOFF_DATE}")
print(f"Train sessions: {df_train['session_id'].nunique()} (Rows: {len(df_train)})")
print(f"Test sessions: {df_test['session_id'].nunique()} (Rows: {len(df_test)})")

Split Cutoff Date: 2015-09-11 02:57:40.810000
Train sessions: 193183 (Rows: 834594)
Test sessions: 11074 (Rows: 34755)


In [None]:
# Neo4j connection helper
def get_driver(uri=NEO4J_URI, auth=NEO4J_AUTH):
    return GraphDatabase.driver(uri, auth=auth, encrypted=False)

def create_constraints(driver):
    with driver.session() as session:
        session.execute_write(lambda tx: tx.run("""CREATE CONSTRAINT IF NOT EXISTS FOR (e:Event) REQUIRE e.id IS UNIQUE"""))
        session.execute_write(lambda tx: tx.run("""CREATE CONSTRAINT IF NOT EXISTS FOR (s:Session) REQUIRE s.id IS UNIQUE"""))
    print('Constraints ensured.')

def build_graph_from_df(df, driver, batch_size=BATCH_SIZE):
    # df must contain: session_id, timestamp, step_index, event_id, event_type (optional)
    create_constraints(driver)
    sessions = df['session_id'].drop_duplicates().tolist()
    print('Building nodes for', len(sessions), 'sessions (batched)')
    with driver.session() as session:
        for i in tqdm(range(0, len(sessions), batch_size), desc='sessions'):
            batch = sessions[i:i+batch_size]
            params = {'sids': batch}
            cypher = """UNWIND $sids AS sid
            MERGE (s:Session {id: sid})
            """
            session.run(cypher, **params)
    print('Creating Event nodes and OCCURRED_IN relationships (this may take a while)')
    with driver.session() as session:
        for sid, group in tqdm(df.groupby('session_id'), desc='sessions_events', total=df['session_id'].nunique()):
            g = group.sort_values('step_index')

            start_ts = g['timestamp'].iloc[0].isoformat() if not pd.isnull(g['timestamp'].iloc[0]) else None
            end_ts = g['timestamp'].iloc[-1].isoformat() if not pd.isnull(g['timestamp'].iloc[-1]) else None
            session.run("""MATCH (s:Session {id:$sid}) SET s.start_ts=$start_ts, s.end_ts=$end_ts, s.num_events=$num_events""", sid=sid, start_ts=start_ts, end_ts=end_ts, num_events=len(g))

            for r in g.itertuples():
                eid = str(r.event_id)
                etype = getattr(r, 'event_type', None) if 'event_type' in g.columns else None
                session.run("""MERGE (e:Event {id:$eid})
                ON CREATE SET e.type = $etype, e.global_count = 1
                ON MATCH SET e.global_count = coalesce(e.global_count,0) + 1
                """, eid=eid, etype=etype)
                session.run("""MATCH (e:Event {id:$eid}), (s:Session {id:$sid})
                MERGE (e)-[r:OCCURRED_IN {index:$idx, ts:$ts}]->(s)
                """, eid=eid, sid=sid, idx=int(r.step_index), ts=r.timestamp.isoformat() if not pd.isnull(r.timestamp) else None)
    print('Event nodes and OCCURRED_IN relationships created.')

# --- RE-DEFINE build_next_edges to include time ---

def build_next_edges(df_train, driver, batch_size=2000):
    """
    Calculates item-to-item transitions and creates NEXT relationships,
    now including transition count and average time delta (duration).
    """
    print("Building NEXT relationships with temporal properties...")
    
    # 1. Aggregate transitions and time delta in Pandas
    
    # Identify the next event in the session
    df_transitions = df_train.copy()
    df_transitions['next_itemid'] = df_transitions.groupby('session_id')['event_id'].shift(-1)
    df_transitions['next_timestamp'] = df_transitions.groupby('session_id')['timestamp'].shift(-1)
    
    # Filter out the last event in each session (where next_itemid is NaN)
    df_transitions.dropna(subset=['next_itemid'], inplace=True)
    df_transitions['next_itemid'] = df_transitions['next_itemid'].astype(int)
    
    # Calculate the time delta in seconds (or milliseconds, depending on unit='ms' conversion)
    df_transitions['time_delta_ms'] = (df_transitions['next_timestamp'] - df_transitions['timestamp']).dt.total_seconds() * 1000
    
    # Group and aggregate
    transition_summary = df_transitions.groupby(['event_id', 'next_itemid']).agg(
        count=('session_id', 'size'),
        avg_duration_ms=('time_delta_ms', 'mean')
    ).reset_index()

    # 2. Write transitions to Neo4j
    
    cypher_query = """
    UNWIND $batch AS row
    MATCH (e1:Event {id: toString(row.event_id)})
    MATCH (e2:Event {id: toString(row.next_itemid)})
    MERGE (e1)-[r:NEXT]->(e2)
    ON CREATE SET r.count = row.count, r.avg_duration_ms = row.avg_duration_ms
    ON MATCH SET r.count = r.count + row.count, r.avg_duration_ms = row.avg_duration_ms
    """
    
    transitions = transition_summary.to_dict('records')
    
    print(f"Total unique transitions to process: {len(transitions)}")
    with driver.session() as session:
        for i in tqdm(range(0, len(transitions), batch_size), desc="Writing NEXT edges"):
            batch = transitions[i:i + batch_size]
            session.execute_write(lambda tx: tx.run(cypher_query, batch=batch))



In [12]:
driver = get_driver()
build_graph_from_df(df_train, driver)
build_next_edges(df_train, driver)
driver.close()
print('Graph build functions ready. To run: instantiate driver and call build_graph_from_df_train(...)')

Constraints ensured.
Building nodes for 193183 sessions (batched)


sessions: 100%|██████████| 39/39 [00:00<00:00, 62.84it/s]


Creating Event nodes and OCCURRED_IN relationships (this may take a while)


sessions_events:  41%|████▏     | 79876/193183 [10:56<15:30, 121.71it/s] 


KeyboardInterrupt: 

In [None]:
def recommend_next_transition(driver, last_event_id, topk=10):
    q = """MATCH (e:Event {id:$last})-[r:NEXT]->(cand:Event)
    RETURN cand.id AS id, r.count AS score
    ORDER BY r.count DESC
    LIMIT $k
    """
    with driver.session() as s:
        res = s.run(q, last=last_event_id, k=topk)
        return [(r['id'], r['score']) for r in res]

driver = get_driver()
print(recommend_next_transition(driver, "12345", topk=5))
driver.close()

[('92233', 5)]


In [None]:
#  TEMPORAL TRANSITION MODEL (RECENCY DECAY) 

def recommend_temporal_transition(driver, last_event_id, topk=10):
    """
    Recommends based on a Temporal Transition Score, penalizing long average durations.
    Score: r.count / log(r.avg_duration_ms + 1)
    
    The log-based function prioritizes transitions that happened quickly (low avg_duration_ms).
    """
    query = """
    MATCH (e:Event {id:$last})-[r:NEXT]->(cand:Event)
    WITH cand.id AS id, r.count AS count, r.avg_duration_ms AS duration
    
    
    WITH id, 
         count, 
         duration,
         CASE 
             WHEN duration IS NULL OR duration <= 0 THEN count // Fallback for 0 or null duration
             ELSE count / log(duration + 1)
         END AS temporal_score
         
    RETURN id, temporal_score AS score
    ORDER BY score DESC
    LIMIT $k
    """
    with driver.session() as s:
        res = s.run(query, last=str(last_event_id), k=topk)
        return [(r['id'], r['score']) for r in res]

driver = get_driver()
# Example with the Temporal Model
last_event = '461686' 
print(f"Temporal Model Recs for {last_event}:")
print(recommend_temporal_transition(driver, last_event, topk=5))
driver.close()

Temporal Model Recs for 461686:
[('461686', 139.0786297097485), ('218794', 30.529432322018035), ('171878', 24.171619993465768), ('32581', 11.11591825243792), ('10572', 9.623820130996977)]


In [None]:
# WEIGHTED TEMPORAL TRANSITION MODEL (DIFFERENTIAL ACTION WEIGHTING) 

def recommend_weighted_temporal_transition(driver, last_event_id, topk=10):
    """
    Recommends by combining Temporal Decay and Differential Action Weighting.
    Weights: transaction=5.0, addtocart=3.0, view=1.0 (assumes e.type property exists)
    Score: (r.count * action_weight) / log(r.avg_duration_ms + 1)
    """
    query = """
    MATCH (e:Event {id:$last})-[r:NEXT]->(cand:Event)
    
    // 1. Define the action weight based on the type of the last event (e)
    WITH cand.id AS id, r.count AS count, r.avg_duration_ms AS duration, e.type AS last_type,
         CASE e.type
             WHEN 'transaction' THEN 5.0
             WHEN 'addtocart'   THEN 3.0
             ELSE 1.0 // Default for 'view' or other types
         END AS action_weight
    
    // 2. Calculate the weighted count
    WITH id, duration, (count * action_weight) AS weighted_count, last_type
    
    // 3. Apply the Recency Decay (Temporal Score) logic
    // The final score is the weighted frequency scaled by the recency decay (log(duration + 1))
    WITH id, 
         weighted_count, 
         duration,
         last_type,
         CASE 
             WHEN duration IS NULL OR duration <= 0 THEN weighted_count // Fallback
             ELSE weighted_count / log(duration + 1)
         END AS temporal_score
         
    RETURN id, temporal_score AS score, last_type
    ORDER BY score DESC
    LIMIT $k
    """
    with driver.session() as s:
        res = s.run(query, last=str(last_event_id), k=topk)
        return [(r['id'], r['score']) for r in res]

driver = get_driver()
# Example with the Weighted Temporal Model
last_event = '461686' 
print(f"Weighted Temporal Model Recs for {last_event}:")
print(recommend_weighted_temporal_transition(driver, last_event, topk=5))
driver.close()

In [None]:
def find_a_b_a_patterns(driver, event_A_id, event_B_id, limit=5):
    """
    Finds sessions where event A is followed by event B, which is then 
    followed by event A again (A -> B -> A pattern). Uses the OCCURRED_IN
    relationship's index property to ensure order.
    """
    
    query = """
    MATCH (a1:Event {id: $event_A_id})<-[r_a1:OCCURRED_IN]-(s:Session)
    MATCH (s)<-[r_b:OCCURRED_IN]-(b:Event {id: $event_B_id})
    MATCH (s)<-[r_a2:OCCURRED_IN]-(a2:Event {id: $event_A_id})
    
    // Ensure the sequence is A -> B -> A based on step_index
    WHERE r_a1.index < r_b.index AND r_b.index < r_a2.index
    
    // Aggregate by session and find the first instance of this pattern
    WITH s, min(r_a1.index) AS start_index
    
    RETURN s.id AS session_id, start_index
    ORDER BY start_index DESC
    LIMIT $limit
    """
    with driver.session() as session:
        result = session.run(query, event_A_id=str(event_A_id), 
                                     event_B_id=str(event_B_id), limit=limit)
        return [(r['session_id'], r['start_index']) for r in result]

driver = get_driver()
# Example: Using the two most popular events found in Cell 16: '461686' and '257040'
print("A -> B -> A Pattern (Session ID, Start Step Index):")
print(find_a_b_a_patterns(driver, '461686', '257040', limit=3))
driver.close()

A -> B -> A Pattern (Session ID, Start Step Index):
[]


In [None]:
def reciprocal_rank(true_id, ranked_list):
    try:
        pos = ranked_list.index(true_id)
        return 1.0 / (pos + 1)
    except ValueError:
        return 0.0

def evaluate_mrr(df_eval, driver, recommend_func, topk=50):
    """
    Evaluates MRR for a given recommendation function on a test DataFrame.
    The function is updated to pass the session history for multi-step models.
    """
    rr_scores = []
    
    for sid, group in tqdm(df_eval.groupby('session_id'), desc='Evaluating sessions', total=df_eval['session_id'].nunique()):
        g = group.sort_values('step_index')
        events = list(g['event_id'].astype(str))
        
        # Iterate over all transitions (event 'a' followed by event 'b')
        for i, b in enumerate(events[1:]):
            
            # The current event 'a' is at index i, history is events up to and including 'a'
            current_history = events[:i+1] 
            last_event_id = current_history[-1]
            
            if recommend_func.__name__ == 'recommend_next_transition':
                recs = recommend_func(driver, last_event_id, topk=topk)
            elif recommend_func.__name__ == 'recommend_next_popularity':
                # Popularity ignores the history
                recs = recommend_func(driver, None, topk=topk)
            else:
                 # Pass the *current history* for advanced models like multi-hop
                recs = recommend_func(driver, current_history, topk=topk) 
                
            ranked_ids = [r[0] for r in recs]
            rr_scores.append(reciprocal_rank(str(b), ranked_ids))
            
    return np.mean(rr_scores) if rr_scores else 0.0

# Re-evaluate the Transition Model with the updated function signature
driver = get_driver()
print('Starting MRR evaluation on TEST data for Simple Transition Model...')
mrr_transition = evaluate_mrr(df_test, driver, recommend_next_transition, topk=50)
print('MRR (Simple Transition, Top 50) on TEST data:', mrr_transition)
driver.close()

Starting MRR evaluation on TEST data for Simple Transition Model...


Evaluating sessions: 100%|██████████| 11074/11074 [00:55<00:00, 198.28it/s]

MRR (Simple Transition, Top 50) on TEST data: 0.4837387297407908





### implemeting and evaluating comparison methods

In [None]:
def recommend_next_popularity(driver, last_event_id, topk=50):
    """
    Recommends the top K most globally frequent events.
    This serves as a non-personalized baseline.
    The 'last_event_id' argument is ignored for this baseline.
    """
    query = """
    MATCH (e:Event)
    RETURN e.id AS event_id, e.global_count AS score
    ORDER BY score DESC
    LIMIT $topk
    """
    with driver.session() as session:
        result = session.run(query, topk=topk)
        return [(r['event_id'], r['score']) for r in result]

driver = get_driver()
top_items = recommend_next_popularity(driver, '12345', topk=5)
print("Popularity Baseline Example:", top_items)
driver.close()

Popularity Baseline Example: [('461686', 6203), ('257040', 3646), ('309778', 3426), ('219512', 3253), ('320130', 2924)]


In [None]:
def recommend_multi_hop_co_occurrence(driver, event_history_ids, topk=10, max_depth=3):
    """
    Recommends events based on their overall transition popularity 
    from *any* event in the recent history (up to max_depth).
    This leverages the global graph structure for contextual insight.
    """
    if not event_history_ids:
        return recommend_next_popularity(driver, None, topk=topk) # Fallback to popularity
        
    # Take only the N most recent events from the history
    recent_history = event_history_ids[-max_depth:] 
    
    # Query: Find candidates connected via :NEXT to *any* event in the recent history
    query = """
    UNWIND $recent_ids AS history_event_id
    MATCH (e_hist:Event {id: history_event_id})-[r:NEXT]->(candidate:Event)
    
    // Exclude events already in the session history
    WHERE NOT candidate.id IN $all_history_ids
    
    WITH candidate, sum(r.count) AS aggregated_score
    RETURN candidate.id AS id, aggregated_score AS score
    ORDER BY score DESC
    LIMIT $topk
    """
    
    with driver.session() as session:
        # Pass the entire history to the query for exclusion purposes
        result = session.run(query, recent_ids=[str(e) for e in recent_history], 
                                     all_history_ids=[str(e) for e in event_history_ids],
                                     topk=topk)
        return [(r['id'], r['score']) for r in result]

driver = get_driver()
# Example: Simulate a history of 3 events (e.g., [123, 456, 789])
test_history = ['461686', '257040', '309778'] 
print(f"Multi-Hop Co-Occurrence Example for history {test_history}:")
print(recommend_multi_hop_co_occurrence(driver, test_history, topk=5))
driver.close()

Multi-Hop Co-Occurrence Example for history ['461686', '257040', '309778']:
[('218794', 545), ('171878', 387), ('32581', 186), ('10572', 160), ('360487', 160)]


In [None]:
# Comprehensive Evaluation on df_test 
driver = get_driver()
topk_limit = 50

# 1. Evaluating Popularity Baseline
print("1. Evaluating Popularity Baseline...")
mrr_popularity = evaluate_mrr(df_test, driver, recommend_next_popularity, topk=topk_limit)
print(f"MRR (Popularity Baseline, Top {topk_limit}): {mrr_popularity:.5f}")


# 2. Simple Transition Model 
print("\n2. Evaluating Simple Transition Model (1-Hop)...")
mrr_transition = evaluate_mrr(df_test, driver, recommend_next_transition, topk=topk_limit)
print(f"MRR (Simple Transition, Top {topk_limit}): {mrr_transition:.5f}")


# 3. Multi-Hop Co-Occurrence Model 
print("\n3. Evaluating Multi-Hop Co-Occurrence Model...")
mrr_multi_hop = evaluate_mrr(df_test, driver, recommend_multi_hop_co_occurrence, topk=topk_limit)
print(f"MRR (Multi-Hop Co-Occurrence, Top {topk_limit}): {mrr_multi_hop:.5f}")


# 4. Temporal Transition Model (Recency Decay) - Unweighted
print("\n4. Evaluating Temporal Transition Model (Recency Decay - Unweighted)...")
mrr_temporal = evaluate_mrr(df_test, driver, recommend_temporal_transition, topk=topk_limit)
print(f"MRR (Temporal Transition, Top {topk_limit}): {mrr_temporal:.5f}")


# 5. Weighted Temporal Transition Model
print("\n5. NEW: Evaluating Weighted Temporal Transition Model (Intent + Recency)...")
mrr_weighted_temporal = evaluate_mrr(df_test, driver, recommend_weighted_temporal_transition, topk=topk_limit)
print(f"MRR (Weighted Temporal Transition, Top {topk_limit}): {mrr_weighted_temporal:.5f}")

driver.close()

# Summary Report
print("\n--- Final MRR Comparison (on TEST Data) ---")
print(f"| Model | MRR@{topk_limit} |")
print("| :--- | :--- |")
print(f"| Popularity Baseline | {mrr_popularity:.5f} |")
print(f"| Simple Transition | {mrr_transition:.5f} |")
print(f"| Multi-Hop Co-Occurrence | {mrr_multi_hop:.5f} |")
print(f"| Temporal Transition (Recency Decay) | {mrr_temporal:.5f} |")
print(f"| **Weighted Temporal (Intent + Recency)** | **{mrr_weighted_temporal:.5f}** |")

1. Evaluating Popularity Baseline...


Evaluating sessions: 100%|██████████| 11074/11074 [11:05<00:00, 16.63it/s] 


MRR (Popularity Baseline, Top 50): 0.00359

2. Evaluating Simple Transition Model (1-Hop)...


Evaluating sessions: 100%|██████████| 11074/11074 [00:20<00:00, 535.68it/s]


MRR (Simple Transition, Top 50): 0.48374

3. Evaluating Multi-Hop Co-Occurrence Model...


Evaluating sessions: 100%|██████████| 11074/11074 [00:41<00:00, 269.53it/s]


MRR (Multi-Hop Co-Occurrence, Top 50): 0.15834

4. New: Evaluating Temporal Transition Model (Recency Decay)...


Evaluating sessions: 100%|██████████| 11074/11074 [00:14<00:00, 789.31it/s]

MRR (Temporal Transition, Top 50): 0.00000

--- Final MRR Comparison (on TEST Data) ---
| Model | MRR@50 |
| :--- | :--- |
| Popularity Baseline | 0.00359 |
| Simple Transition | 0.48374 |
| Multi-Hop Co-Occurrence | 0.15834 |
| **Temporal Transition (Recency Decay)** | **0.00000** |





### dashboard implementation

In [None]:
# --- 1. SETUP AND DATA EXTRACTION FOR DASHBOARD ---

import plotly.express as px
import pandas as pd
from collections import defaultdict
print("Plotly, Pandas, and defaultdict are available.")


# --- Helper Function to Run Cypher Queries ---
def run_cypher_query(driver, query):
    with driver.session() as session:
        result = session.run(query)
        # Convert results to a list of dictionaries for easy plotting
        return [r.data() for r in result]

driver = get_driver()

# --- Data Extraction Queries (Define the Variables) ---

# 1. Top 10 Most Popular Events
print("\n--- Extracting Top 10 Most Popular Events ---")
query_top_events = """
MATCH (e:Event)
RETURN e.id AS eventId, e.global_count AS count
ORDER BY count DESC
LIMIT 10
"""
top_events_data = run_cypher_query(driver, query_top_events)
print(f"Extracted {len(top_events_data)} top events.")


# 2. Average Duration of Transitions (Distribution)
print("\n--- Extracting Average Transition Duration Distribution ---")
query_avg_duration_distribution = """
MATCH ()-[r:NEXT]->()
RETURN 
    CASE 
        WHEN r.avg_duration_ms < 1000 THEN 'A) < 1 sec'
        WHEN r.avg_duration_ms < 10000 THEN 'B) 1-10 sec'
        WHEN r.avg_duration_ms < 60000 THEN 'C) 10-60 sec'
        ELSE 'D) > 60 sec'
    END AS duration_bin, 
    count(r) AS num_transitions
ORDER BY duration_bin
"""
duration_data = run_cypher_query(driver, query_avg_duration_distribution)
print(f"Extracted {len(duration_data)} duration bins.")


# 3. Event Type Popularity
print("\n--- Extracting Event Type Popularity ---")
query_event_type_counts = """
MATCH (e:Event)
WHERE e.type IS NOT NULL
RETURN e.type AS eventType, sum(e.global_count) AS totalCount
ORDER BY totalCount DESC
"""
event_type_data = run_cypher_query(driver, query_event_type_counts)
print(f"Extracted {len(event_type_data)} event types.")

driver.close()

# --- 2. VISUALIZATIONS ---

df_top_events = pd.DataFrame(top_events_data)
df_duration = pd.DataFrame(duration_data)
df_event_type = pd.DataFrame(event_type_data)


# 1. Top 10 Most Popular Events (Bar Chart)
print("\nGenerating Top 10 Events Bar Chart...")
fig_events = px.bar(
    df_top_events, 
    x='eventId', 
    y='count', 
    title='Top 10 Most Popular Events (Global Count)',
    labels={'eventId': 'Event ID', 'count': 'Total Global Count'},
    color='count',
    color_continuous_scale=px.colors.sequential.Viridis
)
fig_events.show()


# 2. Average Transition Duration Distribution (Bar Chart)
print("\nGenerating Transition Duration Distribution...")
fig_duration = px.bar(
    df_duration, 
    x='duration_bin', 
    y='num_transitions', 
    title='Distribution of NEXT Transition Durations',
    labels={'duration_bin': 'Duration Bin', 'num_transitions': 'Number of Transitions'},
    color='num_transitions',
    category_orders={"duration_bin": ["A) < 1 sec", "B) 1-10 sec", "C) 10-60 sec", "D) > 60 sec"]}
)
fig_duration.show()


# 3. Event Type Popularity (Pie Chart)
print("\nGenerating Event Type Popularity Pie Chart...")
fig_type = px.pie(
    df_event_type, 
    values='totalCount', 
    names='eventType', 
    title='Distribution of Total Event Counts by Event Type',
    hole=.3
)
fig_type.update_traces(textposition='inside', textinfo='percent+label')
fig_type.show()


# 4. Model Performance Comparison (Bar Chart)
df_mrr = pd.DataFrame({
    'Model': ['Popularity', 'Simple Transition', 'Multi-Hop', 'Temporal Decay'],
    'MRR': [mrr_popularity, mrr_transition, mrr_multi_hop, mrr_temporal]
})
print("\nGenerating Model Performance Comparison...")
fig_mrr = px.bar(
    df_mrr,
    x='Model',
    y='MRR',
    title=f'Model MRR@{topk_limit} Comparison',
    color='MRR',
    color_continuous_scale=px.colors.sequential.Plasma
)
fig_mrr.show()

Plotly, Pandas, and defaultdict are available.

--- Extracting Top 10 Most Popular Events ---
Extracted 10 top events.

--- Extracting Average Transition Duration Distribution ---
Extracted 4 duration bins.

--- Extracting Event Type Popularity ---
Extracted 3 event types.

Generating Top 10 Events Bar Chart...


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed