In [None]:
import pandas as pd
import numpy as np
import datetime

# Experiment outputs to compare: one JSONL file per model.
paths = [
    "../../experiments/deepseek-r1-distill-llama-70b_01/people_travel_data.jsonl",
    "../../experiments/gpt-oss-120b_01/people_travel_data.jsonl",
    "../../experiments/qwen-3.2-32b_1/people_travel_data.jsonl"
]

# Display name for each entry of `paths`, in the same order.
titles = [
    "DeepSeek",
    "GPT-OSS",
    "Qwen-3.2"
]

# The two lists are paired positionally; a mismatch would silently drop
# a model wherever they are zipped together.
assert len(paths) == len(titles), "paths and titles must have the same length"

# Records with a timestamp later than this instant are discarded.
# pd.Timestamp is used instead of datetime.datetime because this notebook
# also runs `from datetime import datetime`, which rebinds the name
# `datetime` from the module to the class.
TIMESTAMP_CUTOFF = pd.Timestamp("2025-03-25")

data = []

for file_path in paths:
    # The comparison below relies on read_json's default date conversion,
    # which parses columns whose label begins with "timestamp".
    df = pd.read_json(file_path, lines=True)

    print(f"Before: {df.shape[0]} records")

    # .copy() makes the filtered frame independent of the parsed one, so
    # columns can be added to it later without a chained-assignment warning.
    df = df[df['timestamp'] <= TIMESTAMP_CUTOFF].copy()

    print(f"After: {df.shape[0]} records")

    data.append(df)

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import seaborn as sns

def _change_rate_table(change_df, by, count_label):
    """Summarise plan changes per group.

    Groups ``change_df`` by ``by`` and returns a frame with three columns:
    ``changes`` (number of comparisons whose plan changed), ``count_label``
    (number of comparisons) and ``change_rate`` (their ratio as a percentage,
    rounded to two decimals).
    """
    table = change_df.groupby(by)['plan_changed'].agg(['sum', 'count'])
    table.columns = ['changes', count_label]
    table['change_rate'] = (table['changes'] / table[count_label] * 100).round(2)
    return table


def analyze_transit_data(df):
    """Measure how often the chosen plan of an activity changes between records.

    Records are ordered by timestamp within each (person_id, activity_id)
    pair, and every record except the first of a pair is compared with the
    record before it. The comparison is flagged as a change when
    ``plan_code`` differs from the previous, non-missing ``plan_code``.
    Summary tables are printed along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``person_id``, ``activity_id``,
        ``timestamp``, ``plan_code``, ``purpose`` and ``itineraries``
        (each ``itineraries`` value is iterated over, and items equal to
        ``""`` are not counted as options).

        NOTE: ``df`` is modified in place. The columns ``datetime``,
        ``date`` and ``num_options`` are added to it. If ``df`` is a
        filtered slice of another frame, pass ``df.copy()`` to avoid a
        chained-assignment warning.

    Returns
    -------
    dict or None
        ``None`` when no (person, activity) pair has at least two records.
        Otherwise a dict with the keys ``change_df``, ``daily_changes``,
        ``person_changes``, ``activity_changes``, ``purpose_changes``,
        ``daily_avg_change_rate``, ``daily_options_stats`` and
        ``person_activity_counts``.
    """
    # Convert timestamp to datetime (numeric values are read as epoch seconds)
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
    df['date'] = df['datetime'].dt.date

    # Calculate number of itinerary options (non-empty itineraries)
    df['num_options'] = df['itineraries'].apply(
        lambda options: len([item for item in options if item != ""])
    )

    print("Initial data:")
    print(df[['person_id', 'datetime', 'plan_code', 'purpose', 'num_options']].head(10))
    print(f"\nTotal records: {len(df)}")
    print(f"Unique persons: {df['person_id'].nunique()}")
    print(f"Unique activities: {df['activity_id'].nunique()}")

    # Order the records of every activity chronologically.
    activity_keys = ['person_id', 'activity_id']
    df_sorted = df.sort_values(activity_keys + ['timestamp'])

    # groupby ignores rows whose keys are missing; drop them explicitly so
    # the per-group shift/cumcount below cover exactly the grouped rows.
    keyed = df_sorted.dropna(subset=activity_keys).reset_index(drop=True)
    by_activity = keyed.groupby(activity_keys, sort=False)

    # Previous plan within the same activity, and a flag marking every
    # record that has a predecessor (i.e. is not the first of its activity).
    prev_plan = by_activity['plan_code'].shift(1)
    has_previous = by_activity.cumcount() > 0

    comparisons = keyed.assign(
        prev_plan_code=prev_plan,
        plan_changed=(keyed['plan_code'] != prev_plan) & prev_plan.notna(),
    )

    # One row per record-to-record comparison.
    change_df = (
        comparisons.loc[
            has_previous,
            ['person_id', 'activity_id', 'date', 'datetime', 'plan_changed',
             'plan_code', 'prev_plan_code', 'purpose', 'num_options'],
        ]
        .rename(columns={'plan_code': 'current_plan', 'prev_plan_code': 'previous_plan'})
        .reset_index(drop=True)
    )

    # Calculate total number of activities per person
    person_activity_counts = df.groupby('person_id')['activity_id'].nunique().reset_index()
    person_activity_counts.columns = ['person_id', 'total_activities']

    print("\nTotal activities per person:")
    print(person_activity_counts.sort_values('total_activities', ascending=False))

    # Statistics on number of options by date. `total_activities` is the
    # number of records on that date; the column name is kept as-is for
    # compatibility with existing consumers of the returned table.
    daily_options_stats = df.groupby('date')['num_options'].agg(['mean', 'std', 'min', 'max', 'count']).round(2)
    daily_options_stats.columns = ['avg_options', 'std_options', 'min_options', 'max_options', 'total_activities']

    print("\nNumber of itinerary options per activity by date:")
    print(daily_options_stats)

    if change_df.empty:
        print("Not enough data to calculate changes (each activity needs at least 2 records)")
        return

    print("\nPlan_code change data by activity:")
    print(change_df.head())
    # len(change_df) counts every comparison, changed or not.
    print(f"Record-to-record comparisons: {len(change_df)}")
    print(f"Unique activities: {change_df['activity_id'].nunique()}")

    # Calculate change rate by date
    daily_changes = _change_rate_table(change_df, 'date', 'total_activity_changes')

    print("\nChange rate by date:")
    print(daily_changes)

    # Calculate change rate by activity
    activity_changes = _change_rate_table(change_df, ['person_id', 'activity_id'], 'total_records')

    print("\nChange rate by activity (Top 10):")
    print(activity_changes.sort_values('change_rate', ascending=False).head(10))

    # Calculate change rate by person (aggregated from activities)
    person_changes = _change_rate_table(change_df, 'person_id', 'total_activity_records')

    # Merge with activity counts
    person_changes = person_changes.merge(
        person_activity_counts.set_index('person_id'), left_index=True, right_index=True
    )

    print("\nChange rate by person (aggregated):")
    print(person_changes.sort_values('change_rate', ascending=False))

    # Analysis by purpose
    purpose_changes = _change_rate_table(change_df, 'purpose', 'total_records')

    print("\nChange rate by purpose:")
    print(purpose_changes)

    # Change rate of each person on each date ...
    daily_person_changes = _change_rate_table(change_df, ['date', 'person_id'], 'total_records')

    # ... then averaged over all persons for each date
    daily_avg_change_rate = daily_person_changes.groupby('date')['change_rate'].agg(['mean', 'std', 'min', 'max']).round(2)
    daily_avg_change_rate.columns = ['avg_change_rate', 'std_change_rate', 'min_change_rate', 'max_change_rate']

    print("\nAverage change rate by date (by person):")
    print(daily_avg_change_rate)

    return {
        "change_df": change_df,
        "daily_changes": daily_changes,
        "person_changes": person_changes,
        "activity_changes": activity_changes,
        "purpose_changes": purpose_changes,
        "daily_avg_change_rate": daily_avg_change_rate,
        "daily_options_stats": daily_options_stats,
        "person_activity_counts": person_activity_counts
    }

# Run analysis
if __name__ == "__main__":
    results = [analyze_transit_data(df) for df in data]

    # analyze_transit_data returns None for a dataset without repeated
    # activity records; keep `results` positional but skip those entries
    # when plotting instead of failing on subscripting None.
    rate_tables = [
        None if res is None else res['daily_avg_change_rate']
        for res in results
    ]
    available = [table for table in rate_tables if table is not None]

    if not available:
        print("No change-rate results available to plot")
    else:
        daily_avg_change_rate = available[0]

        # Shared x axis: every date seen in any model, in chronological
        # order. Each series is placed at the positions of its own dates,
        # so models covering different dates stay correctly aligned and
        # labelled.
        all_dates = sorted(set().union(*(table.index for table in available)))
        date_position = {date: pos for pos, date in enumerate(all_dates)}
        days = range(len(all_dates))

        markers = ['o', 's', '^', 'D', 'X']
        lines = ['-', '--', '-.', ':']
        colors = sns.color_palette("tab10", len(titles))

        plt.figure(figsize=(12, 6))
        for i, (title, table) in enumerate(zip(titles, rate_tables)):
            if table is None:
                continue
            x_positions = [date_position[date] for date in table.index]
            # Style is indexed by the model's position in `titles`, so a
            # model keeps its colour/marker even if another one is skipped.
            plt.plot(x_positions, table['avg_change_rate'],
                     marker=markers[i % len(markers)], linestyle=lines[i % len(lines)],
                     linewidth=2, markersize=8, label=title, color=colors[i])
        plt.title('Average Change Rate by Date (by Person)')
        plt.xlabel('Date')
        plt.ylabel('Change Rate (%)')
        plt.xticks(days, [str(date) for date in all_dates], rotation=45)
        plt.legend()
        plt.grid(True, alpha=0.3)
        # Keep the rotated date labels inside the figure.
        plt.tight_layout()
