In [None]:
file_name = "../../results/evaluate_50_log0606_00_int.csv"
log_folder = "../../experiments/0606_00_int/"

import pandas as pd
import matplotlib.pyplot as plt
import datetime

# Load the evaluation log and give the columns the names used by the rest of
# the notebook.
df = pd.read_csv(file_name, sep=",")
df.columns = [
    "machine_time", "timestamp", "person_id", "route_type", "moving_id", "late_time"
]
# remove the first row
# NOTE(review): read_csv already uses the file's first line as the header by
# default, so this drops the first *data* row as well - confirm that is wanted.
# .copy() turns the slice into an independent frame, so the column assignments
# below do not trigger chained-assignment warnings.
df = df.iloc[1:].copy()

# name the route_type
# Codes that are missing from this map become NaN in route_type_name.
_route_type_map = {
    0: "T1/Tram",
    1: "Metro",
    3: "Bus",
    6: "Teleo"
}

df["route_type_name"] = df["route_type"].map(_route_type_map)

# Timestamps are interpreted as seconds since the epoch; keep only the rows
# before 2025-05-17. .copy() again, because later cells add columns to df.
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
df = df[df['datetime'] < datetime.datetime(2025, 5, 17)].copy()

df.tail()

In [None]:
# Mode share per timestep
import pandas as pd
import matplotlib.pyplot as plt

df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
# One-day bins. 'D' gives the same bins as '24H' without relying on the
# upper-case 'H' alias, which newer pandas versions deprecate.
df['time_bin'] = df['datetime'].dt.floor('D')
# df = df[df['time_bin'] <= "2025-05-15 00:00:00"]

# Count each person at most once per (day, route type).
dedup_df = df.drop_duplicates(subset=['time_bin', 'person_id', 'route_type_name'])
route_counts = dedup_df.groupby(['time_bin', 'route_type_name'])['person_id'].nunique().reset_index(name='person_count')

# The denominator is the sum of the per-route counts, so a person who used two
# route types on one day contributes twice to it.
# total_persons = dedup_df.groupby('time_bin')['person_id'].nunique().reset_index(name='total_unique_persons')
total_counts = route_counts.groupby('time_bin')['person_count'].sum().reset_index(name='total_count')

result = pd.merge(route_counts, total_counts, on='time_bin')
result['ratio'] = result['person_count'] / result['total_count']

# Drop every row that belongs to the latest time bin.
result = result[result['time_bin'] != result['time_bin'].max()]

# print(result.head())

pivot = result.pivot(index='time_bin', columns='route_type_name', values='ratio').fillna(0)
# Format x-axis labels datetime to %Y-%m-%d (a bar plot prints the index
# values verbatim, so the index itself is formatted).
pivot.index = pivot.index.strftime('%Y-%m-%d')

plt.figure(figsize=(10, 6))
pivot.plot(kind='bar', stacked=False, ax=plt.gca(), colormap='tab20')

plt.title('Mode Share per Timestep')
plt.xlabel('Time')
plt.ylabel('Choice Ratio')
plt.legend(title='Route Type')
plt.grid(axis='y')
# Rotate before tight_layout so the layout accounts for the rotated labels.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
df['date'] = df['datetime'].dt.date

# One row per (date, person) holding the set of moving_ids seen that day,
# ordered so that each person's rows are consecutive and chronological.
person_day_movings = (
    df.groupby(['date', 'person_id'])['moving_id']
    .apply(set)
    .reset_index()
    .sort_values(by=['person_id', 'date'])
)

# For every row, look one row back within the same person: the date and the
# moving_id set of that person's previous record (missing for the first one).
_per_person = person_day_movings.groupby('person_id')
person_day_movings['prev_date'] = _per_person['date'].shift(1)
person_day_movings['prev_moving_id'] = _per_person['moving_id'].shift(1)

def has_common_moving_id(curr, prev):
    """Tell whether ``curr`` and ``prev`` contain exactly the same moving ids.

    Despite the name, this is a set-equality check, not an overlap check.

    Args:
        curr: Iterable of moving ids for the current record.
        prev: Iterable of moving ids for the previous record, or a missing
            value (anything ``pd.isna`` reports as missing).

    Returns:
        False when ``prev`` is missing; otherwise whether both collections
        hold the same distinct ids.
    """
    return False if pd.isna(prev) else set(prev) == set(curr)

def ratio_change_moving_ids(curr, prev):
    """Return the share of moving ids that differ between two records.

    Computed as ``1 - |curr & prev| / max(|curr|, |prev|)``.

    Args:
        curr: Set of moving ids for the current record.
        prev: Set of moving ids for the previous record, or a missing value
            (anything ``pd.isna`` reports as missing).

    Returns:
        0.0 when ``prev`` is missing or when both sets are empty, otherwise a
        float between 0.0 (identical sets) and 1.0 (no id in common).
    """
    if pd.isna(prev):
        return 0.0
    larger = max(len(prev), len(curr))
    # Two empty sets are identical; report "no change" instead of dividing by 0.
    if larger == 0:
        return 0.0
    return 1 - len(curr.intersection(prev)) / larger

person_day_movings['same_trip_as_yesterday'] = person_day_movings.apply(
    lambda row: has_common_moving_id(row['moving_id'], row['prev_moving_id']),
    axis=1
)
person_day_movings['ratio_trip_change'] = person_day_movings.apply(
    lambda row: ratio_change_moving_ids(row['moving_id'], row['prev_moving_id']),
    axis=1
)

# A person's first record has nothing to be compared with. Counting it as a
# "change" would put every person's first day at a 100% change rate, so only
# person-days that have a previous record enter the statistic.
# NOTE(review): "previous" is the person's previous *recorded* date, which is
# not necessarily the calendar day before - confirm that is acceptable.
comparable = person_day_movings[person_day_movings['prev_moving_id'].notna()]

result = comparable.groupby('date').agg(
    total_persons=('person_id', 'nunique'),
    keep_same_trip=('same_trip_as_yesterday', 'sum')
).reset_index()

# Percentage of persons whose moving_id set differs from their previous record.
result['ratio'] = (1 - result['keep_same_trip'] / result['total_persons']) * 100

print(result)

# Draw the graph of ratio
import matplotlib.pyplot as plt
import seaborn as sns

result['date'] = pd.to_datetime(result['date'])

# Saturdays and Sundays (dayofweek 5 and 6), derived from the data instead of a
# hard-coded list of dates.
highlight = result['date'].dt.dayofweek >= 5

sns.set_theme(style="ticks")
plt.figure(figsize=(8, 5))
# plt.plot(result['date'], result['ratio'], marker='o', linestyle='-')
sns.lineplot(data=result, x='date', y='ratio', marker='o', color="#e74c3c")
plt.gca().set_xticks(result['date'])

# Only draw the weekend markers (and their legend entry) when there are any.
if highlight.any():
    plt.scatter(
        result.loc[highlight, 'date'],
        result.loc[highlight, 'ratio'],
        color='blue',
        zorder=10,
        label='Weekends',
    )
    plt.legend()

plt.title('Change Rate in Personal Travel Plan Preferences')
plt.xlabel('Date')
plt.ylabel('Change Rate (%)')
# plt.grid(True)
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Convert the timestamp to a datetime and derive the calendar date
df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
df['date'] = df['datetime'].dt.date

# Per moving_id: the largest late_time and the earliest timestamp, then the
# date of that earliest timestamp is attached as the moving_id's date
moving_late = (
    df.groupby('moving_id')
    .agg(
        max_late_time=('late_time', 'max'),
        first_timestamp=('timestamp', 'min'),
    )
    .reset_index()
    .assign(
        date=lambda frame: pd.to_datetime(frame['first_timestamp'], unit='s').dt.date
    )
)

# Average of the per-moving_id maxima for each date
daily_avg_late = (
    moving_late.groupby('date')['max_late_time']
    .mean()
    .reset_index(name='avg_late_time')
)

# Plot the daily averages against real datetimes
daily_avg_late['date'] = pd.to_datetime(daily_avg_late['date'])

plt.figure(figsize=(6, 5))
plt.plot(daily_avg_late['date'], daily_avg_late['avg_late_time'], marker='o')

plt.title('Average Max Late Time per Day (over trip)')
plt.xlabel('Date')
plt.ylabel('Avg arrival late time (seconds)')
plt.grid(True)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


In [None]:
# Unique moving_id per day
import pandas as pd
import matplotlib.pyplot as plt

df['datetime'] = pd.to_datetime(df['timestamp'], unit='s')
# One-day bins. 'D' gives the same bins as '24H' without relying on the
# upper-case 'H' alias, which newer pandas versions deprecate.
df['time_bin'] = df['datetime'].dt.floor('D')

# Keeps only the first row of each (day, person) pair, so at most one
# moving_id per person and day is counted below.
# NOTE(review): if a person can have several moving_ids on one day, this
# undercounts trips - confirm the dedup on person_id is intended.
dedup_df = df.drop_duplicates(subset=['time_bin', 'person_id'])
moving_counts = dedup_df.groupby(['time_bin'])['moving_id'].nunique().reset_index(name='moving_count')

# Draw the bar chart
plt.figure(figsize=(8, 5))
plt.bar(moving_counts['time_bin'], moving_counts['moving_count'], color='mediumseagreen')
plt.title('Unique Trips per Day')
plt.xlabel('Date')
plt.ylabel('Total')
plt.xticks(rotation=45)
plt.tight_layout()
plt.grid(axis='y')
plt.show()


In [None]:
# Reasons given for travel-plan decisions: setup (NLP resources, LLM stats)
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import json
import seaborn as sns
import spacy
from nltk.corpus import stopwords
import nltk

# Fetch the stop-word corpus used for the token-based similarity below.
nltk.download('stopwords')

nlp = spacy.load("en_core_web_sm")

# One row per logged LLM call; 'content_file' names the chat log of that call.
stats_df = pd.read_csv(os.path.join(log_folder, "llm_stats.csv"), sep=",")

stats_df['datetime'] = pd.to_datetime(stats_df['simulation_time'], unit='s')
# stats_df = stats_df[stats_df['datetime'] < datetime.datetime(2025, 5, 31)]
# One-day bins. 'D' gives the same bins as '24H' without relying on the
# upper-case 'H' alias, which newer pandas versions deprecate.
stats_df['date'] = stats_df['datetime'].dt.floor('D')
# print(stats_df[stats_df['date'] == datetime.datetime(2025, 5, 23)].head())

# Distinct log files, so that each file is parsed only once.
all_files = stats_df['content_file'].unique()
# all_files = [os.path.join(log_folder, "chat_logs", file) for file in all_files]

def get_json_part(text: str) -> "str | None":
    """Return the span of ``text`` from its first "{" to its last "}".

    Args:
        text: Text that may embed a JSON object.

    Returns:
        The substring including both braces, or None when ``text`` has no
        "{", no "}", or its last "}" comes before its first "{".
    """
    start = text.find("{")
    end = text.rfind("}")
    # end < start also covers end == -1, and rejects inputs such as "} {"
    # that would otherwise yield an empty string.
    if start == -1 or end < start:
        return None
    return text[start:end + 1]

def get_reason(file):
    """Extract the stated reason from one travel-plan chat log.

    The log is read from ``<log_folder>/chat_logs/<file>``. The text between
    "Response: " and the next "------" is taken as the response; the reason is
    whatever follows '"reason":' up to the next "}", with double quotes
    removed. If the reason contains "because it", only the text after its
    first occurrence is kept.

    Args:
        file: File name of the chat log inside the ``chat_logs`` folder.

    Returns:
        The reason string, or None when the log is not a "### travel.plan"
        log or cannot be read or parsed (the error is printed).
    """
    try:
        file_path = os.path.join(log_folder, "chat_logs", file)
        with open(file_path, 'r') as f:
            content = f.read()
        if "### travel.plan" not in content:
            return None
        resp = content.split("Response: ")[1].split("------")[0]
        # The chosen plan is not needed here, so a response whose
        # "chosen_plan" cannot be parsed no longer discards a usable reason.
        reason = resp.split('"reason":')[1].replace('"', '').split("}")[0].strip()
        if "because it" in reason:
            reason = reason.split("because it")[1].strip()
        return reason
    except Exception as e:
        print(f"Error processing file {file}: {e}")
        # None (not 0) so that a failed file shows up as a missing value and
        # never as an integer in a column of strings.
        return None
    
def extract_history(file):
    """Pull the travel-history section out of one travel-plan chat log.

    The log is read from ``<log_folder>/chat_logs/<file>``. The section is the
    text after the history marker sentence, cut at the "factors" marker
    sentence when that follows, with surrounding whitespace stripped.

    Args:
        file: File name of the chat log inside the ``chat_logs`` folder.

    Returns:
        The section text, or None when the log is not a "### travel.plan"
        log, lacks the history marker, or cannot be read (the error is
        printed).
    """
    history_marker = "The related travel histories that support your decision are:"
    factors_marker = "You should consider the following factors:"
    try:
        with open(os.path.join(log_folder, "chat_logs", file), 'r') as f:
            content = f.read()
        if "### travel.plan" not in content or history_marker not in content:
            return None
        after_marker = content.split(history_marker)[1]
        return after_marker.split(factors_marker)[0].strip()
    except Exception as e:
        print(f"Error processing file {file}: {e}")
        return None

# Parse every distinct log file once, then spread the result over the rows
# that reference that file.
_m = dict(zip(all_files, map(get_reason, all_files)))
stats_df['reason'] = stats_df['content_file'].map(_m)

_m = dict(zip(all_files, map(extract_history, all_files)))
stats_df['history'] = stats_df['content_file'].map(_m)

# Only rows for which a history section was found are analysed further.
stats_df = stats_df[stats_df['history'].notnull()]

# print(stats_df.head())

# English stop words, removed from both sides before token sets are compared.
stop_word_list = set(stopwords.words('english'))
def jaccard_similarity(text1_list, text2):
    """Return the best token-level Jaccard similarity between texts and ``text2``.

    Every text is lower-cased, tokenised with ``word_tokenize`` and stripped
    of the words in ``stop_word_list``; the similarity of two token sets is
    ``|intersection| / |union|``.

    Args:
        text1_list: Iterable of strings to compare against ``text2``.
        text2: The string every element of ``text1_list`` is compared with.

    Returns:
        The highest similarity found, or 0 when ``text1_list`` is empty. A
        pair whose token sets are both empty scores 0.0.
    """
    text1_list = list(text1_list)
    if not text1_list:
        return 0
    # text2 does not change inside the loop, so tokenise it once.
    tokens2 = set(word_tokenize(text2.lower())) - stop_word_list
    scores = []
    for text1 in text1_list:
        # set1 = set(text1.split())
        # set2 = set(text2.split())
        tokens1 = set(word_tokenize(text1.lower())) - stop_word_list
        union = tokens1 | tokens2
        # Empty lines or stop-word-only texts leave nothing to compare; score
        # them 0.0 instead of dividing by zero.
        scores.append(len(tokens1 & tokens2) / len(union) if union else 0.0)
    return max(scores)

def jaccard_similarity_ner(text1_list, text2):
    """Return the best entity-level Jaccard similarity between texts and ``text2``.

    Each text is run through ``nlp`` and represented by the set of its
    entities' ``.text`` values; the similarity of two such sets is
    ``|intersection| / |union|``.

    Args:
        text1_list: Iterable of strings to compare against ``text2``.
        text2: The string every element of ``text1_list`` is compared with.

    Returns:
        The highest similarity found, or 0 when ``text1_list`` is empty. A
        pair in which neither text has any entity scores 0.0.
    """
    text1_list = list(text1_list)
    if not text1_list:
        return 0
    # text2 does not change inside the loop, so run the pipeline on it once.
    ents2 = {ent.text for ent in nlp(text2).ents}
    scores = []
    for text1 in text1_list:
        ents1 = {ent.text for ent in nlp(text1).ents}
        union = ents1 | ents2
        # Texts without entities leave nothing to compare; score them 0.0
        # instead of dividing by zero.
        scores.append(len(ents1 & ents2) / len(union) if union else 0.0)
    return max(scores)

# stats_df['history_similarity'] = stats_df.apply(
#     lambda row: jaccard_similarity(row['history'].split("\n"), row['reason']),
#     axis=1
# )
# print(stats_df['history_similarity'].describe())

def count_mentioned_terms(text):
    """Count occurrences of "short", "quick" and "fast" in ``text``.

    Matching is case-insensitive and by substring, so "shortest" counts for
    "short" (and "breakfast" counts for "fast").

    Args:
        text: The text to search. Anything that is not a string (None, NaN,
            a number) is treated as containing no terms.

    Returns:
        The total number of non-overlapping occurrences of the three terms.
    """
    if not isinstance(text, str):
        return 0
    lowered = text.lower()
    return sum(lowered.count(term) for term in ("short", "quick", "fast"))


from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from tqdm import tqdm

def calculate_similarities(sentence1_list, sentence2):
    """Return the best BLEU score of ``sentence2`` against each reference.

    Every element of ``sentence1_list`` is used, on its own, as the single
    reference for the candidate ``sentence2``. Both sides are lower-cased and
    tokenised with ``word_tokenize``; scoring uses ``sentence_bleu`` with
    ``SmoothingFunction().method2``.

    Args:
        sentence1_list: Iterable of reference sentences.
        sentence2: The candidate sentence.

    Returns:
        The highest score found, or 0 when ``sentence1_list`` is empty.
    """
    sentence1_list = list(sentence1_list)
    if not sentence1_list:
        return 0
    # The candidate tokens and the smoothing function are the same for every
    # reference, so build them once instead of once per iteration.
    cand = word_tokenize(sentence2.lower())
    smoothie = SmoothingFunction().method2
    scores = [
        sentence_bleu([word_tokenize(sentence1.lower())], cand, smoothing_function=smoothie)
        for sentence1 in sentence1_list
    ]
    return max(scores)

tqdm.pandas()

# Keep 2025-05-12 .. 2025-05-16 only. .copy() makes the result an independent
# frame, so the column added below is not written into a slice.
stats_df = stats_df[
    (stats_df['date'] <= datetime.datetime(2025, 5, 16))
    & (stats_df['date'] >= datetime.datetime(2025, 5, 12))
].copy()

# stats_df['bleu_similarity'] = stats_df.progress_apply(
#     lambda row: calculate_similarities(row['history'].split("\n"), row['reason']),
#     axis=1
# )

# plt.figure(figsize=(8, 5))
# sns.boxplot(x='date', y='bleu_similarity', data=stats_df, showfliers=False, palette='crest')
# plt.title('Jaccard Similarity of Experiences and Reasoning Over Time')
# plt.ylabel('Jaccard Value')
# plt.xlabel('Date')
# plt.xticks(rotation=90)
# plt.tight_layout()
# plt.show()

# A reason that is not a string (for example from a log file that failed to
# parse) cannot be searched for terms; leave such rows out instead of failing
# on them. astype(bool) keeps the mask boolean even when the frame is empty.
has_text_reason = stats_df['reason'].map(lambda r: isinstance(r, str)).astype(bool)
stats_df = stats_df[has_text_reason].copy()

stats_df['num_mentioned_terms'] = stats_df['reason'].progress_apply(count_mentioned_terms)

# Print up to 100 reasons from 2025-05-15 that mention none of the terms.
for text in stats_df[(stats_df['num_mentioned_terms'] == 0) & (stats_df['date'] ==  datetime.datetime(2025, 5, 15))]["reason"].head(100):
    print(text)

# Per date: share of decisions whose reason mentions at least one of the terms.
# Despite the names 'zero_terms_ratio' / 'ratio_zero_terms' (kept so existing
# references keep working), this is the ratio of num_mentioned_terms > 0.
zero_terms_ratio = (
    stats_df.groupby('date')['num_mentioned_terms']
    .agg(lambda counts: (counts > 0).mean())
    .reset_index(name='ratio_zero_terms')
)
# Points on these fixed dates are drawn in red.
# NOTE(review): the label says 'Low ratio', but the selection is by date (all
# of them fall outside the 05-12..05-16 window kept above) - confirm whether a
# threshold on the ratio was intended.
highlight = zero_terms_ratio['date'].isin([
    datetime.datetime(2025, 5, 17),
    datetime.datetime(2025, 5, 18),
    datetime.datetime(2025, 5, 24),
    datetime.datetime(2025, 5, 25),
    datetime.datetime(2025, 5, 31)
])

plt.figure(figsize=(8, 5))
sns.lineplot(x='date', y='ratio_zero_terms', data=zero_terms_ratio, marker='o')
plt.scatter(
    zero_terms_ratio.loc[highlight, 'date'],
    zero_terms_ratio.loc[highlight, 'ratio_zero_terms'],
    color='red',
    zorder=10,
    label='Low ratio'
)
plt.title('Ratio of the number of decisions that rely on the shortest path')
plt.ylabel('Ratio of decisions')
plt.xlabel('Date')
# Ticks come from this plot's own dates (not from a frame left over from
# another cell) and are set before rotating, so the rotation applies to them.
plt.gca().set_xticks(zero_terms_ratio['date'])
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

In [None]:
# People choose suboptimal route over time
import pandas as pd
import matplotlib.pyplot as plt
import os
import json
import seaborn as sns

# Reload the per-call LLM statistics from the experiment folder.
stats_path = os.path.join(log_folder, "llm_stats.csv")
stats_df = pd.read_csv(stats_path, sep=",")
stats_df.head()

# Distinct chat-log file names referenced by the statistics.
all_files = stats_df['content_file'].unique()
# all_files = [os.path.join(log_folder, "chat_logs", file) for file in all_files]

def get_json_part(text: str) -> "str | None":
    """Return the span of ``text`` from its first "{" to its last "}".

    Args:
        text: Text that may embed a JSON object.

    Returns:
        The substring including both braces, or None when ``text`` has no
        "{", no "}", or its last "}" comes before its first "{".
    """
    start = text.find("{")
    end = text.rfind("}")
    # end < start also covers end == -1, and rejects inputs such as "} {"
    # that would otherwise yield an empty string.
    if start == -1 or end < start:
        return None
    return text[start:end + 1]

def check_suboptimal_choice(file):
    """Classify the plan chosen in one travel-plan chat log.

    The log is read from ``<log_folder>/chat_logs/<file>``. The text between
    "Response: " and the next "------" is taken as the response, and the
    integer following '"chosen_plan":' is the choice. Any choice other than
    plan 1 is classified as suboptimal.

    Args:
        file: File name of the chat log inside the ``chat_logs`` folder.

    Returns:
        1 when the chosen plan is not 1, -1 when it is 1, and 0 when the log
        is not a "### travel.plan" log or cannot be read or parsed (the error
        is printed).
    """
    import re  # local import: this cell does not import re itself

    try:
        file_path = os.path.join(log_folder, "chat_logs", file)
        with open(file_path, 'r') as f:
            content = f.read()
        if "### travel.plan" not in content:
            return 0
        resp = content.split("Response: ")[1].split("------")[0]
        # json_part = get_json_part(resp)
        # json_data = json.loads(json_part)
        # Read the integer straight after the key, so that the value is also
        # found when "chosen_plan" is the last key and is followed by "}" or a
        # newline rather than a comma. The lookahead rejects values such as
        # 2.5 or 1abc instead of truncating them.
        match = re.search(r'"chosen_plan"\s*:\s*([-+]?\d+)(?![\w.])', resp)
        if match is None:
            raise ValueError('no integer "chosen_plan" found in the response')
        choice = int(match.group(1))
        return 1 if choice != 1 else -1
    except Exception as e:
        print(f"Error processing file {file}: {e}")
        return 0

# Classify every distinct log file once, then spread the result over the rows
# that reference that file.
_m = {
    f: check_suboptimal_choice(f) for f in all_files
}

stats_df['suboptimal_choice'] = stats_df['content_file'].map(_m)
print(stats_df.head())
# stats_df['suboptimal_choice'].describe()

# Rows classified 0 carry no usable decision and are left out.
# NOTE: this rebinds the notebook-wide name `df` to the LLM statistics, so
# earlier cells that expect the evaluation log in `df` need the first cell to
# be re-run before they are run again.
# .copy() makes the result an independent frame, so the columns added below
# are not written into a slice.
df = stats_df[stats_df['suboptimal_choice'] != 0].copy()
# count number of suboptimal choices and total choices over time windows of 24 hours
df['datetime'] = pd.to_datetime(df['simulation_time'], unit='s')
df = df[df['datetime'] < datetime.datetime(2025, 5, 17)].copy()
# One-day bins. 'D' gives the same bins as '24H' without relying on the
# upper-case 'H' alias, which newer pandas versions deprecate.
df['time_bin'] = df['datetime'].dt.floor('D')

data = df.groupby(['time_bin']).agg(
    suboptimal_count=('suboptimal_choice', lambda x: (x == 1).sum()),
    total_count=('suboptimal_choice', 'count')
).reset_index()

# ratio of suboptimal choices
data['ratio'] = (data['suboptimal_count'] / data['total_count']) * 100

sns.set_theme(style="ticks")
# Draw the graph of ratio
plt.figure(figsize=(8, 5))
# plt.bar(data['time_bin'], data['ratio'], color='skyblue')
sns.barplot(
    data=data,
    x='time_bin',
    y='ratio',
    # palette='mako',
    color="#1f77b4",
    edgecolor='black'
)
plt.title('Percentage of Suboptimal Route Choice by Day')
plt.xlabel('Date')
plt.ylabel('Suboptimal Choice Percentage (%)')
plt.xticks(rotation=90)
plt.tight_layout()
plt.grid(axis='y')
plt.show()