In [None]:
from azure.identity import DefaultAzureCredential
import dotenv

from repository.conversation_repository import ConversationRepository

dotenv.load_dotenv()

True

In [4]:
from datetime import datetime
from typing import override
import os

from azure.data.tables import TableServiceClient

import json
from dao.chat_table_dao import ChatTableDaoImpl
from entity.table_row_entity import (
    ChatTableRow,
)

# get dir of this file

dir_path = os.getcwd()


class DiskCachingChatTableDaoImpl(ChatTableDaoImpl):
    """
    This class is a ChatTableDaoImpl that caches the results of the `all` method to disk
    in a JSON file. This is useful for development purposes, as it allows us to avoid
    making repeated requests to the Azure Table Storage service.
    """

    def __init__(
        self,
        table_service_client: TableServiceClient,
        # but from dir of this file
        cache_filepath: str = os.path.join(
            dir_path, "./.cache/chat_table_cache.global.json"
        ),
    ):
        super().__init__(table_service_client)
        self.cache: list[ChatTableRow] = []
        self.cache_filepath = cache_filepath

    @override
    def all(self) -> list[ChatTableRow]:
        # try and get pickle
        if len(self.cache) == 0:
            try:
                # create path if not exist
                os.makedirs(os.path.dirname(self.cache_filepath), exist_ok=True)
                with open(self.cache_filepath, "r") as f:
                    json_content = json.load(f)
                    # convert all timestamps to datetime
                    for row in json_content:
                        timestamp_str = row["metadata"]["timestamp"]
                        if timestamp_str is not None:
                            row["metadata"]["timestamp"] = datetime.fromisoformat(
                                timestamp_str
                            )
                    self.cache = json_content
            except FileNotFoundError:
                self.cache = []

        # if cache is still empty, get from super
        if len(self.cache) == 0:
            self.cache = super().all()
            # also save to disk.
            with open(self.cache_filepath, "w") as f:
                # json.dump(self.cache, f)
                json.dump(
                    self.cache,
                    f,
                    default=lambda o: o.isoformat()
                    if isinstance(o, datetime)
                    else o.__dict__,
                )

        return self.cache


In [5]:

# Get conversations
credential = DefaultAzureCredential()
table_service_client = TableServiceClient(endpoint=os.getenv("DATABASE_ENDPOINT") or "", credential=credential)
chat_table_dao = DiskCachingChatTableDaoImpl(table_service_client)
conversation_repo = ConversationRepository(chat_table_dao)
conversations = conversation_repo.list_conversations(log_validation_errors=False)

# Date range
date_ranges = [
    ("Jan 2025", "2025-01-01T00:00:00Z", "2025-01-16T23:59:59Z"),
    ("Dec 2024", "2024-12-01T00:00:00Z", "2024-12-31T23:59:59Z"),
    ("Nov 2024", "2024-11-01T00:00:00Z", "2024-11-30T23:59:59Z"),
    ("Oct 2024", "2024-10-01T00:00:00Z", "2024-10-31T23:59:59Z"),
    ("Sep 2024", "2024-09-01T00:00:00Z", "2024-09-30T23:59:59Z"),
    ("Aug 2024", "2024-08-01T00:00:00Z", "2024-08-31T23:59:59Z"),
    ("Jul 2024", "2024-07-01T00:00:00Z", "2024-07-31T23:59:59Z"),
    ("Jun 2024", "2024-06-01T00:00:00Z", "2024-06-30T23:59:59Z"),
    ("May 2024", "2024-05-01T00:00:00Z", "2024-05-31T23:59:59Z"),
    ("Lifetime", "2024-05-01T00:00:00Z", "2025-01-31T23:59:59Z")
]

headers = ["Month", "Active users", "Total Questions Asked", "Average questions asked per day", "Average questions per user"]
header_format = "{:<15} {:<15} {:<25} {:<35} {:<25}"
row_format = "{:<15} {:<15} {:<25} {:<35} {:<25}"

print(header_format.format(*headers))

for date_range in date_ranges:
    # Active users
    active_users: set[str] = set()
    for conversation in conversations:
        for message in conversation["messages"]:
            owner_id = message["owner_id"]
            if owner_id is not None and date_range[1] <= message["created_at"] <= date_range[2]:
                active_users.add(owner_id)

    active_users_count = len(active_users)

    # Total questions asked
    total_questions_asked = 0
    for conversation in conversations:
        for message in conversation["messages"]:
            if date_range[1] <= message["created_at"] <= date_range[2] and message["sender"] == "user":
                total_questions_asked += 1

    # Average questions asked per day
    start_day = datetime.fromisoformat(date_range[1])
    end_day = datetime.fromisoformat(date_range[2])
    days_in_month = (end_day - start_day).days + 1
    average_questions_per_day = total_questions_asked / days_in_month

    # Average questions per user
    average_questions_per_user = total_questions_asked / active_users_count if active_users_count > 0 else 0

    print(row_format.format(
        date_range[0],
        active_users_count,
        total_questions_asked,
        f"{average_questions_per_day:.2f}",
        f"{average_questions_per_user:.2f}"
    ))

total_question_characters = 0
for conversation in conversations:
    for message in conversation["messages"]:
        if message["sender"] == "user":
            total_question_characters += len(message["content"])

print(f"Total question characters: {total_question_characters}")

FileNotFoundError: [Errno 2] No such file or directory: '/home/monarch/workspace/ssc-assistant/app/api/src/./.cache/chat_table_cache.global.json'

In [6]:
# Statistics by day of week
headers = ["Day of week", "Total Questions Asked", "Average questions asked per day", "Average questions per user"]
header_format = "{:<15} {:<25} {:<35} {:<25}"
row_format = "{:<15} {:<25} {:<35} {:<25}"
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

active_users: set[str] = set()
for conversation in conversations:
    for message in conversation["messages"]:
        owner_id = message["owner_id"]
        if owner_id is not None:
            active_users.add(owner_id)

active_users_count = len(active_users)

print(header_format.format(*headers))

for day in days:
    # Total questions asked
    total_questions_asked = 0
    for conversation in conversations:
        for message in conversation["messages"]:
            if message["sender"] == "user" and datetime.fromisoformat(message["created_at"]).strftime("%A") == day:
                total_questions_asked += 1

    # Average questions asked per day
    total_questions_asked = 0
    for conversation in conversations:
        for message in conversation["messages"]:
            if message["sender"] == "user" and datetime.fromisoformat(message["created_at"]).strftime("%A") == day:
                total_questions_asked += 1

    # Average questions per user
    total_questions_asked = 0
    for conversation in conversations:
        for message in conversation["messages"]:
            if message["sender"] == "user" and datetime.fromisoformat(message["created_at"]).strftime("%A") == day:
                total_questions_asked += 1
    
    average_questions_per_day = total_questions_asked / len(date_ranges)
    average_questions_per_user = total_questions_asked / active_users_count if active_users_count > 0 else 0

    print(row_format.format(
        day,
        total_questions_asked,
        f"{average_questions_per_day:.2f}",
        f"{average_questions_per_user:.2f}"
    ))


Day of week     Total Questions Asked     Average questions asked per day     Average questions per user
Monday          10435                     1043.50                             2.10                     
Tuesday         14513                     1451.30                             2.93                     
Wednesday       14930                     1493.00                             3.01                     
Thursday        14980                     1498.00                             3.02                     
Friday          11885                     1188.50                             2.40                     
Saturday        357                       35.70                               0.07                     
Sunday          332                       33.20                               0.07                     


In [7]:
# Statistics by day of month

headers = ["Day of month", "Total Questions Asked", "Average questions asked per day", "Average questions per user"]
header_format = "{:<15} {:<25} {:<35} {:<25}"
row_format = "{:<15} {:<25} {:<35} {:<25}"

print(header_format.format(*headers))

for day in range(1, 32):
    # Total questions asked
    total_questions_asked = 0
    for conversation in conversations:
        for message in conversation["messages"]:
            if message["sender"] == "user" and datetime.fromisoformat(message["created_at"]).day == day:
                total_questions_asked += 1

    # Average questions asked per day
    total_questions_asked = 0
    for conversation in conversations:
        for message in conversation["messages"]:
            if message["sender"] == "user" and datetime.fromisoformat(message["created_at"]).day == day:
                total_questions_asked += 1

    # Average questions per user
    total_questions_asked = 0
    for conversation in conversations:
        for message in conversation["messages"]:
            if message["sender"] == "user" and datetime.fromisoformat(message["created_at"]).day == day:
                total_questions_asked += 1
    
    average_questions_per_day = total_questions_asked / len(date_ranges)
    average_questions_per_user = total_questions_asked / active_users_count if active_users_count > 0 else 0

    print(row_format.format(
        day,
        total_questions_asked,
        f"{average_questions_per_day:.2f}",
        f"{average_questions_per_user:.2f}"
    ))

Day of month    Total Questions Asked     Average questions asked per day     Average questions per user
1               1199                      119.90                              0.24                     
2               1589                      158.90                              0.32                     
3               2215                      221.50                              0.45                     
4               2171                      217.10                              0.44                     
5               2149                      214.90                              0.43                     
6               2329                      232.90                              0.47                     
7               2002                      200.20                              0.40                     
8               2279                      227.90                              0.46                     
9               2045                      204.50               