In [1]:
import pandas as pd
import json
from typing import Union, Dict, List
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [2]:
# Read the raw decision-log export. The file is decoded explicitly as UTF-8 so
# the result does not depend on the platform's default text encoding.
with open('data/aura_ca.behavior_decision_log.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
def json_to_dataframe(json_data: Union[str, Dict, List]) -> pd.DataFrame:
    """
    Convert JSON data to a pandas DataFrame.

    Args:
        json_data: JSON data in one of the following formats:
            - JSON string (parsed with ``json.loads`` before conversion)
            - Python dictionary
            - List of dictionaries

    Returns:
        pandas DataFrame built by passing the (parsed) data to ``pd.DataFrame``.

    Raises:
        ValueError: If the input cannot be parsed as JSON or cannot be
            converted to a DataFrame. The underlying exception is attached
            as ``__cause__`` so the root error stays visible in tracebacks.
    """
    try:
        # Only strings need parsing; dicts and lists are handed over as-is.
        parsed = json.loads(json_data) if isinstance(json_data, str) else json_data
        return pd.DataFrame(parsed)
    except Exception as e:
        # Chain explicitly so callers can inspect the original failure.
        raise ValueError(f"Failed to convert JSON to DataFrame: {e}") from e

In [4]:
# Build the working DataFrame from the parsed JSON held in `data`.
df = json_to_dataframe(json_data=data)

In [5]:
# df

In [6]:
# print("columns types")
# print(df.dtypes)
# print(f'\ndf info')
# print(df.info())
# print(f'\nMissing Values')
# print(df.isna().sum())

In [7]:
# Column type conversion: unwrap the {'$oid': ...} / {'$numberLong': ...}
# wrapper dicts (these look like MongoDB Extended JSON -- confirm against the export).
# Values that are not wrapper dicts are passed through unchanged, so the cell
# can be re-run without wiping `_id` or crashing on the int64 cast, and plain
# numeric timestamps are accepted as well.
df['_id'] = df['_id'].apply(lambda x: x['$oid'] if isinstance(x, dict) else x)
df['timestamp'] = df['timestamp'].apply(lambda x: x['$numberLong'] if isinstance(x, dict) else x).astype(np.int64)


In [8]:
# Create the `datetime` column: `timestamp` is interpreted as a count of
# milliseconds (unit='ms'); no timezone argument is passed to the conversion.
df['datetime'] = pd.to_datetime(df['timestamp'], unit='ms')

In [9]:
# Calendar day (timestamp normalized to midnight) of every entry, computed once
# and reused below instead of rebuilding a column of date objects each time.
entry_days = df['datetime'].dt.normalize()

# Number of days between the first and the last calendar day with data.
timespan = (entry_days.max() - entry_days.min()).days
print(f'timespan of the data is {timespan} days')

# The inclusive range first..last day covers `timespan + 1` calendar days, so
# that is the total from which the days that have data are subtracted.
non_active_days = (timespan + 1) - entry_days.nunique()

# A day counts as active when it holds more than 10 entries.
active_days = int((entry_days.value_counts() > 10).sum())
print(f'Active days with over 10 entries per day: {active_days}')

timespan of the data is 91 days
Active days with over 10 entries per day: 19


In [10]:
# Rows per pseudonym, as a two-column frame in value_counts order
pseudonym_data = (
    df['pseudonym']
    .value_counts()
    .rename_axis('pseudonym')
    .reset_index(name='count')
)

# Per-pseudonym first/last datetime plus the sum and non-null count of `result`;
# named aggregation yields the flat column names directly
stats = (
    df.groupby('pseudonym')
    .agg(
        start_time=('datetime', 'min'),
        end_time=('datetime', 'max'),
        true_result=('result', 'sum'),
        total_count=('result', 'count'),
    )
    .reset_index()
)

# Attach the statistics to the counts
pseudonym_data = pseudonym_data.merge(stats, on='pseudonym')

# Derived columns: remaining results, time between first and last entry,
# and the share of `true_result` in percent (two decimals)
pseudonym_data['false_result'] = pseudonym_data['total_count'] - pseudonym_data['true_result']
pseudonym_data['duration'] = pseudonym_data['end_time'] - pseudonym_data['start_time']
pseudonym_data['true_result in %'] = (
    100 * pseudonym_data['true_result'] / pseudonym_data['total_count']
).round(2)

In [11]:
# Add a short id ("pseudonym_<row label>") as the first column. An existing
# `pseudonym_id` column is dropped before inserting, so re-running this cell
# replaces the ids instead of stacking a second `pseudonym_id` column the way
# a repeated in-place reset_index() would.
pseudonym_ids = [f'pseudonym_{label}' for label in pseudonym_data.index]
pseudonym_data = pseudonym_data.drop(columns='pseudonym_id', errors='ignore')
pseudonym_data.insert(0, 'pseudonym_id', pseudonym_ids)

In [12]:
# pseudonym_data.head(5)

In [13]:
# print(f"User '{pseudonym_data['pseudonym'][0]}' (Short ID: {pseudonym_data['pseudonym_id'][0]}) has the highest activity with {pseudonym_data['total_count'][0]} total entries.")
# print(f"The user's participation period extends from {pseudonym_data['start_time'][0].strftime('%Y-%m-%d')} through {pseudonym_data['end_time'][0].strftime('%Y-%m-%d')}.")

In [14]:
# # First, create a complete date range
# date_range = pd.date_range(start=df['datetime'].min().date(), 
#                           end=df['datetime'].max().date())

# # Convert to date for comparison
# date_range = date_range.date

# # Calculate daily counts for all records
# daily_counts = df['datetime'].dt.date.value_counts().reindex(date_range, fill_value=0).sort_index()

# # Calculate true results (result == True), reindexing to include all dates
# daily_true = (df[df['result'] == True]['datetime']
#               .dt.date
#               .value_counts()
#               .reindex(date_range, fill_value=0)
#               .sort_index())

# plt.figure(figsize=(15, 6))
# # Plot total counts
# daily_counts.plot(kind='line', marker='o', label='Total Records')
# # Plot true results
# daily_true.plot(kind='line', label='True Results', color='red')

# plt.xlabel('Date')
# plt.ylabel('Count')
# plt.title('Daily Record Counts')
# plt.xticks(rotation=45)
# plt.gca().xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y-%m-%d'))
# plt.grid(True, alpha=0.3)
# plt.legend()
# plt.tight_layout()
# plt.show()

In [15]:
# fig = px.scatter(df, 
#                  x='datetime', 
#                  y='result',
#                  title='Result vs. Datetime',
#                  labels={
#                      'datetime': 'Datetime',
#                      'result': 'Result'
#                  },
#                  color='result',  # Color points by result
#                  size_max=10)     # Maximum size of points

# # Customize the layout
# fig.update_layout(
#     xaxis_tickangle=-45,
#     showlegend=False,
#     height=300,
#     xaxis=dict(
#         tickformat='%Y-%m-%d',  # Format date display
#         nticks=20              # Number of ticks to display
#     )
# )

# # Customize hover information
# fig.update_traces(
#     hovertemplate="<br>".join([
#         "Date: %{x}",
#         "Result: %{y}"
#     ])
# )

# fig.show()

In [16]:
# # Create subplots
# fig = make_subplots(rows=2, cols=2,
#                    subplot_titles=('Success Rate vs Duration',
#                                  'Attempt Distribution',
#                                  'Success Rate Distribution',
#                                  'Activity Timeline'))

# # Add scatter plot
# fig.add_trace(
#     go.Scatter(x=pseudonym_data['duration'],
#               y=pseudonym_data['true_result in %'],
#               mode='markers',
#               marker=dict(size=pseudonym_data['total_count']/10,
#                          color=pseudonym_data['total_count'],
#                          showscale=True),
#               text=pseudonym_data['pseudonym_id'],
#               hoverinfo='text+x+y',
#               name='Success Rate'),
#     row=1, col=1
# )

# # Add histogram for total attempts
# fig.add_trace(
#     go.Histogram(x=pseudonym_data['total_count'],
#                 name='Attempt Distribution'),
#     row=1, col=2
# )

# # Add histogram for success rate
# fig.add_trace(
#     go.Histogram(x=pseudonym_data['true_result in %'],
#                 name='Success Rate Distribution'),
#     row=2, col=1
# )

# # Add timeline
# fig.add_trace(
#     go.Scatter(x=pseudonym_data['start_time'],
#               # y=pseudonym_data['pseudonym'],
#               mode='markers',
#                marker=dict(size=np.where(pseudonym_data['total_count']/100 < 1, 5, pseudonym_data['total_count']/100),
#                            color=pseudonym_data['total_count'],
#                          showscale=True),
#               name='Start Time',
#                text = pseudonym_data['pseudonym_id'],
#                hoverinfo='text+x'
#               ),
#     row=2, col=2
# )

# # Update layout
# fig.update_layout(height=800, width=1200,
#                  title_text="Pseudonym Analysis Dashboard",
#                  showlegend=False)

# fig.show()

In [17]:
# # Sort the data
# sorted_data = pseudonym_data.sort_values('total_count', ascending=False)

# fig = go.Figure()

# fig.add_trace(go.Bar(
#     x=sorted_data['pseudonym_id'],
#     y=sorted_data['true_result'],
#     name='True Results',
#     marker_color='green'
# ))

# fig.add_trace(go.Bar(
#     x=sorted_data['pseudonym_id'],
#     y=sorted_data['false_result'],
#     name='False Results',
#     marker_color='red'
# ))

# fig.update_layout(
#     barmode='stack',
#     title='True/False Results by Pseudonym (Sorted by Total Count)',
#     xaxis_title='Pseudonym',
#     yaxis_title='Count',
#     xaxis_tickangle=-45,
#     height=600,
#     width=1000
# )

# fig.show()