## IMPORTS

In [6]:
# Recommended imports for log analysis and data exploration

import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import confusion_matrix, cohen_kappa_score

In [7]:
# Load all CSV files from log data with simpler names
print("🔍 Loading CSV files from log data...")

# Define base path to logs
logs_path = "data/logs"


def load_log_csvs(subdir, pattern, name, error_label, missing_label, base_dir=None):
    """Load every CSV matching ``pattern`` in one log sub-directory into a single DataFrame.

    Parameters
    ----------
    subdir : str
        Sub-directory of the logs folder to search (e.g. ``"sessions"``).
    pattern : str
        Glob pattern applied to file names inside ``subdir``.
    name : str
        Dataset name shown in the success message.
    error_label : str
        Wording inserted into the "Error loading ..." message.
    missing_label : str
        Wording inserted into the "No ... files found" message.
    base_dir : str, optional
        Root logs directory. Defaults to the module-level ``logs_path``.

    Returns
    -------
    pandas.DataFrame or None
        All matching files concatenated with a fresh 0..n-1 index, or ``None``
        when no file matches or when reading/concatenating fails.
    """
    root = logs_path if base_dir is None else base_dir
    try:
        # glob makes no ordering guarantee; sorting keeps the concatenated row
        # order reproducible across machines and re-runs.
        files = sorted(glob.glob(os.path.join(root, subdir, pattern)))
        if not files:
            print(f"❌ No {missing_label} files found")
            return None
        frame = pd.concat([pd.read_csv(path) for path in files], ignore_index=True)
        print(f"✅ Loaded {name}: {len(frame)} rows from {len(files)} files")
        return frame
    except Exception as e:
        # Deliberately best-effort: one unreadable dataset is reported and left
        # as None so the remaining datasets can still be loaded.
        print(f"❌ Error loading {error_label}: {e}")
        return None


# Each dataset is loaded independently, so a failure in one (including one of
# the three evaluation log types) no longer prevents the others from loading.
session_logs = load_log_csvs("sessions", "*.csv", "session_logs", "session logs", "session log")
rep_logs = load_log_csvs("reps", "*.csv", "rep_logs", "rep logs", "rep log")
biomech_logs = load_log_csvs("biomechanics", "*.csv", "biomech_logs", "biomech logs", "biomechanics log")

# Evaluation logs: three file families share the "evaluation" directory
eval_frames = load_log_csvs("evaluation", "evaluation_frames_*.csv", "eval_frames",
                            "evaluation frame logs", "evaluation frame")
eval_reps = load_log_csvs("evaluation", "evaluation_reps_*.csv", "eval_reps",
                          "evaluation rep logs", "evaluation rep")
eval_cues = load_log_csvs("evaluation", "evaluation_cues_*.csv", "eval_cues",
                          "evaluation cue logs", "evaluation cue")

ml_training = load_log_csvs("ml_training", "*.csv", "ml_training", "ML training logs", "ML training log")

# Summary
print("\n📊 LOADED DATASETS SUMMARY:")
datasets = {
    'session_logs': session_logs,
    'rep_logs': rep_logs,
    'biomech_logs': biomech_logs,
    'eval_frames': eval_frames,
    'eval_reps': eval_reps,
    'eval_cues': eval_cues,
    'ml_training': ml_training
}

for name, df in datasets.items():
    if df is not None:
        print(f"   {name}: {len(df)} rows, {len(df.columns)} columns")
    else:
        print(f"   {name}: Not loaded")

🔍 Loading CSV files from log data...
✅ Loaded session_logs: 29 rows from 1 files
✅ Loaded rep_logs: 271 rows from 1 files
✅ Loaded biomech_logs: 16064 rows from 1 files
✅ Loaded eval_frames: 16064 rows from 1 files
✅ Loaded eval_reps: 242 rows from 1 files
✅ Loaded eval_cues: 399 rows from 1 files
✅ Loaded ml_training: 16064 rows from 1 files

📊 LOADED DATASETS SUMMARY:
   session_logs: 29 rows, 26 columns
   rep_logs: 271 rows, 52 columns
   biomech_logs: 16064 rows, 31 columns
   eval_frames: 16064 rows, 16 columns
   eval_reps: 242 rows, 18 columns
   eval_cues: 399 rows, 10 columns
   ml_training: 16064 rows, 50 columns


In [9]:
# Step 3: Examine the structure of each loaded dataset
print("🔍 EXAMINING DATASET STRUCTURES")
print("=" * 50)

# Keep only the (name, DataFrame) pairs whose DataFrame is not None
loaded_datasets = [entry for entry in datasets.items() if entry[1] is not None]

if loaded_datasets:
    for name, df in loaded_datasets:
        # Header: dataset name, shape and the full column list
        print(f"\n📊 {name.upper()}:")
        print(f"   Shape: {df.shape} (rows, columns)")
        print(f"   Columns: {list(df.columns)}")

        # Two-row preview, rendered with at most 8 columns
        print("   Sample data:")
        preview = df.head(2)
        print(preview.to_string(max_cols=8))

        # Per-column null counts; only columns with at least one null are listed
        missing_data = df.isna().sum()
        if not missing_data.any():
            print("   Missing values: None")
        else:
            print(f"   Missing values: {missing_data[missing_data > 0].to_dict()}")
else:
    print("❌ No datasets were loaded. Check your data/logs directory structure.")

🔍 EXAMINING DATASET STRUCTURES

📊 SESSION_LOGS:
   Shape: (29, 26) (rows, columns)
   Columns: ['session_id', 'user_id', 'timestamp', 'session_start', 'session_end', 'total_duration_seconds', 'total_reps', 'completed_reps', 'failed_reps', 'average_form_score', 'best_form_score', 'worst_form_score', 'total_faults', 'safety_faults', 'form_faults', 'depth_faults', 'user_skill_level', 'difficulty_level', 'difficulty_changes_count', 'exercise_type', 'voice_feedback_enabled', 'session_quality_score', 'improvement_score', 'fatigue_detected', 'session_notes', 'system_version']
   Sample data:
                      session_id user_id     timestamp               session_start  ... improvement_score  fatigue_detected  session_notes  system_version
0  session_20250909_154629_mayoa   mayoa  1.757429e+09  2025-09-09T15:46:29.579022  ...          7.272727             False            NaN             2.0
1  session_20250909_155459_mayob   mayob  1.757430e+09  2025-09-09T15:54:59.356400  ...         -0

In [10]:
# Step 4: Identify what data you actually have for analysis
print("\n🎯 DATA AVAILABILITY ASSESSMENT")
print("=" * 50)

# Evaluation datasets checked below, keyed by dataset name
key_datasets = {
    'eval_frames': eval_frames,
    'eval_reps': eval_reps,
    'eval_cues': eval_cues
}

# One row per check: (flag stored in analysis_possible, dataset key,
# message when the dataset is present, message when it is None)
availability_checks = [
    ('frame_analysis', 'eval_frames',
     "✅ Frame-level data available for temporal analysis",
     "❌ No frame-level evaluation data found"),
    ('rep_analysis', 'eval_reps',
     "✅ Rep-level data available for accuracy analysis",
     "❌ No rep-level evaluation data found"),
    ('cue_analysis', 'eval_cues',
     "✅ Cue data available for feedback effectiveness analysis",
     "❌ No cue data found"),
]

analysis_possible = {}

for flag, dataset_key, found_msg, missing_msg in availability_checks:
    is_available = key_datasets[dataset_key] is not None
    analysis_possible[flag] = is_available
    print(found_msg if is_available else missing_msg)

# Alternative datasets for analysis: either session or rep logs is enough
has_general_data = not (session_logs is None and rep_logs is None)
analysis_possible['general_analysis'] = has_general_data
if has_general_data:
    print("✅ General session/rep data available for basic analysis")
else:
    print("❌ No general session data found")
    


🎯 DATA AVAILABILITY ASSESSMENT
✅ Frame-level data available for temporal analysis
✅ Rep-level data available for accuracy analysis
✅ Cue data available for feedback effectiveness analysis
✅ General session/rep data available for basic analysis
