In [26]:
import os
import json
import pandas as pd
from datetime import datetime, timezone

# Exercise Data

In [27]:
# Main Path
folderExercise = "./raw_data/EMA/response/Exercise/"

In [28]:
# List to collect all rows
all_rows = []

# Loop through all files in the Exercise folder
for filename in os.listdir(folderExercise):
    if filename.startswith("Exercise_u") and filename.endswith(".json"):
        user_id = filename.replace("Exercise_", "").replace(".json", "")
        file_path = os.path.join(folderExercise, filename)
        print(f"Processing file: {filename}")

        with open(file_path, "r") as f:
            try:
                entries = json.load(f)
            except Exception as e:
                print(f"Failed to load {filename}: {e}")
                continue

        for entry in entries:
            # Parse values with fallbacks
            exercise = entry.get("exercise")
            walk = entry.get("walk")
            have = entry.get("have")
            schedule = entry.get("schedule")
            location = entry.get("location", None)
            resp_time = entry.get("resp_time", None)

            # Convert timestamp to datetime
            if resp_time:
                dt = datetime.fromtimestamp(resp_time, tz=timezone.utc)

                readable_time = dt.isoformat()
                weekday = dt.strftime("%A")
                hour = dt.hour
            else:
                readable_time = None
                weekday = None
                hour = None

            # Handle location
            lat, lon = None, None
            if location and location != "Unknown" and location != "null":
                try:
                    lat_str, lon_str = location.split(",")
                    lat, lon = float(lat_str), float(lon_str)
                except:
                    pass

            row = {
                "user_id": user_id,
                "resp_time": readable_time,
                "weekday": weekday,
                "hour": hour,
                "exercise": int(exercise) if exercise and exercise != "null" else None,
                "walk": int(walk) if walk and walk != "null" else None,
                "have": int(have) if have and have != "null" else None,
                "schedule": int(schedule) if schedule and schedule != "null" else None,
                "latitude": lat,
                "longitude": lon,
                "has_location": int(location not in ["Unknown", "null", None]),
            }

            all_rows.append(row)

Processing file: Exercise_u00.json
Processing file: Exercise_u57.json
Processing file: Exercise_u41.json
Processing file: Exercise_u16.json
Processing file: Exercise_u36.json
Processing file: Exercise_u20.json
Processing file: Exercise_u17.json
Processing file: Exercise_u56.json
Processing file: Exercise_u01.json
Processing file: Exercise_u30.json
Processing file: Exercise_u51.json
Processing file: Exercise_u47.json
Processing file: Exercise_u10.json
Processing file: Exercise_u46.json
Processing file: Exercise_u50.json
Processing file: Exercise_u07.json
Processing file: Exercise_u27.json
Processing file: Exercise_u31.json
Processing file: Exercise_u49.json
Processing file: Exercise_u08.json
Processing file: Exercise_u24.json
Processing file: Exercise_u32.json
Processing file: Exercise_u12.json
Processing file: Exercise_u45.json
Processing file: Exercise_u53.json
Processing file: Exercise_u04.json
Processing file: Exercise_u05.json
Processing file: Exercise_u52.json
Processing file: Exe

In [29]:
# Create a DataFrame
df_exercise = pd.DataFrame(all_rows)
df_exercise = df_exercise.sort_values(
    by=["user_id", "resp_time"]
)  # sort by user + time
print(df_exercise.columns)

Index(['user_id', 'resp_time', 'weekday', 'hour', 'exercise', 'walk', 'have',
       'schedule', 'latitude', 'longitude', 'has_location'],
      dtype='object')


In [30]:
# Save to CSV
print("Saving data to 'CleanExerciseData.csv'")
df_exercise.to_csv("CleanExerciseData.csv", index=False)
print("Data saved to 'CleanExerciseData.csv'")

Saving data to 'CleanExerciseData.csv'
Data saved to 'CleanExerciseData.csv'


# Exercise Data

In [31]:
# Path to the folder containing Social_u00.json to Social_u59.json
folderExercise = "./raw_data/EMA/response/Social/"

In [32]:
# Store cleaned entries
all_social_data = []

# Loop through all relevant files
for filename in os.listdir(folderExercise):
    if filename.startswith("Social_u") and filename.endswith(".json"):
        user_id = filename.replace("Social_", "").replace(".json", "")
        file_path = os.path.join(folderExercise, filename)

        with open(file_path, "r") as f:
            try:
                entries = json.load(f)
            except Exception as e:
                print(f"Could not load {filename}: {e}")
                continue

        for entry in entries:
            number = entry.get("number")
            location = entry.get("location")
            resp_time = entry.get("resp_time")

            # Convert timestamp to UTC datetime
            try:
                dt = datetime.fromtimestamp(resp_time, tz=timezone.utc)
            except:
                dt = None

            all_social_data.append(
                {
                    "user_id": user_id,
                    "resp_time": resp_time,
                    "datetime_utc": dt,
                    "number": number,
                    "location": location,
                }
            )

In [33]:
# Convert to DataFrame
df_social = pd.DataFrame(all_social_data)
df_social = df_social.sort_values(by=["user_id", "datetime_utc"])  # sort by user + time

In [34]:
# Save to CSV
print("Saving data to 'CleanSocialData.csv'")
df_social.to_csv("CleanSocialData.csv", index=False)
print("Data saved to 'CleanSocialData.csv'")

Saving data to 'CleanSocialData.csv'
Data saved to 'CleanSocialData.csv'


# Sleep

In [35]:
# Folder where Sleep_uXX.json files are located
folderSleep = "./raw_data/EMA/response/Sleep/"

In [36]:
# Collect cleaned records
all_sleep_data = []

# Go through all Sleep_uXX.json files
for filename in os.listdir(folderSleep):
    if filename.startswith("Sleep_u") and filename.endswith(".json"):
        user_id = filename.replace("Sleep_", "").replace(".json", "")
        file_path = os.path.join(folderSleep, filename)

        with open(file_path, "r") as f:
            try:
                entries = json.load(f)
            except Exception as e:
                print(f"Could not load {filename}: {e}")
                continue

        for entry in entries:
            sleep_hours = entry.get("hour")
            sleep_quality = entry.get("rate")
            sleepiness = entry.get("social")  # Trouble staying awake
            location = entry.get("location")
            resp_time = entry.get("resp_time")

            # Convert timestamp to UTC datetime
            try:
                dt = datetime.fromtimestamp(resp_time, tz=timezone.utc)
            except:
                dt = None

            all_sleep_data.append(
                {
                    "user_id": user_id,
                    "resp_time": resp_time,
                    "datetime_utc": dt,
                    "sleep_hours": sleep_hours,
                    "sleep_quality": sleep_quality,
                    "sleepiness": sleepiness,
                    "location": location,
                }
            )

In [37]:
# Convert to DataFrame
df_sleep = pd.DataFrame(all_sleep_data)
df_sleep = df_sleep.sort_values(by=["user_id", "datetime_utc"])  # sort by user + time

In [38]:
# Save to CSV
print("Saving data to 'CleanSleepData.csv'")
df_sleep.to_csv("CleanSleepData.csv", index=False)
print("Data saved to 'CleanSleepData.csv'")

Saving data to 'CleanSleepData.csv'
Data saved to 'CleanSleepData.csv'
