# Plotting Samsung Health Data

## Setup and Function Definition

The first row always contains a metadata line.
The actual header appears in the next row.

When each data row starts with an empty leading column (a leading comma),
pandas assumes that the first column is the DataFrame index,
shifting all columns left so that values end up under the wrong headers.

Curiously, that happens even if the header has a name for it.

In [None]:
# Use the Matplotlib inline magic command (if not already set)
%matplotlib inline
import os
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
from collections import namedtuple
from tabulate import tabulate

# Directory that is searched for the Samsung Health CSV files
samsung_dump_dir = 'samsunghealth_data'
# glob() returns its matches in arbitrary order; sorting makes every
# "first matching file" lookup reproducible between runs and machines.
samsung_csv_paths = sorted(glob(os.path.join(samsung_dump_dir, '*.csv')))

# Define the namedtuple structure for time-series statistics
TimeSeriesResult = namedtuple('TimeSeriesResult', 'mean max min')

def check_jupyter_environment():
    """
    Report whether the code runs inside an IPython/Jupyter shell.

    Knowing the environment lets the caller adapt output and behavior:
    - Display Formatting: print() for a standard terminal versus
      display() for rich HTML tables/Markdown in Jupyter.
    - Progress Bars: tqdm.notebook (Jupyter) versus tqdm (terminal).
    - Interactive widgets, which only work in notebook environments.
    - File Paths: relative paths may need different handling in a script
      run from a terminal than in a notebook cell.

    Returns:
        bool: True when the active shell class name contains
            'TerminalInteractiveShell' (IPython console) or
            'ZMQInteractiveShell' (Jupyter Notebook/Lab), False otherwise,
            including when get_ipython() is not defined at all.
    """
    recognised_shells = ('TerminalInteractiveShell', 'ZMQInteractiveShell')

    try:
        # get_ipython() only exists when running inside an IPython shell
        shell_name = get_ipython().__class__.__name__  # type: ignore to hush Pylance/Pyright
    except NameError:
        # No get_ipython: most likely the standard Python interpreter
        return False

    return any(known in shell_name for known in recognised_shells)

# Evaluated once; the display helpers branch on this flag
is_running_in_jupyter = check_jupyter_environment()


if is_running_in_jupyter:
    from IPython.display import display, Markdown, HTML
else:
    # Plain-interpreter fallbacks: code in this file calls
    # display(Markdown(...)) unconditionally, which would otherwise raise
    # NameError outside IPython. Here the objects are simply printed as text.
    def display(*objs, **kwargs):
        """Print each object; stands in for IPython.display.display."""
        for obj in objs:
            print(obj)

    # The rich wrappers degrade to the raw Markdown/HTML source text
    Markdown = HTML = str


def sam_readcsv(x):
    """
    Read a Samsung Health CSV file into a pandas DataFrame.

    Args:
        x (str): Path of the CSV file to load.

    Returns:
        pandas.DataFrame: The file content, with the header taken from the
            second line of the file and no column used as the index.
    """
    # `skiprows=1`: ignore the first metadata line.
    # `index_col=False`: ensure the first column is treated as a regular data column.
    return pd.read_csv(x, skiprows=1, index_col=False)


def get_samsung_csv_path(sample_base_name):
    """
    Look up the full path of a Samsung Health log file by its base name.

    Scans `samsung_csv_paths` in order and picks the first entry whose
    file name (directory part excluded) contains `sample_base_name`.
    Matching on a substring makes it possible to select a log type
    (e.g., 'heart_rate') regardless of the datetime stamp suffix of the file.

    Args:
        sample_base_name (str): Text that must occur in the file name.

    Returns:
        str: The full path to the first matching file, or '' if none matches.
    """
    matching_paths = (
        candidate_path
        for candidate_path in samsung_csv_paths
        if sample_base_name in os.path.basename(candidate_path)
    )
    # next() yields the first hit, or the empty-string sentinel when there is none
    return next(matching_paths, '')


def csv_to_time_series_df(csv_file_path, timestamp_col_name):
    """
    Load a CSV file as a DataFrame indexed by one of its timestamp columns.

    The file is read with the 'sam_readcsv' loader, the named column is
    parsed with pandas.to_datetime, and that column then becomes the index
    of the returned DataFrame.

    Args:
        csv_file_path (str): The full path to the CSV file to be loaded.
        timestamp_col_name (str): The exact name of the column containing
            the timestamp values.

    Returns:
        pandas.DataFrame: The loaded data, indexed by the parsed timestamp
            column.
    """
    raw_df = sam_readcsv(csv_file_path)

    # Replace the textual timestamps with parsed datetime values
    parsed_timestamps = pd.to_datetime(raw_df[timestamp_col_name])
    raw_df[timestamp_col_name] = parsed_timestamps

    # Promote the timestamp column to the index
    indexed_df = raw_df.set_index(timestamp_col_name)
    return indexed_df


def get_time_series_stat(time_series_df, column_name, start_time, end_time):
    """
    Summarize one column of a time-series DataFrame over a time interval.

    Rows are kept when their index value lies between `start_time` and
    `end_time`, both bounds included. The DataFrame is expected to have a
    DatetimeIndex.

    Returns:
        TimeSeriesResult: mean, max and min of `column_name` in the interval.
    """
    timestamps = time_series_df.index

    # Boolean mask of the rows inside the closed interval
    inside_interval = (timestamps >= start_time) & (timestamps <= end_time)

    # Keep the matching rows, then pick the requested column
    selected_values = time_series_df[inside_interval][column_name]

    return TimeSeriesResult(
        mean=selected_values.mean(),
        max=selected_values.max(),
        min=selected_values.min(),
    )


def display_stat(time_series_df, column_name, start_time, end_time):
    """
    Show the min/max/mean of one column over a time interval as a small table.

    A heading is emitted first (rendered as Markdown inside Jupyter, printed
    as plain text otherwise), followed by a fixed-width text table.

    Args:
        time_series_df (pd.DataFrame): Time-series data passed on to
            get_time_series_stat.
        column_name (str): Column to analyze. Only the part after the last
            dot is shown in the heading.
        start_time: Start of the interval; anything pandas.to_datetime accepts.
        end_time: End of the interval; anything pandas.to_datetime accepts.
    """
    # Ensure the times are converted to the correct pandas Timestamp objects
    start_timestamp = pd.to_datetime(start_time)
    end_timestamp = pd.to_datetime(end_time)

    # Format date to European (DD/MM/YYYY)
    start_date = start_timestamp.strftime("%d/%m/%Y")
    end_date = end_timestamp.strftime("%d/%m/%Y")

    heading = f'--- Statistics for **{column_name.split(".")[-1].strip()}** from {start_date} to {end_date} ---'
    if is_running_in_jupyter:
        display(Markdown(heading))
    else:
        # display/Markdown are only imported inside IPython/Jupyter
        print(heading)

    statistics = get_time_series_stat(time_series_df, column_name, start_timestamp, end_timestamp)

    # Printing Statistics as a Formatted Table
    # Separator length is according to f-strings spacing and decimal precision
    table_line_separator = "-" * (5 + 3 + 5 + 3 + 6)
    # Print the table header
    print(f'{"Min":<5} | {"Max":<5} | {"Mean":<6}')
    print(table_line_separator)
    # Print the row
    print(f'{statistics.min:<5.0f} | {statistics.max:<5.0f} | {statistics.mean:<6.0f}')
    print(table_line_separator)


def display_statistics(data_frame):
    """
    Display a DataFrame as a formatted table, without its index.

    Inside Jupyter/IPython the table is rendered as HTML; in a plain console
    it is printed as a text grid produced by tabulate.

    Args:
        data_frame (pd.DataFrame): The DataFrame to be displayed.
    """
    if not is_running_in_jupyter:
        # --- STANDARD CONSOLE OUTPUT (Plain Text) ---
        # headers='keys'  -> column names become the header row
        # tablefmt='grid' -> boxed ASCII table
        # showindex=False -> leave out the pandas index
        grid_text = tabulate(
            data_frame,
            headers='keys',
            tablefmt='grid',
            showindex=False,
        )
        print(grid_text)
        return

    # --- JUPYTER/IPYTHON OUTPUT (Rich HTML) ---
    # Converting with index=False hides the row numbers; passing the
    # DataFrame straight to display() would show them as well.
    display(HTML(data_frame.to_html(index=False)))
    

def _round_stat(value):
    """Round a statistic to the nearest integer, or return 'n/a' when it is missing (NaN)."""
    return 'n/a' if pd.isna(value) else round(value)


def display_stat_table(time_series_df, column_name, period_list):
    """
    Display a summary table with one row of statistics per observation period.

    Args:
        time_series_df (pd.DataFrame): Time-series data passed on to
            get_time_series_stat.
        column_name (str): Column to analyze.
        period_list (list): Periods as (start, end) pairs; each bound may be
            anything pandas.to_datetime accepts.

    The table has the columns 'Observation Period', 'Minimum', 'Maximum' and
    'Average'. Values are rounded to integers; a period without any data is
    reported as 'n/a' instead of raising on the NaN statistics.
    """
    rows = []
    for period in period_list:
        # Ensure the times are converted to the correct pandas Timestamp objects
        start_timestamp = pd.to_datetime(period[0])
        end_timestamp = pd.to_datetime(period[1])

        statistics = get_time_series_stat(time_series_df, column_name, start_timestamp, end_timestamp)

        # Format date to European (DD/MM/YYYY)
        start_date = start_timestamp.strftime("%d/%m/%Y")
        end_date = end_timestamp.strftime("%d/%m/%Y")

        rows.append([
            f'{start_date} - {end_date}',
            _round_stat(statistics.min),
            _round_stat(statistics.max),
            _round_stat(statistics.mean),
        ])

    # Build the table in one step rather than growing it row by row
    statistics_df = pd.DataFrame(
        rows,
        columns=['Observation Period', 'Minimum', 'Maximum', 'Average'],
    )
    display_statistics(statistics_df)
    

def samsung_plot(time_series_df, size_tuple, banner, y_axis_name, column_names_list):
    """
    Plot the selected columns of a time-series DataFrame on one set of axes.

    The DataFrame index is used for the x-axis and every column in
    `column_names_list` is drawn as a separate line.

    Args:
        time_series_df (pd.DataFrame): Data to plot; the index provides the
            x-axis values.
        size_tuple (tuple): Figure size, passed to matplotlib as `figsize`.
        banner (str): Plot title.
        y_axis_name (str): Label of the y-axis.
        column_names_list (list): Names of the columns to plot. Only the part
            after the last dot of each name is shown in the legend.
    """
    # Create a single figure/axes pair and let pandas draw onto it.
    # Calling plt.figure() and then DataFrame.plot() without `ax` would
    # create two figures and leave the first one empty.
    _, ax = plt.subplots(figsize=size_tuple)

    # Plot the selected columns
    time_series_df[column_names_list].plot(ax=ax, title=banner, grid=True)

    # Rename the legend labels for clarity: if a column name consists of
    # dot-separated components, only the last one is used.
    ax.legend([column_name.split(".")[-1].strip() for column_name in column_names_list])

    # Label the axes
    ax.set_xlabel("Time")
    ax.set_ylabel(y_axis_name)

    # Display the plot
    plt.show()


## Pandas `strftime` issue using Pylance/Pyright

Pylance is the default language server for Python in VS Code (powered by Microsoft's Pyright type checker).

In the following code snippets, you can find patterns like this:
```Python
# Convert the time column to datetime objects
df[time_column_name] = pd.to_datetime(df[time_column_name])

# Set the datetime column as the DataFrame index
df = df.set_index(time_column_name)

# Format Datetime to European (DD/MM/YYYY HH.MM.SS)
df[time_column_name] = df.index.strftime('%d/%m/%Y %H.%M.%S')
```
In the last statement above, Pyright flags `strftime` as an unknown attribute even though the code executes successfully.

### Why Pylance/Pyright Think It's Unknown (Static Analysis)
Pylance/Pyright perform static code analysis. This means they analyze your code without running it. They read the code and try to figure out the type of every variable based on the imports and function calls.

1. `df.index`: The `.index` attribute of a pandas DataFrame is statically typed as the generic `Index` class, because at runtime it can hold any index type (for example a `RangeIndex` or a `DatetimeIndex`).

2. Pyright sees the `.index` and correctly identifies its type. Generic Index objects do not have a `.strftime()` method.

3. Because Pyright doesn't know the exact runtime data (it only sees the static code), it flags an `unknown attribute` error, assuming the `strftime` method doesn't exist on the generic index type.

### Why It Executes Successfully (Runtime Execution)
The code works because of pandas' specialized functionality for Datetime Indexes:

1. **Index Type**: The `df.index` object is not a generic index; it is a `DatetimeIndex` (because you successfully used `set_index(time_column_name)` with a converted datetime column earlier in your script).

2. **`DatetimeIndex` Method**: Unlike generic indexes, the `DatetimeIndex` object does have a native `.strftime()` method specifically designed to format its datetime elements.

3. **Jupyter Execution**: When you run the cell in Jupyter, Python executes the code, recognizes the `df.index` as a `DatetimeIndex`, finds the correct `.strftime()` method, and executes it without error.

### How to Silence Pylance/Pyright
A. Use Pyright Comment to Ignore the Line Completely

Add this comment to the end of the problematic line:
```Python
df[time_column_name] = df.index.strftime('%d/%m/%Y %H.%M.%S') # type: ignore
```

B. Ignore Specific Error Code

If you find the specific error code Pylance is raising (e.g., in the VS Code "Problems" panel), you can target it:
```Python
# Assuming the reported rule is 'reportAttributeAccessIssue'
df[time_column_name] = df.index.strftime('%d/%m/%Y %H.%M.%S')  # pyright: ignore[reportAttributeAccessIssue]
```

**NOTE**: If you use Pylint as your static analyzer, add the following comment instead:
```Python
df[time_column_name] = df.index.strftime('%d/%m/%Y %H.%M.%S')  # pylint: disable=no-member
```

C. Pythonic Resolution

While the current code is correct for a `DatetimeIndex`, the more general pandas way to format datetime values is the `.dt.strftime()` method,
which works on a Series of datetime objects. Although this requires converting the index back to a Series or column,
it is a very common pattern that is less likely to confuse static analyzers
(**though not strictly necessary here**, since the index is already a `DatetimeIndex`):
```Python
# Convert index back to a Series to use the standard .dt accessor
df[time_column_name] = df.index.to_series().dt.strftime('%d/%m/%Y %H.%M.%S')
```


## Heart rate

### Plotting

In [None]:
heart_rate_csv = get_samsung_csv_path('com.samsung.shealth.tracker.heart_rate.')

# Select the relevant columns
hr_column_name = 'com.samsung.health.heart_rate.heart_rate'
time_column_name = 'com.samsung.health.heart_rate.end_time'

heart_rate_df = csv_to_time_series_df(heart_rate_csv, time_column_name)
start_period_timestamp = heart_rate_df.index.min()
end_period_timestamp = heart_rate_df.index.max()
plot_title = f'Heart Rate (BPM) Statistics from {start_period_timestamp.strftime("%d/%m/%Y")} to {end_period_timestamp.strftime("%d/%m/%Y")}'

display(Markdown(f"### {plot_title}"))

# Show statistics from the whole time interval
# display_stat(heart_rate_df, hr_column_name, start_period_timestamp, end_period_timestamp)

# Show statistics from partial time interval
# display_stat(heart_rate_df, hr_column_name, start_period_timestamp, '2025-10-11')
# display_stat(heart_rate_df, hr_column_name, '2025-10-12', end_period_timestamp)

# Summary table with one row per observation period.
# NOTE(review): '2025-10-11' is parsed as 2025-10-11 00:00:00 and the interval
# filter is inclusive on both ends, so samples taken later on 11 Oct fall in
# neither period - confirm this is intended.
display_stat_table(heart_rate_df, hr_column_name, [
    (start_period_timestamp, '2025-10-11'),
    ('2025-10-12', end_period_timestamp),
])

# Plot
display(Markdown(f"### Plotting Data Source: **{os.path.basename(heart_rate_csv)}**"))
samsung_plot(heart_rate_df,
             (11, 6),
             plot_title,
             'Heart Rate (BPM)',
             [hr_column_name]
             )

### Data export with European formatting

In [None]:
output_file_name = 'heart_rate_european_format.csv'

# Work on a copy of the prepared heart rate DataFrame
hr_export_df = heart_rate_df.copy()

# Format Datetime to European (DD/MM/YYYY HH.MM.SS)
time_format_col = 'Timestamp (European Format)'
hr_export_df[time_format_col] = hr_export_df.index.strftime('%d/%m/%Y %H.%M.%S')  # type: ignore [reportAttributeAccessIssue]

# Drop the datetime index; the formatted timestamp column replaces it
hr_export_df = hr_export_df.reset_index(drop=True)

# Convert numerics to string and replace decimal point
new_hr_col = 'Heart Rate (bpm)'
hr_export_df[new_hr_col] = hr_export_df[hr_column_name].astype(str).str.replace('.', ',', regex=False)

# Select and reorder the final columns
final_cols = [time_format_col, new_hr_col]
br_final_df = hr_export_df[final_cols]

# Use a semicolon (;) as the delimiter for European format compatibility
br_final_df.to_csv(output_file_name, index=False, sep=';')

print(f"Successfully exported data to: {output_file_name}")
print("First few rows of the exported CSV (note the comma decimals):")
# head() keeps the preview in line with the message printed above
display(br_final_df.head())


## Heart rate during exercise

### Plotting

In [None]:
# NOTE(review): this cell rebinds heart_rate_csv / heart_rate_df, which the
# "Heart rate" cells also use; re-running those cells afterwards without
# reloading their data will operate on the exercise log instead.
heart_rate_csv = get_samsung_csv_path('com.samsung.shealth.exercise.')

# Select the relevant columns
mean_column_name = 'com.samsung.health.exercise.mean_heart_rate'
max_column_name = 'com.samsung.health.exercise.max_heart_rate'
min_column_name = 'com.samsung.health.exercise.min_heart_rate'
time_column_name = 'com.samsung.health.exercise.end_time'

heart_rate_df = csv_to_time_series_df(heart_rate_csv, time_column_name)
start_period_timestamp = heart_rate_df.index.min()
end_period_timestamp = heart_rate_df.index.max()
plot_title = f'Heart Rate (BPM) Statistics During Exercise from {start_period_timestamp.strftime("%d/%m/%Y")} to {end_period_timestamp.strftime("%d/%m/%Y")}'

display(Markdown(f"### {plot_title}"))

# Show statistics from the whole time interval
# display_stat(heart_rate_df, min_column_name, start_period_timestamp, end_period_timestamp)
# display_stat(heart_rate_df, max_column_name, start_period_timestamp, end_period_timestamp)

# Show statistics from partial time interval
# display_stat(heart_rate_df, min_column_name, start_period_timestamp, '2025-10-11')
# display_stat(heart_rate_df, max_column_name, start_period_timestamp, '2025-10-11')

# display_stat(heart_rate_df, min_column_name, '2025-10-12', end_period_timestamp)
# display_stat(heart_rate_df, max_column_name, '2025-10-12', end_period_timestamp)

# NOTE(review): '2025-10-11' is parsed as 2025-10-11 00:00:00 and the interval
# filter is inclusive on both ends, so sessions ending later on 11 Oct fall in
# neither period - confirm this is intended.
display(Markdown('--- **Minimum** Heart Rate Summary Table ---'))
display_stat_table(heart_rate_df, min_column_name, [
    (start_period_timestamp, '2025-10-11'),
    ('2025-10-12', end_period_timestamp)
])

display(Markdown('--- **Maximum** Heart Rate Summary Table ---'))
display_stat_table(heart_rate_df, max_column_name, [
    (start_period_timestamp, '2025-10-11'),
    ('2025-10-12', end_period_timestamp)
])

display(Markdown(f"### Plotting Data Source: **{os.path.basename(heart_rate_csv)}**"))
samsung_plot(heart_rate_df,
             (11, 7),
             plot_title,
             'Heart Rate (BPM)',
             [min_column_name, max_column_name]
             )

## Systolic (SYS) and diastolic (DIA) blood pressure

### Plotting

In [None]:
blood_pressure_csv = get_samsung_csv_path('com.samsung.shealth.blood_pressure.')

# Columns of interest in the blood pressure log
sys_column_name = 'com.samsung.health.blood_pressure.systolic'
dia_column_name = 'com.samsung.health.blood_pressure.diastolic'
pulse_column_name = 'com.samsung.health.blood_pressure.pulse'
time_column_name = 'com.samsung.health.blood_pressure.update_time'

blood_pressure_df = csv_to_time_series_df(blood_pressure_csv, time_column_name)

# The data set spans from its earliest to its latest timestamp
start_period_timestamp = blood_pressure_df.index.min()
end_period_timestamp = blood_pressure_df.index.max()
plot_title = (
    'Blood Pressure Over Time (Systolic and Diastolic) '
    f'{start_period_timestamp.strftime("%d/%m/%Y")} to {end_period_timestamp.strftime("%d/%m/%Y")}'
)

display(Markdown(f"### {plot_title}"))

# For a single interval, display_stat can be used instead of the tables below:
# display_stat(blood_pressure_df, sys_column_name, start_period_timestamp, end_period_timestamp)
# display_stat(blood_pressure_df, dia_column_name, start_period_timestamp, '2025-10-11')
# display_stat(blood_pressure_df, pulse_column_name, '2025-10-12', end_period_timestamp)

# One summary table per measurement, each covering the same two periods
for bp_table_heading, bp_table_column in (
    ('--- **Systolic (mmHg)** Summary Table ---', sys_column_name),
    ('--- **Diastolic (mmHg)** Summary Table ---', dia_column_name),
    ('--- **Heart Rate (BPM)** Summary Table ---', pulse_column_name),
):
    display(Markdown(bp_table_heading))
    display_stat_table(blood_pressure_df, bp_table_column, [
        (start_period_timestamp, '2025-10-11'),
        ('2025-10-12', end_period_timestamp),
    ])

display(Markdown(f"### Plotting Data Source: **{os.path.basename(blood_pressure_csv)}**"))
samsung_plot(
    blood_pressure_df,
    (11, 7),
    plot_title,
    'Blood Pressure (mmHg)',
    [sys_column_name, dia_column_name, pulse_column_name],
)

### Data export with European formatting

In [None]:
output_file_name = 'blood_pressure_european_format.csv'

# Work on a copy of the prepared blood pressure DataFrame
bp_export_df = blood_pressure_df.copy()

# Format Datetime to European (DD/MM/YYYY HH.MM.SS)
time_format_col = 'Timestamp (European Format)'
bp_export_df[time_format_col] = bp_export_df.index.strftime('%d/%m/%Y %H.%M.%S')  # type: ignore [reportAttributeAccessIssue]

# Drop the datetime index; the formatted timestamp column replaces it
bp_export_df = bp_export_df.reset_index(drop=True)

# Convert numerics to string and replace decimal point
new_sys_col = 'Systolic (mmHg)'
new_dia_col = 'Diastolic (mmHg)'
new_pulse_col = 'Heart Rate (bpm)'
for export_col, source_col in (
    (new_sys_col, sys_column_name),
    (new_dia_col, dia_column_name),
    (new_pulse_col, pulse_column_name),
):
    bp_export_df[export_col] = bp_export_df[source_col].astype(str).str.replace('.', ',', regex=False)

# Select and reorder the final columns
final_cols = [time_format_col, new_sys_col, new_dia_col, new_pulse_col]
bp_final_df = bp_export_df[final_cols]

# Use a semicolon (;) as the delimiter for European format compatibility
bp_final_df.to_csv(output_file_name, index=False, sep=';')

print(f"Successfully exported data to: {output_file_name}")
print("First few rows of the exported CSV (note the comma decimals):")
# head() keeps the preview in line with the message printed above
display(bp_final_df.head())
