<a href="https://colab.research.google.com/github/ekrombouts/GenCareAI/blob/main/notebooks/100_note_generation/150_CleanUpNursingHome.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GenCare AI: Concat and clean data

**Author:** Eva Rombouts  
**Date:** 2024-09-02  
**Updated:** 2024-10-10  
**Version:** 2.0

### Description
This notebook consolidates and cleans client data, scenarios, and care records from different wards. It combines data from CSV files and processes it to ensure consistency across all wards. The notebook restructures client IDs, adjusts the day numbering, and assigns proper timestamps to each care note. Finally, it prepares three datasets: clients, scenarios, and records, which can be used for further analysis or uploaded to platforms like Hugging Face for broader use.


In [None]:
# Install the project's helper package; it provides GenCareAISetup, which is
# used below to detect the runtime environment and to resolve data file paths.
!pip install GenCareAI
from GenCareAI.GenCareAIUtils import GenCareAISetup

setup = GenCareAISetup()

# `datasets` is only installed here when running on Colab. In any other
# environment it has to be available already, because the import cell that
# follows imports it unconditionally.
if setup.environment == 'Colab':
        !pip install -q datasets

In [2]:
import os
import pandas as pd
from datetime import datetime, timedelta
import random
from datasets import Dataset

In [3]:
# Name of the care home; it is part of every output file name and of the
# Hugging Face repository name.
care_home = 'Gardenia'
# Wards whose per-ward CSV files are concatenated further down.
wards = ['Dahlia', 'Magnolia', 'Iris', 'Crocus']

# Target paths of the three consolidated CSV files, resolved by GenCareAISetup.
fn_clients_df = setup.get_file_path(f'data/gcai_{care_home}_clients.csv')
fn_scenarios_df = setup.get_file_path(f'data/gcai_{care_home}_scenarios.csv')
fn_records_df = setup.get_file_path(f'data/gcai_{care_home}_records.csv')

# Repository name prefix used by the (currently commented-out) Hugging Face upload.
hf_repo_name = "ekrombouts/" + care_home

In [4]:
def concatenate_files(wards, file_prefix):
    """Read one CSV file per ward and stack them into a single DataFrame.

    For every ward the file `data/{file_prefix}_{ward}.csv` (resolved through
    `setup.get_file_path`) is loaded and tagged with a `ward` column. Wards
    whose file does not exist are reported on stdout and skipped.

    Args:
        wards: Iterable of ward names.
        file_prefix: File name prefix shared by the per-ward files.

    Returns:
        The concatenated rows with `ward` as the first column and a fresh
        0..n-1 index, or an empty DataFrame when no file was found.
    """
    frames = []
    for ward in wards:
        file_name = setup.get_file_path(f'data/{file_prefix}_{ward}.csv')

        # Missing ward files are not fatal: report them and move on
        if not os.path.exists(file_name):
            print(f"Warning: File {file_name} does not exist and will be skipped.")
            continue

        frame = pd.read_csv(file_name)
        frame['ward'] = ward  # remember which ward the rows came from
        frames.append(frame)

    if not frames:
        print("No files to concatenate.")
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    # Put 'ward' in front, keep the remaining columns in their existing order
    other_columns = [col for col in combined.columns if col != 'ward']
    return combined[['ward'] + other_columns]

# Load the per-ward client profiles, scenarios and care notes into one frame each.
df_clients = concatenate_files(wards, 'gcai_client_profiles')
df_scenarios = concatenate_files(wards, 'gcai_client_scenarios')
df_records = concatenate_files(wards, 'gcai_client_notes')


In [5]:
def adjust_client_id(df):
    """Return a copy of `df` whose `client_id` is unique across wards.

    The new id is the first three letters of the ward name in lower case,
    followed by the original id zero-padded to at least three characters
    (ward 'Dahlia', id 7 -> 'dah007').

    The frame that is passed in is left untouched; only the returned frame
    carries the rewritten ids. Note that the transformation is not
    idempotent: applying it to an already adjusted frame prepends the ward
    prefix a second time.

    Args:
        df: DataFrame with a `ward` and a `client_id` column.

    Returns:
        A new DataFrame, identical to `df` except for the `client_id` column.
    """
    ward_prefix = df['ward'].str[:3].str.lower()
    padded_id = df['client_id'].astype(str).str.zfill(3)
    # assign() builds a new frame instead of overwriting the caller's column
    return df.assign(client_id=ward_prefix + padded_id)

In [6]:
def update_dayno(df):
    """Add a per-client running day counter `dagnummer` to the care records.

    The records are ordered by client, week, day and time. Within each client
    the counter starts at 1 and goes up by one whenever the week or the day
    differs from the previous record. Comparing the week as well as the day
    matters: two consecutive weeks that both contain only, say, day 3 are two
    different days and must not share a number.

    The computation is a grouped cumulative sum rather than
    `groupby(...).apply(...)`, so the result does not depend on how `apply`
    treats the grouping column.

    Args:
        df: DataFrame with `client_id`, `weekno`, `dag` and `tijd` columns.

    Returns:
        A new DataFrame, sorted as described, with a fresh 0..n-1 index and
        the additional integer column `dagnummer`. The input is not modified.
    """
    ordered = (df
               .sort_values(['client_id', 'weekno', 'dag', 'tijd'])
               .reset_index(drop=True))

    day_key = ordered[['client_id', 'weekno', 'dag']]
    # True on the first record of every client and wherever the week or the
    # day differs from the record directly above it
    starts_new_day = (day_key != day_key.shift()).any(axis=1)

    # Counting the day starts per client yields a counter that begins at 1
    day_number = starts_new_day.astype('int64').groupby(ordered['client_id']).cumsum()
    return ordered.assign(dagnummer=day_number)

In [None]:
# Give all three tables ward-prefixed client ids.
# Run this cell only once per kernel session: adjust_client_id prepends the
# ward prefix every time it is applied, so a second run corrupts the ids.
df_clients, df_scenarios, df_records = (
    adjust_client_id(table) for table in (df_clients, df_scenarios, df_records)
)
# Add the per-client running day number to the care notes
df_records = update_dayno(df_records)


In [None]:
# Peek at the concatenated scenarios (client ids already ward-prefixed)
df_scenarios.head()

In [9]:
def process_clients(df_clients, df_scenarios):
    """Attach `complications` and `num_weeks` from the scenarios to the clients.

    The distinct (`ward`, `client_id`, `complications`, `num_weeks`)
    combinations found in `df_scenarios` are left-joined onto `df_clients`
    using `ward` and `client_id` as key, so clients without a scenario are
    kept and get missing values in the two added columns.

    Args:
        df_clients: Client table with `ward` and `client_id` columns.
        df_scenarios: Scenario table with `ward`, `client_id`,
            `complications` and `num_weeks` columns.

    Returns:
        The client table extended with `complications` and `num_weeks`.
    """
    scenario_columns = ['ward', 'client_id', 'complications', 'num_weeks']
    per_client_info = df_scenarios[scenario_columns].drop_duplicates()
    return df_clients.merge(per_client_info, on=['ward', 'client_id'], how='left')

# Enrich the client table with the per-client complications and number of weeks
df_clients = process_clients(df_clients, df_scenarios)

In [None]:
# NOTE(review): df_scenarios has not been reassigned since the preview two
# cells up, so this shows the same rows again; possibly df_clients.head() was
# intended here to inspect the merge result -- confirm.
df_scenarios.head()

In [11]:
def process_scenarios(df_scenarios, df_clients):
    """Reduce the scenario table to `client_id`, `week` and `scenario`.

    `events_description` is renamed to `scenario` and every other column is
    discarded (`complications` and `num_weeks` live in the client table after
    `process_clients`).

    No join with the client table is performed: it could only contribute
    columns that are discarded here, while it would multiply scenario rows
    whenever the client table contains a `client_id` more than once. Columns
    are selected rather than dropped by name, so the function also works on a
    frame that has already been processed.

    Args:
        df_scenarios: Scenario table with at least `client_id`, `week` and
            `events_description` (or an already renamed `scenario`) columns.
        df_clients: Not used; kept so existing calls keep working.

    Returns:
        A new DataFrame with the columns `client_id`, `week`, `scenario`
        and a fresh 0..n-1 index.
    """
    return (df_scenarios
            .rename(columns={'events_description': 'scenario'})
            [['client_id', 'week', 'scenario']]
            .reset_index(drop=True))

# Keep only client id, week and scenario text in the scenario table
df_scenarios = process_scenarios(df_scenarios, df_clients)

In [12]:
def _parse_time_of_day(value):
    """Turn a loosely formatted clock time into a `datetime.time`.

    Only the digits of `value` are looked at, so '08:00' and '0800' both read
    as 08:00; three digits are left-padded ('800' -> '0800'). Anything that
    does not end up as a valid four-digit HHMM value yields midnight.
    """
    midnight = datetime.min.time()
    digits = ''.join(filter(str.isdigit, str(value)))
    if len(digits) == 3:
        digits = '0' + digits
    if len(digits) != 4:
        return midnight
    try:
        return datetime.strptime(digits, '%H%M').time()
    except ValueError:
        # e.g. '2560': four digits, but not a time of day
        return midnight


def process_records(df_records, df_clients, seed=42):
    """Replace the week/day/time columns of the care notes by one timestamp.

    Every client in `df_clients` gets a start date between 2022-01-01 and
    2023-01-01. A note is dated `start + (weekno - 1) * max_dag + (dag - 1)`
    days, where `max_dag` is the highest `dag` found for that client, and its
    time of day is taken from `tijd` (midnight when `tijd` is unusable).

    Args:
        df_records: Notes with `client_id`, `weekno`, `dag`, `tijd` and
            `rapportage` columns.
        df_clients: Client table; its `client_id` column determines who gets
            a start date.
        seed: Seed of the private random generator that draws the start
            dates. The fixed default makes repeated runs produce the same
            timestamps; pass `None` for a different draw on every call. The
            global `random` state is neither used nor changed.

    Returns:
        A DataFrame with the columns `client_id`, `datetime` and `note`
        (the renamed `rapportage`).

    Raises:
        KeyError: If `df_records` contains a `client_id` that does not occur
            in `df_clients`.
    """
    rng = random.Random(seed)
    first_possible_day = datetime(2022, 1, 1)
    start_dates = {
        client_id: first_possible_day + timedelta(days=rng.randint(0, 365))
        for client_id in df_clients['client_id'].unique()
    }

    # Fail early and readably instead of with a bare key deep inside apply()
    unknown_ids = set(df_records['client_id'].unique()) - set(start_dates)
    if unknown_ids:
        raise KeyError(
            "Records reference client_id(s) without a client profile: "
            + ", ".join(sorted(map(str, unknown_ids)))
        )

    # Highest 'dag' per client, used as the number of days that make up a week
    max_days_per_client_id = df_records.groupby('client_id')['dag'].max().to_dict()

    def calculate_datetime(row):
        client_id = row['client_id']
        max_day = max_days_per_client_id.get(client_id, 15)  # 15 only if the client has no entry
        day_offset = (row['weekno'] - 1) * max_day + (row['dag'] - 1)
        note_day = start_dates[client_id] + timedelta(days=day_offset)
        return datetime.combine(note_day, _parse_time_of_day(row['tijd']))

    return (df_records
            .assign(datetime=lambda df: df.apply(calculate_datetime, axis=1))
            .rename(columns={'rapportage': 'note'})
            [['client_id', 'datetime', 'note']])

# Reduce the care notes to client id, timestamp and note text
df_records = process_records(df_records, df_clients)

In [14]:
def rename_client_columns(df_clients):
    """Translate the Dutch client columns to English and fix the column order.

    Args:
        df_clients: Client table containing `client_id`, `ward`, the Dutch
            profile columns, `complications` and `num_weeks`.

    Returns:
        A DataFrame restricted to the ten output columns, in the order
        client_id, ward, name, dementia_type, physical, adl, mobility,
        behavior, complications, num_weeks.
    """
    dutch_to_english = {
        'naam': 'name',
        'type_dementie': 'dementia_type',
        'somatiek': 'physical',
        'adl': 'adl',
        'mobiliteit': 'mobility',
        'gedrag': 'behavior',
    }
    output_columns = [
        'client_id', 'ward', 'name', 'dementia_type', 'physical', 'adl',
        'mobility', 'behavior', 'complications', 'num_weeks',
    ]
    translated = df_clients.rename(columns=dutch_to_english)
    return translated[output_columns]

# Switch the client table to English column names in the final column order
df_clients = rename_client_columns(df_clients)


In [15]:
# Write the three consolidated tables to their CSV targets, without the index column
for frame, target in (
    (df_clients, fn_clients_df),
    (df_scenarios, fn_scenarios_df),
    (df_records, fn_records_df),
):
    frame.to_csv(target, index=False)


In [None]:
# # Function to convert DataFrames to Hugging Face Datasets and push to hub
# def push_dataset_to_hub(df, dataset_name, hf_repo_name):
#     dataset = Dataset.from_pandas(df)
#     dataset.push_to_hub(f"{hf_repo_name}_{dataset_name}", private=True)

# push_dataset_to_hub(df_records, "records", hf_repo_name)
# push_dataset_to_hub(df_scenarios, "scenarios", hf_repo_name)
# push_dataset_to_hub(df_clients, "clients", hf_repo_name)