<a href="https://colab.research.google.com/github/ekrombouts/GenCareAI/blob/main/scripts/150_CleanUpNursingHome.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GenCare AI: Concat and clean data

**Author:** Eva Rombouts  
**Date:** 2024-07-11  
**Version:** 1.0

### Description
This script concatenates the profiles, scenarios, client records and summaries generated in different experiments (each stored as a separate 'ward'), then restructures and cleans up the data.

The result is a set of 4 datasets:
1. Galaxy_clients
2. Galaxy_scenarios
3. Galaxy_records
4. Galaxy_summaries

In [None]:
import os
# Determines the current environment (Google Colab or local)
def check_environment():
    """Return "Google Colab" if `google.colab` can be imported, else "Local Environment"."""
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return "Local Environment"
    return "Google Colab"

In [None]:
# Installs and settings depending on the environment.
# In Colab: the `datasets` package is installed, Google Drive is mounted and
# DATA_DIR points at a folder on the mounted Drive.
# Locally: DATA_DIR points at '../data', relative to the working directory.
# NOTE(review): `userdata` is imported below but not used in this cell --
# presumably intended for retrieving API keys/secrets; confirm or remove.

env = check_environment()

if env == "Google Colab":
    print("Running in Google Colab")
    # IPython shell magic: only valid inside a notebook, not in plain Python
    !pip install datasets -q
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    DATA_DIR = '/content/drive/My Drive/Colab Notebooks/GenCareAI/data'
else:
    print("Running in Local Environment")
    # No packages are installed automatically when running locally
    # !pip install
    DATA_DIR = '../data'


In [None]:
import pandas as pd
import ast

from datasets import Dataset

In [None]:
# Name of the care home; used in the output file names and the Hugging Face repo name
CARE_HOME = 'Galaxy'
# Wards whose per-ward CSV files are concatenated
wards = ['Horizon', 'Venus', 'Tulip', 'Cosmos']

# Output paths of the four cleaned datasets (written by main())
FN_CLIENTS_DF = os.path.join(DATA_DIR, f'gcai_{CARE_HOME}_clients.csv')
FN_SCENARIOS_DF = os.path.join(DATA_DIR, f'gcai_{CARE_HOME}_scenarios.csv')
FN_RECORDS_DF = os.path.join(DATA_DIR, f'gcai_{CARE_HOME}_records.csv')
FN_SUMMARIES_DF = os.path.join(DATA_DIR, f'gcai_{CARE_HOME}_summaries.csv')

# Prefix for the Hugging Face dataset names; "_<dataset>" is appended when pushing
hf_repo_name = "ekrombouts/" + CARE_HOME

In [None]:
def concatenate_files(data_dir, wards, file_prefix):
    """Read one CSV per ward and stack them into a single DataFrame.

    For every ward the file ``<file_prefix>_<ward>.csv`` in ``data_dir`` is
    read and tagged with a ``ward`` column. Missing files are reported with a
    printed warning and skipped.

    Args:
        data_dir: Directory containing the per-ward CSV files.
        wards: Iterable of ward names.
        file_prefix: File name prefix preceding ``_<ward>.csv``.

    Returns:
        The concatenated DataFrame with a fresh index and ``ward`` as the
        first column, or an empty DataFrame when no file was found.
    """
    frames = []
    for ward in wards:
        path = os.path.join(data_dir, f'{file_prefix}_{ward}.csv')
        if not os.path.exists(path):
            print(f"Warning: File {path} does not exist and will be skipped.")
            continue
        frame = pd.read_csv(path)
        frame['ward'] = ward
        frames.append(frame)

    if not frames:
        print("No files to concatenate.")
        return pd.DataFrame()  # Nothing was read: hand back an empty frame

    combined = pd.concat(frames, ignore_index=True)
    # Move 'ward' to the front, keeping the other columns in their order
    ordered = ['ward'] + [name for name in combined.columns if name != 'ward']
    return combined[ordered]

In [None]:
# Function to process client data
def process_clients(df_clients, df_scenarios):
    """Add a unique client ID and per-client scenario attributes.

    ``ct_id`` is the row index of ``df_clients`` plus one. The
    ``complications`` and ``num_months`` columns are taken from
    ``df_scenarios`` (de-duplicated, since they repeat on every scenario row)
    and left-joined on ``ward`` and ``client_id``.

    Args:
        df_clients: Client profiles with ``ward`` and ``client_id`` columns.
        df_scenarios: Scenarios with ``ward``, ``client_id``,
            ``complications`` and ``num_months`` columns.

    Returns:
        A new DataFrame; ``df_clients`` itself is not modified.
    """
    join_keys = ['ward', 'client_id']
    per_client = df_scenarios[join_keys + ['complications', 'num_months']].drop_duplicates()

    numbered = df_clients.copy()
    numbered['ct_id'] = numbered.index + 1
    return numbered.merge(per_client, on=join_keys, how='left')

def rename_client_columns(df_clients):
    """Translate the Dutch client column names to English and fix the column order.

    Only the columns listed in the final order are kept; any other column
    (e.g. ``client_id``) is dropped from the result.

    Args:
        df_clients: Client DataFrame that already carries ``ct_id``,
            ``complications`` and ``num_months``.

    Returns:
        A new DataFrame with the renamed, reordered columns.
    """
    dutch_to_english = {
        'naam': 'name',
        'type_dementie': 'dementia_type',
        'somatiek': 'physical',
        'adl': 'adl',
        'mobiliteit': 'mobility',
        'gedrag': 'behavior',
    }
    final_order = ['ct_id', 'ward', 'name', 'dementia_type', 'physical', 'adl',
                   'mobility', 'behavior', 'complications', 'num_months']

    renamed = df_clients.rename(columns=dutch_to_english)
    return renamed[final_order]

# Function to process scenarios data
def process_scenarios(df_scenarios, df_clients):
    """Reduce the scenarios to ``ct_id``, numeric ``month`` and ``scenario``.

    Args:
        df_scenarios: Scenarios with ``ward``, ``client_id``, ``month``
            (a string containing the month number), ``journey``,
            ``complications`` and ``num_months`` columns.
        df_clients: Clients with ``ward``, ``client_id`` and ``ct_id`` columns.

    Returns:
        A DataFrame with the columns ``ct_id``, ``month`` (int) and
        ``scenario``.

    Raises:
        ValueError: If a ``month`` value contains no digits (the resulting
            missing value cannot be cast to int).
    """
    client_ids = df_clients[['ward', 'client_id', 'ct_id']]
    df_scenarios = (df_scenarios
                    # Complications and num_months have been added to the clients df
                    .drop(columns=['complications', 'num_months'])
                    # Get the unique client ID
                    .merge(client_ids, on=['ward', 'client_id'], how='left')
                    # Extract the month number from the string. A raw string avoids
                    # the invalid '\d' escape; expand=False yields a Series
                    # instead of a one-column DataFrame.
                    .assign(month=lambda df: df['month']
                            .str.extract(r'(\d+)', expand=False)
                            .astype(int))
                    .rename(columns={'journey': 'scenario'})
                    [['ct_id', 'month', 'scenario']])
    return df_scenarios

# Function to process records data
def process_records(df_records, df_clients):
    """Attach ``ct_id`` to the records and translate the Dutch column names.

    Args:
        df_records: Records with ``ward``, ``client_id``, ``month``,
            ``iteration``, ``dag``, ``tijd`` and ``rapportage`` columns.
        df_clients: Clients with ``ward``, ``client_id`` and ``ct_id`` columns.

    Returns:
        A DataFrame with the columns ``ct_id``, ``month``, ``iteration``,
        ``day``, ``time`` and ``note``.
    """
    id_lookup = df_clients[['ward', 'client_id', 'ct_id']]
    with_ids = df_records.merge(id_lookup, on=['ward', 'client_id'], how='left')
    translated = with_ids.rename(columns={'dag': 'day', 'tijd': 'time', 'rapportage': 'note'})
    return translated[['ct_id', 'month', 'iteration', 'day', 'time', 'note']]

# Function to process summaries data
def process_summaries(df_summaries, df_clients):
    """Expand the stringified summary dicts into columns keyed by ``ct_id``.

    Each value of the ``summary`` column is parsed with ``ast.literal_eval``
    into a dict; the dict keys become columns and the dict's own
    ``client_id`` entry is dropped in favour of the unique ``ct_id``.

    Args:
        df_summaries: Summaries with ``ward``, ``client_id`` and ``summary``
            columns.
        df_clients: Clients with ``ward``, ``client_id`` and ``ct_id`` columns.

    Returns:
        A DataFrame with ``ct_id`` followed by the parsed summary fields.
    """
    parsed_rows = [ast.literal_eval(raw) for raw in df_summaries['summary'].tolist()]
    parsed = pd.DataFrame(parsed_rows)
    parsed = parsed.drop(columns=['client_id'])

    id_lookup = df_clients[['ward', 'client_id', 'ct_id']]
    ids = df_summaries.merge(id_lookup, on=['ward', 'client_id'], how='left')[['ct_id']]

    # Column-wise concat pairs the rows up by index
    return pd.concat([ids, parsed], axis=1)


In [None]:
# Function to convert DataFrames to Hugging Face Datasets and push to hub
def push_dataset_to_hub(df, dataset_name, hf_repo_name):
    """Upload ``df`` as a private dataset named ``<hf_repo_name>_<dataset_name>``.

    Args:
        df: DataFrame to convert with ``Dataset.from_pandas``.
        dataset_name: Suffix identifying the dataset (e.g. "clients").
        hf_repo_name: Repository name prefix the suffix is appended to.
    """
    Dataset.from_pandas(df).push_to_hub(f"{hf_repo_name}_{dataset_name}", private=True)

In [None]:
# Main script execution
def main():
    """Concatenate the per-ward files, clean them, save CSVs and push to the hub."""
    raw_clients = concatenate_files(DATA_DIR, wards, 'gcai_client_profiles')
    raw_scenarios = concatenate_files(DATA_DIR, wards, 'gcai_client_scenarios')
    raw_records = concatenate_files(DATA_DIR, wards, 'gcai_client_notes')
    raw_summaries = concatenate_files(DATA_DIR, wards, 'gcai_client_summaries')

    # The clients frame must get its ct_id first: the other frames join on it.
    clients_with_ids = process_clients(raw_clients, raw_scenarios)
    scenarios = process_scenarios(raw_scenarios, clients_with_ids)
    records = process_records(raw_records, clients_with_ids)
    summaries = process_summaries(raw_summaries, clients_with_ids)
    # Renaming drops client_id, so it happens only after all joins are done.
    clients = rename_client_columns(clients_with_ids)

    # Save final processed dataframes
    csv_targets = (
        (clients, FN_CLIENTS_DF),
        (scenarios, FN_SCENARIOS_DF),
        (records, FN_RECORDS_DF),
        (summaries, FN_SUMMARIES_DF),
    )
    for frame, path in csv_targets:
        frame.to_csv(path, index=False)

    # Push each dataset to Hugging Face
    hub_targets = (
        (summaries, "summaries"),
        (records, "records"),
        (scenarios, "scenarios"),
        (clients, "clients"),
    )
    for frame, dataset_name in hub_targets:
        push_dataset_to_hub(frame, dataset_name, hf_repo_name)

In [None]:
# Run the pipeline only when this code executes as the main module, not on import
if __name__ == "__main__":
    main()