# Careplan generation: Data cleaning

**Author:** Eva Rombouts  
**Date:** 2024-07-21  
**Version:** 0.2

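### Description

This notebook loads the `ekrombouts/Galaxy_records` dataset from the Hugging Face Hub, drops records without a note, combines the remaining notes per `ct_id`, `month` and `iteration` into a single bulleted text, and saves the result to `../data/df_weeknotes.csv`.
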
In [None]:
# Environment setup
import os

def check_environment():
    """Report which runtime this notebook is executing in.

    Returns:
        "Google Colab" if the ``google.colab`` package can be imported,
        otherwise "Local Environment".
    """
    try:
        # Imported only to probe availability; the module itself is not used here.
        import google.colab  # noqa: F401
    except ImportError:
        label = "Local Environment"
    else:
        label = "Google Colab"
    return label

env = check_environment()

if env == "Google Colab":
    print("Running in Google Colab")
    !pip install -q datasets
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    os.chdir('/content/drive/My Drive/Colab Notebooks/GenCareAI/scripts')
    HF_TOKEN = userdata.get('HF_TOKEN')
else:
    print("Running in Local Environment")
    # !pip install -q
    from dotenv import load_dotenv
    load_dotenv()
    HF_TOKEN = os.getenv('HF_TOKEN')

In [None]:
import pandas as pd
from datasets import load_dataset

In [None]:
# Fetch the records from the Hugging Face Hub; only the 'train' split is used.
dataset = load_dataset("ekrombouts/Galaxy_records", token=HF_TOKEN)
df_records = dataset['train'].to_pandas()

# Build one row per (ct_id, month, iteration). The 'notes' column holds every
# note of that group as a bullet list: each note on its own line, prefixed "- ".
df = (
    df_records[df_records['note'].notna()]  # rows without a note are left out
    .groupby(by=['ct_id', 'month', 'iteration'])['note']
    .apply(lambda notes: '- ' + '\n- '.join(notes))
    .reset_index(name='notes')
)

# Quick visual check: first rows, a separator line, then the first bullet list.
print(df.head())
print('*' * 100)
print(df.loc[0, 'notes'])

In [None]:
# Persist the aggregated notes next to the scripts folder. The DataFrame index
# is written as the first (unnamed) column, which is the pandas default.
# NOTE(review): the file name says "weeknotes" while the grouping key is
# 'month' -- confirm the intended granularity.
df.to_csv(path_or_buf='../data/df_weeknotes.csv')