# Agitation prediction: Data cleaning

**Author:** Eva Rombouts  
**Date:** 2024-07-19  
**Version:** 1.0

### Description
This notebook cleans a dataset of Dutch nursing home notes loaded from Hugging Face: rows with missing values and overly long (incorrectly parsed) notes are removed.  
The cleaned dataset retains two columns: `text` and `label`. The `label` column is 1 if the topic is 'onrust' (agitation) and 0 otherwise.

In [None]:
import os

def check_environment():
    """Report whether the code runs in Google Colab or locally.

    Returns:
        str: "Google Colab" when the ``google.colab`` package can be
        imported, otherwise "Local Environment".
    """
    try:
        # Only importability matters here; the module itself is not used.
        import google.colab
    except ImportError:
        environment = "Local Environment"
    else:
        environment = "Google Colab"
    return environment

# Detect the runtime once and configure it accordingly. Both branches define
# HF_TOKEN, which is passed to load_dataset() when the data is downloaded.
env = check_environment()
if env == "Google Colab":
    print("Running in Google Colab")
    # IPython shell escape: quietly install the `datasets` package.
    !pip install -q datasets
    from google.colab import drive, userdata
    # Mount Google Drive and switch to the project's scripts folder so that
    # relative paths (e.g. '../data/...') resolve against it.
    drive.mount('/content/drive')
    os.chdir('/content/drive/My Drive/Colab Notebooks/GenCareAI/scripts')
    # Fetch the Hugging Face token via Colab's userdata helper
    # (presumably the notebook's stored secrets -- verify in Colab).
    HF_TOKEN = userdata.get('HF_TOKEN')
else:
    print("Running in Local Environment")
    # !pip install -q matplotlib
    from dotenv import load_dotenv
    # Presumably loads variables from a local .env file into the process
    # environment so that HF_TOKEN can be read below -- confirm setup.
    load_dotenv()
    # NOTE: os.getenv returns None when HF_TOKEN is not set.
    HF_TOKEN = os.getenv('HF_TOKEN')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset

In [None]:
# --- Configuration ---
# Identifier of the dataset on Hugging Face
PATH_HF_DATASET = 'ekrombouts/dutch_nursing_home_notes'
# Notes with more words than this are treated as too large
WORD_COUNT_MAX = 61
# Sampling settings (not referenced by the cells visible in this notebook)
SAMPLE_SIZE = 10000
SEED = 6

# --- Data loading ---
# Fetch the dataset and convert its 'train' split to a pandas DataFrame
dataset = load_dataset(path=PATH_HF_DATASET, token=HF_TOKEN)
df = dataset['train'].to_pandas()

In [None]:
# Show five randomly drawn rows to get a feel for the data
print(df.sample(n=5))
# Visual separator between the sample and the summary
print('\n' + '*' * 10 + '\n')
# Summary of the columns (dtypes and non-null counts)
df.info()

In [None]:
# Display every row that has at least one missing value. Only a couple
# exist; they are removed in the cleaning step.
df.loc[df.isna().any(axis='columns')]

In [None]:
# Add a per-note word count (whitespace-separated tokens) and inspect its
# distribution. Exploring the notes above the threshold showed that they were
# parsed incorrectly, so they are dropped in the cleaning step.
df['note_word_count'] = df['note'].str.split().str.len()

plt.hist(df['note_word_count'], bins=100)
plt.show()

# Use a descriptive name instead of `_`, which IPython reserves for the
# output of the previous cell.
long_notes = df[df['note_word_count'] > WORD_COUNT_MAX]
print(f"Number of rows with note_word_count > {WORD_COUNT_MAX}: {len(long_notes)}")

# Print up to four random examples. A plain loop avoids building (and having
# the notebook echo) a throwaway list of None values; capping the sample size
# avoids a ValueError when fewer than four long notes exist.
for note in long_notes['note'].sample(min(4, len(long_notes))):
    print(10*'*' + '\n' + note)

In [None]:
# Clean the dataframe: remove rows with missing values and overly long notes,
# derive the binary label (1 if the topic is 'onrust', 0 otherwise), and keep
# only the text and label columns.
#
# Rows are filtered with a boolean mask on the intermediate frame rather than
# with .drop() on index labels taken from the unfiltered df: .drop() raises a
# KeyError when one of those labels was already removed by dropna(). After
# dropna() the word count has no missing values, so `<=` is the exact
# complement of the original `>` condition.
df_agitation = (
    df.dropna()
      .loc[lambda d: d['note_word_count'] <= WORD_COUNT_MAX]
      .assign(label=lambda d: (d['topic'] == 'onrust').astype(int))
      .rename(columns={'note': 'text'})
      [['text', 'label']]
)

In [None]:
# Sanity-check the cleaned data: a few random rows, the label distribution,
# and a summary of the columns
print(df_agitation.sample(n=5))
print(df_agitation.loc[:, 'label'].value_counts())
df_agitation.info()

In [None]:
# Write the cleaned text/label table to CSV, leaving out the index column
df_agitation.to_csv(path_or_buf='../data/agitation.csv', index=False)