# Agitation prediction: Prediction models

**Author:** Eva Rombouts  
**Date:** 19-07-2024  
**Version:** 1.0 

### Description
In this script, models are trained and evaluated for predicting agitation in nursing home notes. Both Random Forest and Logistic Regression models are trained. 
The dataset used for training and validation was generated using ChatGPT, which explains the excellent performance. For this reason, we will focus on the false positives and false negatives, so that these can be annotated further in the future.

In [None]:
# Environment setup
import os

def check_environment():
    """Report which runtime the notebook is executing in.

    Returns:
        "Google Colab" when the ``google.colab`` package can be imported,
        otherwise "Local Environment".
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return "Local Environment"
    return "Google Colab"

# Detect the runtime once. On Colab, Google Drive is mounted and the working
# directory is moved to the project's scripts folder (presumably so that the
# relative '../data/...' paths used later resolve -- confirm against the Drive layout).
env = check_environment()
if env != "Google Colab":
    print("Running in Local Environment")
    # !pip install -q
else:
    print("Running in Google Colab")
    # !pip install -q
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    os.chdir('/content/drive/My Drive/Colab Notebooks/GenCareAI/scripts')


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


In [None]:
# Fixed seed; passed as `random_state` to the classifiers trained in this notebook
seed = 6

In [None]:
# Read the train / validation / test splits from CSV, in that order
train_df, valid_df, test_df = map(
    pd.read_csv,
    (
        '../data/agitation_train.csv',
        '../data/agitation_valid.csv',
        '../data/agitation_test.csv',
    ),
)

In [None]:
# Model inputs are the columns of the training frame whose names start with
# 'topic_' or 'embedding_'; the target is the 'label' column.
feature_columns = [
    name for name in train_df.columns
    if name.startswith(('topic_', 'embedding_'))
]

# Split every dataset into its feature matrix (X) and target vector (y),
# always selecting the same feature columns.
X_train, y_train = train_df[feature_columns], train_df['label']
X_valid, y_valid = valid_df[feature_columns], valid_df['label']
X_test, y_test = test_df[feature_columns], test_df['label']

## Random forest

In [None]:
# Random forest with library defaults; only the random state is pinned to `seed`
rf_model = RandomForestClassifier(random_state=seed)

# Fit on the training split (left as the final expression so the notebook
# still displays the fitted estimator)
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Score the validation split with the fitted random forest: hard class
# predictions plus the predicted probability taken from column 1 of predict_proba.
y_valid_pred_rf = rf_model.predict(X_valid)
y_valid_pred_proba_rf = rf_model.predict_proba(X_valid)[:, 1]

# Text report: confusion matrix, per-class metrics and overall accuracy
print("Validation Results:")
cm = confusion_matrix(y_valid, y_valid_pred_rf)
for section_title, section_body in (
    ("Confusion Matrix:", cm),
    ("\nClassification Report:", classification_report(y_valid, y_valid_pred_rf)),
    ("\nAccuracy Score:", accuracy_score(y_valid, y_valid_pred_rf)),
):
    print(section_title)
    print(section_body)

# ROC analysis on the validation probabilities
valid_auc = roc_auc_score(y_valid, y_valid_pred_proba_rf)
fpr_valid, tpr_valid, _ = roc_curve(y_valid, y_valid_pred_proba_rf)

plt.figure()
roc_ax = plt.gca()
roc_ax.plot(fpr_valid, tpr_valid, color='blue', lw=2, label=f'Validation ROC curve (area = {valid_auc:.2f})')
# Dashed diagonal as a visual reference line
roc_ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Validation ROC Curve')
roc_ax.legend(loc='lower right')
plt.show()

# Render the same confusion matrix as a labelled heat map
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=['No Agitation', 'Agitation'],
)
cm_display.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Importances exposed by the fitted random forest, one value per feature column
importances = rf_model.feature_importances_
feature_names = feature_columns

# argsort is ascending, so reverse it to rank from most to least important,
# then line the names up with that ranking
indices = np.argsort(importances)[::-1]
sorted_feature_names = [feature_names[rank_idx] for rank_idx in indices]

# Bar chart of the ranked importances with the feature names on the x axis
bar_positions = range(len(importances))
plt.figure(figsize=(12, 8))
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, sorted_feature_names, rotation=90)
plt.title("Feature Importances")
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.tight_layout()
plt.show()

## Logistic Regression Model

In [None]:
# Logistic regression with the random state pinned to `seed` and the
# iteration cap set to 1000
lr_model = LogisticRegression(max_iter=1000, random_state=seed)

# Fit on the training split (left as the final expression so the notebook
# still displays the fitted estimator)
lr_model.fit(X=X_train, y=y_train)

In [None]:

# Score the validation split with the fitted logistic regression: hard class
# predictions plus the predicted probability taken from column 1 of predict_proba.
print("Validating Logistic Regression model...")
y_valid_pred_lr = lr_model.predict(X_valid)
y_valid_pred_proba_lr = lr_model.predict_proba(X_valid)[:, 1]

# Text report: confusion matrix, per-class metrics and overall accuracy
print("Logistic Regression - Validation Results:")
cm_lr = confusion_matrix(y_valid, y_valid_pred_lr)
for section_title, section_body in (
    ("Confusion Matrix:", cm_lr),
    ("\nClassification Report:", classification_report(y_valid, y_valid_pred_lr)),
    ("\nAccuracy Score:", accuracy_score(y_valid, y_valid_pred_lr)),
):
    print(section_title)
    print(section_body)

# ROC analysis on the validation probabilities
valid_auc_lr = roc_auc_score(y_valid, y_valid_pred_proba_lr)
fpr_valid_lr, tpr_valid_lr, _ = roc_curve(y_valid, y_valid_pred_proba_lr)

plt.figure()
roc_ax_lr = plt.gca()
roc_ax_lr.plot(fpr_valid_lr, tpr_valid_lr, color='green', lw=2, label=f'Validation ROC curve (area = {valid_auc_lr:.2f})')
# Dashed diagonal as a visual reference line
roc_ax_lr.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
roc_ax_lr.set_xlim([0.0, 1.0])
roc_ax_lr.set_ylim([0.0, 1.05])
roc_ax_lr.set_xlabel('False Positive Rate')
roc_ax_lr.set_ylabel('True Positive Rate')
roc_ax_lr.set_title('Logistic Regression - Validation ROC Curve')
roc_ax_lr.legend(loc='lower right')
plt.show()

# Render the same confusion matrix as a labelled, titled heat map
cm_display_lr = ConfusionMatrixDisplay(
    confusion_matrix=cm_lr,
    display_labels=['No Agitation', 'Agitation'],
)
cm_display_lr.plot(cmap=plt.cm.Blues)
plt.title('Logistic Regression - Confusion Matrix')
plt.show()

In [None]:
# Rank the logistic regression coefficients (first row of coef_) from the
# largest to the smallest value and pair them with their feature names.
# NOTE: relies on `feature_names` assigned in the feature-importance cell.
print("Getting coefficients for Logistic Regression...")
coefficients_lr = lr_model.coef_[0]
indices_lr = np.argsort(coefficients_lr)[::-1]
sorted_feature_names_lr = [feature_names[rank_idx] for rank_idx in indices_lr]

# Bar chart of the ranked coefficients with the feature names on the x axis
coef_positions = range(len(coefficients_lr))
plt.figure(figsize=(12, 8))
plt.bar(coef_positions, coefficients_lr[indices_lr], align="center")
plt.xticks(coef_positions, sorted_feature_names_lr, rotation=90)
plt.title("Logistic Regression - Coefficients")
plt.xlabel("Feature")
plt.ylabel("Coefficient")
plt.tight_layout()
plt.show()

## Mismatched data

In [None]:
# Identify false positives and false negatives in the validation data

# Attach both models' validation outputs to valid_df (this adds four columns
# to valid_df in place): the hard class prediction and the predicted
# probability for the random forest (rf_*) and logistic regression (lr_*).
valid_df['rf_pred'] = y_valid_pred_rf
valid_df['rf_proba'] = y_valid_pred_proba_rf
valid_df['lr_pred'] = y_valid_pred_lr
valid_df['lr_proba'] = y_valid_pred_proba_lr

# Keep only the note text, the true label and the model outputs.
# (Fixed: a stray `b` after 'label' made this line a SyntaxError.)
results_df = valid_df[['text', 'label', 'rf_pred', 'rf_proba', 'lr_pred', 'lr_proba']]

print("Results DataFrame:")
print(results_df.head())

# Save the results to a CSV file (optional)
results_df.to_csv('../data/validation_results.csv', index=False)

In [None]:
# Flag the rows each model got wrong, then keep every row that at least one
# of the two models misclassified
rf_is_wrong = results_df['label'] != results_df['rf_pred']
lr_is_wrong = results_df['label'] != results_df['lr_pred']
mismatched_df = results_df[rf_is_wrong | lr_is_wrong]

print("Mismatched DataFrame:")
print(mismatched_df.head())

# Persist the misclassified rows to CSV (optional)
mismatched_df.to_csv('../data/mismatched_validation_results.csv', index=False)

In [None]:
# Count how often each (true label, RF prediction, LR prediction) combination
# occurs among the misclassified rows
mismatched_df.value_counts(subset=['label', 'rf_pred', 'lr_pred'])