<a href="https://colab.research.google.com/github/danielmlow/rallypoint_suicide_detection/blob/main/multimodal_suicide_detector_minimal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model to distinguish suicidal from non-suicidal posts on RallyPoint


### Important: activate GPU in Runtime/Change runtime type, install packages, restart runtime.

### Authors
- Before 2020: Leo (Amazon) and Erik Kastman (RallyPoint and Harvard) built and deployed the first models.
- 2020: Noah redid the models, extended them to include metadata (multimodal), and used a larger dataset.
- October 2021: Richard Kuzma reproduced the results and fixed the dataset by removing duplicate posts.
- May 2022: Daniel Low made this minimal script for deployment.








# Setting up Python 3.7 (if running on Colab)

In [None]:
## downgrade to python 3.7
# !sudo apt-get install python3.7
# !sudo apt-get update -y
# !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.7 1
# !sudo update-alternatives --config python3

In [None]:
## confirm python version
# !python --version

In [None]:
## reinstall pip
# !sudo apt-get install --reinstall python3.7-distutils
# !sudo apt install python3-pip
# !python -m pip install --upgrade --force-reinstall pip

In [None]:
!pip install -q redis==3.5.0 transformers==4.26.1 multimodal-transformers==0.2a0

In [None]:
import os
import datetime
import pandas as pd
import numpy as np
import json
import transformers
from multimodal_transformers.data import load_data #was: from multimodal_toolkit.multimodal_transformers.data import load_data
from multimodal_transformers.model import AutoModelWithTabular #was from multimodal_toolkit.multimodal_transformers.model import AutoModelWithTabular
from transformers import AutoConfig, AutoTokenizer, Trainer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, recall_score, f1_score, precision_score
from scipy.special import softmax
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

In [None]:
import warnings

# Suppress every warning for the rest of the session; the original author
# added this because of warnings related to sklearn metrics.
warnings.simplefilter("ignore")

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The input/output
# paths configured in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# --- Run configuration ------------------------------------------------------
# Root folders on the mounted Google Drive.
input_dir = '/content/drive/MyDrive/datum/rallypoint_suicide_detection/data/input/'
output_dir = '/content/drive/MyDrive/datum/rallypoint_suicide_detection/data/output/'

# Dataset folder; comes from: "RallyPoint Milestone 6 Code/Model notebook/multimodal_toolkit/datasets/rp/"
data_path = f'{input_dir}final_datasets/'
# Model checkpoint folder; comes from "RallyPoint Milestone 6 Code/Model notebook/Multi-modal Toolkit/logs/"
path_to_checkpoint = f'{input_dir}performance/roberta_pretrained_meta_text_num(final)/'

# File names looked up inside data_path.
config_name = "rp_config(meta).json"
test_dataset_name = "test.csv"

# Flag for saving results and plots.
save_outputs = False


# 1. Load Model

In [None]:
# this took me 30 sec on Google Colab GPU
# Tokenizer, model config and model weights are all read from the same
# checkpoint directory (path_to_checkpoint).
tokenizer = AutoTokenizer.from_pretrained(path_to_checkpoint)
config = AutoConfig.from_pretrained(path_to_checkpoint)
model = AutoModelWithTabular.from_pretrained(path_to_checkpoint, config=config)

# Wrap the model in a Trainer with default arguments; it is used later in the
# notebook through trainer.predict().
trainer = Trainer(model=model)


# 2. Load test data

In [None]:
# Number of rows to score for a quick check; set to None (or 0) to score the
# whole test set.
load_subset = 10
# Fixed seed so the sampled subset is the same on every run.
subset_seed = 42

test = pd.read_csv(data_path + test_dataset_name)
# test.label = test.label.astype('int8')
if load_subset:
    # Cap at the number of available rows: DataFrame.sample raises a
    # ValueError when asked for more rows than exist (without replacement).
    test = test.sample(n=min(load_subset, len(test)), random_state=subset_seed)
test.head()

#### 2.1. This is the data that is needed


In [None]:
# Parse the JSON data configuration stored alongside the datasets; it holds
# the column names that are passed to load_data.
with open(data_path + config_name) as handle:
    data_config = dict(json.load(handle))

# Display the parsed configuration.
data_config

These are the variables being used:
- 'label': 0 or 1 (integer; which is SITB_ABSENT or SITB_PRESENT)
- 'reputation' (float between 0 and 1)
- 'contact_size' (float between 0 and 1)
- 'type_tag_content': three string variables concatenated. So we need to import these three variables and concatenate them.
    - 'type': categorical, either: 0 (comment), 0.5 (question) or 1 (StatusUpdate)
    - 'tag': list of strings, e.g., ['health', 'veterans', 'affairs', 'benefits' , 'military', 'family' , 'veterans', 'health', 'administration']
    - 'content': string document
    - Here's an example : 'type| comment tag| health veterans affairs benefits military family veterans health administration body| That is part of why I complain so loudly. I have been trying desperately to get mental and dental health for YEARS. My teeth are so bad now, that its a miracle I have not died from infections.' 

In [None]:
# Build the dataset object consumed by trainer.predict(). Column names come
# from the JSON data config; the values in the inline comments are the ones
# noted by the original author.
torch_dataset = load_data(
    data_df=test,
    # text inputs
    tokenizer=tokenizer,
    sep_text_token_str=tokenizer.sep_token,    # </s>
    text_cols=data_config['text_cols'],        # ['type_tag_content']
    # tabular inputs
    numerical_cols=data_config['num_cols'],    # ['reputation', 'contact_size']
    categorical_cols=data_config['cat_cols'],  # ['type']
    categorical_encode_type=None,
    # labels
    label_col=data_config['label_col'],        # column of 0s and 1s in the DataFrame
    label_list=data_config['label_list'],      # ['SITB_ABSENT', 'SITB_PRESENT']: names for 0 and 1, not a column
)

# 3. Predict

In [None]:
# This took 50 sec for 1712 predictions with Google Colab GPU (2022) or 2 sec for 100 predictions.
prediction_object = trainer.predict(test_dataset=torch_dataset)
predictions = prediction_object.predictions
# When the model returns extra outputs next to the logits, Trainer.predict
# hands back a tuple; the class logits are then its first element.
# NOTE(review): confirm against the pinned multimodal-transformers version.
if isinstance(predictions, (tuple, list)):
    predictions = predictions[0]

# Per-class probabilities, one row per post (column 0 = SITB absent,
# column 1 = SITB present, matching the column names assigned below).
predictions_softmax = softmax(predictions, axis=1)
assert len(predictions_softmax) == len(test), "one prediction row is expected per input row"

# Predicted class = most probable column, i.e. a 0.5 threshold for two classes
# (0.51 wins over 0.49).
y_pred = predictions_softmax.argmax(axis=1)
# Probability of the predicted class, rounded to 3 decimals for display.
prediction_labels_softmax_score = np.round(predictions_softmax.max(axis=1), 3)

# Attach predictions, probabilities and raw logits to the input rows.
test['y_pred'] = y_pred
test['y_pred_softmax'] = prediction_labels_softmax_score
test[['y_pred_softmax_sitb-', 'y_pred_softmax_sitb+']] = predictions_softmax
test[['y_pred_logit_sitb-', 'y_pred_logit_sitb+']] = predictions

# Display minimal info: ID, metadata, post, probability of predicted class, y_pred
test[['id', 'reputation', 'contact_size', 'type_tag_content', 'y_pred_softmax', 'y_pred']]

In [None]:
# UTC timestamp that gives each export its own file name.
# datetime.utcnow() is deprecated since Python 3.12; a timezone-aware "now"
# in UTC produces the same formatted string.
ts = datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%dT%H-%M-%S')

# Create the output folder if it is missing so to_csv does not fail.
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the save_outputs flag is not consulted here, so predictions
# are always written -- confirm that this is intended.
test.to_csv(output_dir+f'predictions_{ts}.csv')