# Sentence sentiment

This is run after 02_notes-to-sentences.ipynb.



## Setup

In [None]:
import os, re, time, sys
print("Python executable:", sys.executable)

# Ensure numpy is single-core, so that parallel processing does not conflict for cpu.
# This is needed because otherwise stanza will use multiple cores.
# The same cap is applied to each threading backend variable listed below, and it is
# set here, ahead of the pandas/numpy imports further down.
# Environment values must be strings, hence "1" rather than 1.
max_threads = "1"
os.environ["OMP_NUM_THREADS"] = max_threads 
os.environ["OPENBLAS_NUM_THREADS"] = max_threads
os.environ["MKL_NUM_THREADS"] = max_threads
os.environ["VECLIB_MAXIMUM_THREADS"] = max_threads 
os.environ["NUMEXPR_NUM_THREADS"] = max_threads
#import mkl
#mkl.set_num_threads(int(max_threads))

import pyprojroot
import pandas as pd, numpy as np
#print("Numpy BLAS:", np.__config__.show())
import itertools
from matplotlib import pyplot as plt
from collections import Counter

import medspacy, spacy
from medspacy.ner import TargetRule 
from medspacy.visualization import visualize_ent

# Parallel processing.
import psutil 
# logical = False requests the physical core count rather than hardware threads.
# NOTE(review): psutil documents that cpu_count() can return None when the count
# cannot be determined — confirm num_cores is an int before it is passed to p_map.
num_cores = psutil.cpu_count(logical = False)
print("CPU cores found:", num_cores)
from p_tqdm import p_map

# Project-specific modules.
from keywords import find_keywords2
#from clinical_sectionizer import TextSectionizer, Sectionizer

## Import data

In [None]:
# Created in 02_notes-to-sentences.ipynb
sent_df = pd.read_feather("data/mimic-sentences-pysbd.feather")

# 29 MM sentences with note category exclusions and age >= 18.
print(sent_df.info())

In [None]:
399285 in sent_df['row_id'].values

## Keyword tagging

In [None]:
# TODO: move into a function.
# Load keywords (phrases): the first column of each CSV holds one phrase per row.
neg_keywords = pd.read_csv('data-raw/negative_keywords.csv').iloc[:, 0]
pos_keywords = pd.read_csv('data-raw/positive_keywords.csv').iloc[:, 0]

# Two whitespace-stripped lists: negative phrases first, then positive phrases.
all_keywords = [
    [phrase.strip() for phrase in neg_keywords],
    [phrase.strip() for phrase in pos_keywords],
]

# Lower-case everything and deduplicate while keeping first-seen order
# ("concerning" appears in both files). dict.fromkeys() preserves insertion order.
keywords = list(dict.fromkeys(
    phrase.lower() for group in all_keywords for phrase in group
))

# Per-polarity lists, normalised the same way but not deduplicated.
negative_keywords = [phrase.strip().lower() for phrase in neg_keywords]
positive_keywords = [phrase.strip().lower() for phrase in pos_keywords]

In [None]:

from keywords import find_keywords2

def find_keywords_df(df, keywords):
    """Tag every row of *df* with the keywords found in its ``text`` column.

    Args:
        df: DataFrame with a ``text`` column; one row per sentence.
        keywords: Keyword/phrase collection passed through to ``find_keywords2``.

    Returns:
        A single-column DataFrame (column ``0``) sharing *df*'s index. Each cell
        is a list of the keywords found in that row's text, with a keyword
        repeated once per location reported by ``find_keywords2``.
    """
    per_row = []
    for record in df.itertuples():
        # find_keywords2 returns a mapping of keyword -> locations.
        hits = find_keywords2(record.text, keywords = keywords)

        # Only the keywords are kept; each one is repeated once per location
        # so the list length equals the total number of matches.
        found = []
        for phrase, positions in hits.items():
            found.extend([phrase] * len(positions))
        per_row.append(found)

    tagged = pd.DataFrame(pd.Series(per_row))
    # Re-use the caller's index so the chunks can be concatenated back in order.
    tagged.index = df.index
    return tagged

find_keywords_df(sent_df[:30], keywords)

In [None]:
%%time
# Takes 55 mins with 24 cores on Benten.
# Takes 40 mins on ssbvape.

from functools import partial

num_partitions = 2000
#num_partitions = 5000
#num_partitions = 10000

# Cut sent_df into num_partitions contiguous row ranges of nearly equal size.
# The sizes match what np.array_split would produce (the first
# len(sent_df) % num_partitions chunks get one extra row), but the frame is
# sliced with .iloc directly: np.array_split on a DataFrame goes through
# DataFrame.swapaxes, which pandas has deprecated.
chunk_sizes = np.full(num_partitions, len(sent_df) // num_partitions)
chunk_sizes[:len(sent_df) % num_partitions] += 1
stops = np.cumsum(chunk_sizes).tolist()
starts = [0] + stops[:-1]

# This will be a list of dfs, each keeping its slice of sent_df's index.
df_split = [sent_df.iloc[start:stop] for start, stop in zip(starts, stops)]

# Apply our function to each dataframe chunk in the list.
# Use partial() to specify the keywords argument.
result = p_map(partial(find_keywords_df, keywords = keywords), df_split, num_cpus = num_cores)

# Stitch the per-chunk results back together into one frame.
combined_kw = pd.concat(result)

In [None]:
combined_kw.info()

In [None]:
combined_kw.head()

In [None]:
sent_df['text'].values[0]

In [None]:
# Spot-check 20 rows; the fixed random_state prints the same rows on every run.
print(combined_kw.sample(20, random_state = 1))

# Should be the same size as sent_df (30.2 MM)
print("Combined kw length:", len(combined_kw))
# `.values` drops the index, so this assignment is positional: it relies on
# combined_kw having the same length and row order as sent_df.
sent_df['keywords'] = combined_kw[0].values

In [None]:
# TODO: calculate this before saving the above feather file.
sent_df['keyword_count'] = sent_df['keywords'].str.len()
sent_df.info()

In [None]:
399285 in sent_df['row_id'].values

In [None]:
%%time

sent_df.reset_index(drop = True).to_feather("data/mimic-sentences-pysbd-kw.feather")

In [None]:
# Confirm that it works (or load from scratch if skipping the cells above).
sent_df = pd.read_feather("data/mimic-sentences-pysbd-kw.feather")

In [None]:
print(sent_df.info())
sent_df.head()

print(sent_df[130:150])

In [None]:
399285 in sent_df['row_id'].values


## Score: Stanza, Pattern

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# The function needs to be defined in a file so that it is correctly exported to
# the p_map parallel workers; otherwise it will give an error.
import sentiment # This refers to our sentiment.py module (file), not a pip package.
from pattern import en

# Calculate sentiment scores for multiple rows in a dataframe and return
# a dataframe of results with the same number of rows and index.
def score_sentences_df(df):
    """Score each row's ``text`` with both sentiment scorers.

    Args:
        df: DataFrame with a ``text`` column; one row per sentence.

    Returns:
        A DataFrame with the same index as *df* and two columns:
        ``sent_stanza`` (from ``sentiment.sentiment_stanza``) and
        ``sent_pattern`` (first element of ``pattern.en.sentiment``'s result).
    """
    scored_rows = [
        {
            # Stanza is evaluated first, then pattern, for every row.
            'sent_stanza': sentiment.sentiment_stanza(record.text),
            # en.sentiment() returns a sequence; only its first element is kept.
            'sent_pattern': en.sentiment(record.text)[0],
        }
        for record in df.itertuples()
    ]

    scores = pd.DataFrame(scored_rows)
    # Carry over the caller's index so results can be joined back onto it.
    scores.index = df.index
    return scores

In [None]:
print("Sentiment_stanza() test:",
      sentiment.sentiment_stanza(sent_df.sample(1, random_state = 2).text.values[0]))

print("Pattern test:",
      en.sentiment(sent_df.sample(1, random_state = 2).text.values[0])[0])

print("\nScore_sentences_df() test:\n",
    score_sentences_df(sent_df.sample(20, random_state = 2)))

In [None]:
%%time
# Takes 6 hours 17 mins with 24 cores - make sure to run single-threaded.

num_partitions = 10000

# Cut sent_df into num_partitions contiguous row ranges of nearly equal size.
# The sizes match what np.array_split would produce (the first
# len(sent_df) % num_partitions chunks get one extra row), but the frame is
# sliced with .iloc directly: np.array_split on a DataFrame goes through
# DataFrame.swapaxes, which pandas has deprecated.
chunk_sizes = np.full(num_partitions, len(sent_df) // num_partitions)
chunk_sizes[:len(sent_df) % num_partitions] += 1
stops = np.cumsum(chunk_sizes).tolist()
starts = [0] + stops[:-1]

# This will be a list of dfs, each keeping its slice of sent_df's index.
df_split = [sent_df.iloc[start:stop] for start, stop in zip(starts, stops)]

# Score every chunk in a separate worker process.
result = p_map(score_sentences_df, df_split, num_cpus = num_cores)

# Stitch the per-chunk scores back together; the index still matches sent_df.
combined_df = pd.concat(result)

### Save result

In [None]:
print(combined_df.shape)
print(combined_df.head(20))
print(combined_df.corr())

combined_df.describe()

In [None]:
sent_df = sent_df.join(combined_df)
sent_df.reset_index(drop = True).to_feather("data/mimic-sentences-sentiment-prelim.feather")
sent_df.head()

In [None]:
399285 in sent_df['row_id'].values

## Score: DeBERTA-v3

### Setup

In [None]:
import warnings
import torch
#from tqdm import tqdm
from tqdm.auto import tqdm

# At least 1 gpu is needed for this to run reasonably quickly.
print("PyTorch version:", torch.__version__)
print("GPU available:", torch.cuda.is_available())
print("GPU device count:", torch.cuda.device_count())
print("CUDA version:", torch.version.cuda)
torch.cuda.empty_cache()

In [None]:
#del model, tokenizer, pipe
import gc
#del raw_dataset_sample
#del sent_df
gc.collect()

In [None]:
import transformers
print("Transformers version:", transformers.__version__)
import datasets
print("Datasets version:", datasets.__version__)

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, pipeline, TextClassificationPipeline
from datasets import Dataset, load_from_disk
from transformers.pipelines.pt_utils import KeyDataset

# The checkpoint directory was created in deep-learning-model.ipynb.
model_path = str(pyprojroot.here() / "models/deberta-v3")
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU or CPU device:", device)

# num_labels appears to mean num_classes.
# The model is moved onto `device` right after loading so inference runs there.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    problem_type = "single_label_classification",
    num_labels = 5,
).to(device)

In [None]:
# Copied from deep-learning-model.ipynb
class_names = ['Very Negative', 'Negative', 'Neutral', 'Positive', 'Very Positive']

warnings.filterwarnings("ignore", category = DeprecationWarning)

# Text should be a single string, not a vector currently.
def predict_sentiment(text, return_class = True, max_length = 512):
    """Predict the sentiment class of one piece of text with the loaded model.

    Args:
        text: A single string. The argmax below is taken over the whole
            probability tensor, so a batch of strings would not give one
            prediction per string.
        return_class: When True, return the name from ``class_names``;
            when False, return the predicted class index as a tensor.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        The class name (str) or the class index (0-dim tensor), depending on
        ``return_class``.
    """
    # Tokenize, then place the tensors on the same device as the model instead
    # of assuming CUDA, so this also works when the model was loaded on CPU.
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length,
                       return_tensors="pt").to(model.device)

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**inputs)

    # outputs[0] holds the logits; softmax over dim 1 turns them into probabilities.
    probs = outputs[0].softmax(1)
    predicted = probs.argmax()

    if return_class:
        return class_names[predicted]
    return predicted

# Expected results: Neutral, Negative, Negative, Positive
print(predict_sentiment("This is a test sentence."))
print(predict_sentiment("I'm worried that the patient is doing poorly."))
print(predict_sentiment("I'm extremely worried that the patient is doing terribly and will certainly die soon."))
print(predict_sentiment("Patient's bp is normalizing, and kidney function appears to be improving."))

### Load sentences, convert to dataset

In [None]:
%%time

# Takes 18 seconds to load.
# CK: this was the wrong feather file to load it seems.
#sent_df = pd.read_feather("data/mimic-sentences-sentiment.feather")
sent_df = pd.read_feather("data/mimic-sentences-sentiment-prelim.feather")

# Only the text column goes into the Dataset: ['row_id', 'sent_num'] are excluded
# to avoid errors later on, which is unfortunate.
raw_dataset = Dataset.from_pandas(sent_df[['text']])
print("Raw dataset:\n", raw_dataset)

# A 1,024-sentence random subset. No seed is set, so the rows differ between runs.
# from_pandas is a classmethod, so it is called on Dataset rather than through
# the raw_dataset instance.
raw_dataset_sample = Dataset.from_pandas(sent_df[['text']].sample(1024).reset_index(drop = True))
print("Raw dataset sample:\n", raw_dataset_sample)

In [None]:
sent_df.info()

In [None]:
399285 in sent_df['row_id'].values
del sent_df

### Dataset tokenization (skip)

In [None]:
%%time
# Takes ~37 minutes with a single core or 8 minutes with 6 cores.

# Copied from deep-learning-model.ipynb and modified.
def tokenize(batch):
    """Tokenize the ``text`` entries of a batch with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``text`` key, as supplied by ``Dataset.map``
            when ``batched = True``.

    Returns:
        The tokenizer output for the batch, truncated and padded with a
        maximum length of 256 tokens.
    """
    return tokenizer(batch['text'], truncation = True, padding = True, max_length = 256)

# Don't use num_cores - each process takes up too much RAM apparently.
# We lose the progress bar when parallelized though :/
tokenized_datasets = raw_dataset.map(tokenize, batched = True, num_proc = 6,
            # Remove any extra columns to avoid a warning when training, not essential though.
                                 remove_columns = raw_dataset.column_names)

tokenized_datasets.set_format('torch')
print(tokenized_datasets)
print(tokenized_datasets.features)

# This will create a subfolder rather than a single file.
tokenized_datasets.save_to_disk("data/mimic-sentences-tokenized")

#### Load tokenized dataset, create dataloaders (skip)

In [None]:
# Load tokenized dataset.
tokenized_datasets = load_from_disk("data/mimic-sentences-tokenized")

tokenized_dataloader = torch.utils.data.DataLoader(tokenized_datasets, batch_size = 8)

# Version of dataloader that does not include tokenization.
# Is there any usage of this though? To be determined.
raw_dataloader = torch.utils.data.DataLoader(raw_dataset, batch_size = 8)

### Inference

#### Pipeline + Keydataset v2 (works?)

In [None]:
%%time
# Takes approx 24 hours.

# device = 0 puts the pipeline on GPU, otherwise it will only use CPU.
# NOTE(review): the GPU index is hard-coded here, unlike the CPU fallback chosen
# when the model was loaded — confirm this cell is only run on a GPU machine.
pipe = pipeline("text-classification", model = model, tokenizer = tokenizer, device = 0, max_length = 256,
               truncation = True)

# Hide the large number of deprecation warnings.
warnings.filterwarnings("ignore", category = DeprecationWarning)

# Collects one label per item yielded by the pipeline (presumably in raw_dataset order).
preds = []
# Progress bar is inaccurate for some reason :(
# Batch size of 512 is close to the max GPU RAM usage - led to an OOM error eventually.
# GPU RAM usage continues to grow through inference :( Something is not being deleted correctly.
# The commented-out loop headers below are earlier variants (smaller batch size, or
# running on raw_dataset_sample instead of the full dataset).
#for i, outputs in enumerate(tqdm(pipe(KeyDataset(raw_dataset, "text"), batch_size = 32),
#    total = len(raw_dataset))):
for outputs in tqdm(pipe(KeyDataset(raw_dataset, "text"), batch_size = 128),
    total = len(raw_dataset)):
#for i, outputs in enumerate(tqdm(pipe(KeyDataset(raw_dataset_sample, "text"), batch_size = 128),
#    total = len(raw_dataset_sample))):
#for i, outputs in enumerate(pipe(KeyDataset(raw_dataset_sample, "text"), batch_size = 128)):
    #print(i)
#    preds.append(outputs['score'])
    # Only the predicted label is kept; the score is discarded.
    preds.append(outputs['label'])




In [None]:
preds_df = pd.DataFrame(preds)
preds_df.rename(columns = {0: 'label'}, inplace = True)
preds_df.info()

In [None]:
# Look at a few random predictions and the label frequencies before recoding.
print(preds_df.sample(10))
# 1 = Negative, 2 = Neutral, 3 = Positive
print(Counter(preds_df['label']))

# Convert to a categorical so the labels can be renamed and later turned into codes.
preds_df['label'] = preds_df['label'].astype("category")
print(preds_df.label.cat.categories)
# NOTE(review): this rename is positional and supplies exactly three names, so it
# assumes exactly three distinct labels were predicted. The model was loaded with
# num_labels = 5; if a fourth or fifth label appears, the call should raise —
# check the categories printed just above before relying on this.
preds_df['label'] = preds_df.label.cat.rename_categories(['negative', 'neutral', 'positive'])
print(preds_df.label.cat.categories)

# Frequencies again, now under the renamed categories.
print(Counter(preds_df['label']))

In [None]:
# TODO: save preds, convert LABEL_2, LABEL_3, etc. to the numeric codes.
# Save note-level sentence average.
preds_df.to_feather('data/debertav3-sent.feather')

In [None]:
preds_df.describe()

In [None]:
# sent_df was removed with `del sent_df` right after raw_dataset was built, so it
# no longer exists at this point. Reload the same feather file that raw_dataset was
# created from so the rows line up with preds_df.
sent_df = pd.read_feather("data/mimic-sentences-sentiment-prelim.feather")

# Store the integer category codes. Assigning the bare array is positional, so a
# length mismatch raises instead of silently index-aligning and filling NaN.
sent_df['sent_deberta'] = preds_df.label.cat.codes.to_numpy()

In [None]:
sent_df.describe()

In [None]:
sent_df[['sent_stanza', 'sent_pattern', 'sent_deberta']].corr()

In [None]:
sent_df.reset_index(drop = True).to_feather("data/mimic-sentences-sentiment.feather")

In [None]:
sent_df.info()

In [None]:
sent_df = pd.read_feather("data/mimic-sentences-sentiment.feather")
sent_df.info()

In [None]:
399285 in sent_df['row_id'].values

## Aggregate to note averages

In [None]:
# Collapse the sentence-level rows to one row per note (row_id).
# Each keyword is output_column = (source_column, aggregation).
sent_df2 = sent_df.groupby("row_id").agg(
    # Highest sentence number in the note; 1 is added below to make it a count.
    sentences = ("sent_num", "max"),
    chars = ("chars", "sum"),
    words = ("words", "sum"),
    # Note-level sentiment is the mean of the sentence-level scores.
    sent_stanza = ("sent_stanza", "mean"),
    sent_pattern = ("sent_pattern", "mean"),
    sent_deberta = ("sent_deberta", "mean"),
)

# Turn the maximum sentence number into a sentence count.
sent_df2['sentences'] = sent_df2['sentences'].add(1)

In [None]:
sent_df2.head()

In [None]:
sent_df2[['sent_stanza', 'sent_pattern', 'sent_deberta']].corr()

In [None]:
sent_df2.reset_index().to_feather('data/mimic-notes-sentiment.feather')

In [None]:
#sent_df2 = pd.read_feather('data/mimic-notes-sentiment.feather')

In [None]:
#399285 in sent_df2['row_id'].values