# 1. Split Dr-Edited Question Text into its Constituent Parts

These are defined as:
1. Empathetic Response
2. Question
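3. Extra info (not extracted into its own field below: `parse_question_text` returns only the empathy and question parts)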

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, RegexpTokenizer
import nltk
import json
from IPython.display import display
import re
from fuzzywuzzy import fuzz

# Raise pandas' per-column display width to 500 characters so long question
# texts are not cut short when DataFrames are rendered.
pd.set_option('display.max_colwidth', 500)

# Load the edits dataset. Later cells read its "question_text" and
# "default_question_text" columns.
edits_df = pd.read_csv("output/edits.csv")
# Bare expression: shown as the notebook cell's output.
edits_df

## Helper Functions

In [None]:
# Characters on which the edited text is cut into fragments. Both sets are
# combined into a single character class by create_question_splits.
empathy_splitters = "-,.;:!"
question_splitters = "?"

def create_question_splits(line):
    """
    Split ``line`` on the splitter characters and return the cleaned fragments.

    Every fragment is stripped of surrounding whitespace. Fragments of one
    character or fewer are discarded, which removes empty strings as well as
    any single stray character left between two splitters.

    Args:
        line: The text to split (a string).

    Returns:
        A list of the remaining fragments, in their original order. The
        splitter characters themselves are not included.
    """
    # Escape the splitters so each one is matched literally inside the
    # character class, regardless of which characters the constants hold
    # (an unescaped "]", "^" or mid-string "-" would otherwise change the
    # meaning of the pattern).
    delimiters = re.escape(empathy_splitters + question_splitters)
    fragments = (piece.strip() for piece in re.split(f"[{delimiters}]", line))
    return [piece for piece in fragments if len(piece) > 1]

def split_into_columns(x):
    """Unpack a parsed-question mapping into an ``(empathy, question)`` tuple."""
    empathy_part = x["empathy"]
    question_part = x["question"]
    return empathy_part, question_part

def select_question(candidates, orig_question):
    """Return the index of the candidate most similar to ``orig_question``.

    Similarity is measured with ``fuzz.ratio``. When several candidates share
    the top score, the lowest index is returned (``np.argmax`` reports the
    first maximum).

    Args:
        candidates: Sequence of text fragments to compare.
        orig_question: The reference question text.

    Returns:
        The index of the best-scoring candidate, or ``0`` when ``candidates``
        is empty.
    """
    # np.argmax raises ValueError on an empty sequence; with nothing to pick
    # from, index 0 keeps downstream slicing (`[:0]` / `[0:]`) well defined.
    if len(candidates) == 0:
        return 0
    scores = [fuzz.ratio(c, orig_question) for c in candidates]
    return np.argmax(scores)

def get_scores(row):
    """Score every fragment of the edited question against the default question.

    Reads ``row["question_text"]``, splits it with ``create_question_splits``
    and returns one ``fuzz.ratio`` score per fragment, in fragment order.
    """
    fragments = create_question_splits(row["question_text"])

    scores = []
    for fragment in fragments:
        scores.append(fuzz.ratio(fragment, row["default_question_text"]))
    return scores

def parse_question_text(row):
    """Separate an edited question into its empathy and question parts.

    The edited text (``row["question_text"]``) is split into fragments, and
    the fragment closest to ``row["default_question_text"]`` marks where the
    question begins. Fragments before that point are joined with ". " to form
    the empathy part; that fragment and everything after it are joined with
    ", " to form the question part.

    Returns:
        A dict with keys ``"empathy"`` (``None`` when no fragment precedes the
        question) and ``"question"``.
    """
    edited_text = row["question_text"]
    default_text = row["default_question_text"]

    fragments = create_question_splits(edited_text)
    boundary = select_question(fragments, default_text)

    leading = fragments[:boundary]
    trailing = fragments[boundary:]

    empathy = ". ".join(leading)
    question = ", ".join(trailing)

    # An empty join means nothing came before the question.
    if not empathy:
        empathy = None

    return {"empathy": empathy, "question": question}

## Apply Empathy Extraction

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# the column assignments below are also visible through edits_df.
test_df = edits_df
# Debugging aid: uncomment the next line to run the pipeline on one example.
# test_df = edits_df[edits_df["question_text"] == "Where is your belly pain? Do you have pain in your lower abdomen (lower belly)?"].head(1)

# Per row: parse the edited text into an {"empathy", "question"} dict, and
# record the fuzzy-match score of each fragment against the default question.
test_df["question_parts"] = test_df.apply(parse_question_text, axis=1)
test_df["scores"] = test_df.apply(get_scores, axis=1)
# Turn the per-row (empathy, question) tuples into two separate columns.
test_df["empathy"], test_df["question"] = zip(*test_df["question_parts"].map(split_into_columns))
# drop() returns a new DataFrame; the intermediate dict column is no longer needed.
test_df = test_df.drop("question_parts", axis=1)

# Series.count() skips missing values, so this is the share of rows for which
# an empathy part was extracted.
print("Proportion empathetic responses:", test_df["empathy"].count() / len(test_df))
test_df

## Apply Empathy Response labels to Edits Dataset

In [None]:
# Each empathy response has been manually labelled.
# apply_label looks utterances up in the "Empathy Utterance" column of this
# file and returns the matching "Class" value.
empathy_labels = pd.read_csv("empathy_counts_labelled.csv")

# Map the extracted empathy back to the class it's been labelled as.
def apply_label(empathy):
    """Return the manually assigned class for an extracted empathy utterance.

    Args:
        empathy: The extracted empathy text, or a missing value (``None`` or
            NaN) when the row has no empathy part.

    Returns:
        ``0`` for a missing value; ``-1`` (after printing a notice) when the
        utterance has no exact match in ``empathy_labels``; otherwise the
        ``Class`` value of the first matching row.
    """
    # A DataFrame column can surface a missing entry as NaN rather than None;
    # pd.isna recognises both, so neither is sent to the label lookup.
    if empathy is None or pd.isna(empathy):
        return 0

    is_match = empathy_labels["Empathy Utterance"] == empathy
    matching_classes = empathy_labels[is_match]["Class"]
    if matching_classes.empty:
        print("Can't find label for empathy:", empathy)
        return -1

    return matching_classes.values[0]

# Label each row's extracted empathy text and store the result as an int.
# apply_label gives 0 when there is no empathy part and -1 when the text has
# no entry in the labelled file.
test_df["empathy_label"] = test_df["empathy"].map(apply_label).astype(int)
test_df