<a href="https://colab.research.google.com/github/fastdatascience/clinical_trial_risk/blob/fixes_nov_2022/train/ctgov/TrainPhaseAndArmAndNumSubjectsClassifier_04_Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import os
import sys
import pickle as pkl
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import json

# df_annotations = pd.read_csv("all_annotations.csv")


In [None]:
import time
# Block the kernel for 25 minutes (60 s x 25) before any later cell can run.
# NOTE(review): nothing visible in this notebook explains why the delay is needed -
# confirm it is still required, because it stalls every "Run All".
time.sleep(60*25)

# Get data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
df_annotations= pd.read_csv("/content/drive/MyDrive/data/filtered_for_phase_arms_subjects_02.csv.bz2")

In [None]:
#df_annotations= pd.read_csv("filtered_for_phase_arms_subjects_02.csv.bz2")
# df_annotations= pd.read_csv("/home/thomas/Downloads/filtered_for_phase_arms_subjects.csv.bz2")

In [None]:
#df_annotations.text = df_annotations.text.apply(lambda t : t[:10000] if len(t) > 10000 else t)

In [None]:
def get_num_subjects_clean(num):
    """Bucket a raw subject count into a coarse size-band label.

    Parameters
    ----------
    num : number or missing value
        Raw number of subjects. Anything `pd.isna` treats as missing is accepted.

    Returns
    -------
    str or None
        ``None`` when `num` is missing. Otherwise the label of the highest band
        whose lower bound `num` reaches ("10000+", "1000+", "500+", "200+",
        "100+", "50+", "25+"), or "1-24" when `num` is below 25.
    """
    if pd.isna(num):
        return None

    # Bands are listed from largest to smallest lower bound, so the first match
    # is the highest band the count reaches.
    bands = (
        (10000, "10000+"),
        (1000, "1000+"),
        (500, "500+"),
        (200, "200+"),
        (100, "100+"),
        (50, "50+"),
        (25, "25+"),
    )
    for lower_bound, label in bands:
        if num >= lower_bound:
            return label
    return "1-24"
df_annotations["num_subjects_clean"] = df_annotations["num_subjects"].apply(get_num_subjects_clean)

In [None]:
df_annotations["num_subjects_clean"].value_counts()

In [None]:
# df_annotations = pd.read_csv("filtered_for_phase.csv.bz2")

In [None]:
# del file_to_pages

In [None]:
phase_map = {"Phase 2":"2",
"Phase 3":"3",
"Phase 4":"4",
"Phase 1":"1",
"Phase 1/Phase 2":"1.5",
"Not Applicable":"0",
"Phase 2/Phase 3":"2.5",
"Early Phase 1":"0.5"}
df_annotations["phase_clean"] = df_annotations["phase"].apply(lambda x : phase_map.get(x, None))

In [None]:
# Order the distinct phase codes by numeric value and give each a consecutive class index.
ordered_phase_codes = sorted(set(phase_map.values()), key=float)
phase_clean_map = {code: position for position, code in enumerate(ordered_phase_codes)}

# Reverse mapping: class index -> phase code.
phase_lookup = {position: code for code, position in phase_clean_map.items()}

# Phase codes listed in class-index order; displayed as the cell output.
phase_list = [phase_lookup[position] for position in sorted(phase_lookup)]
phase_list

In [None]:
def get_num_arms_clean(num):
    """Cap an arm count at 5 so every count of five or more shares one value.

    Parameters
    ----------
    num : number or missing value
        Raw number of arms. Anything `pd.isna` treats as missing is accepted.

    Returns
    -------
    number or None
        ``None`` when `num` is missing, the integer 5 when `num` is 5 or more,
        and `num` unchanged otherwise.
    """
    if pd.isna(num):
        return None
    return 5 if num >= 5 else num
df_annotations["num_arms_clean"] = df_annotations["num_arms"].apply(get_num_arms_clean)

# Encode labels as one-hot vectors

In [None]:
num_subjects_clean_map = {}
# Map each size-band label present in the data (missing values excluded) to a
# class index, ordered by the band's lower bound.
# The sort key deletes everything from the first non-digit character onwards,
# so "1-24" sorts as 1 and "25+" sorts as 25.
for idx, val in enumerate(sorted(set(df_annotations[~df_annotations.num_subjects_clean.isna()].num_subjects_clean), key = lambda x : int(re.sub(r'\D.*$', '', x)))):
  num_subjects_clean_map[val] = idx

In [None]:
# invert the dictionary
num_subjects_lookup = {v: k for k, v in num_subjects_clean_map.items()}

In [None]:
num_subjects_list = [num_subjects_lookup[x] for x in sorted(num_subjects_lookup)]
num_subjects_list

In [None]:
num_subjects_clean_map

In [None]:
def get_one_hot_num_subjects(x):
  """One-hot encode a subject-count band label.

  Parameters
  ----------
  x : str or missing value
      A key of `num_subjects_clean_map`, or None/NaN when the count is unknown.

  Returns
  -------
  list of int
      One slot per entry of `num_subjects_clean_map`; the slot for `x` is 1 and
      the rest are 0. A missing `x` yields all zeros.

  Raises
  ------
  KeyError
      If `x` is present but is not a key of `num_subjects_clean_map`.
  """
  one_hot = [0] * len(num_subjects_clean_map)
  # pd.isna covers None as well as NaN, matching the check in get_one_hot_num_arms.
  # A bare `is None` test lets NaN through to the dict lookup, which raises KeyError.
  if pd.isna(x):
    return one_hot
  one_hot[num_subjects_clean_map[x]] = 1
  return one_hot

In [None]:
df_annotations["num_subjects_one_hot"] = df_annotations["num_subjects_clean"].apply(get_one_hot_num_subjects)

In [None]:
# NOTE(review): this repeats the assignment made in the preceding cell and
# recomputes the same column from the same input - confirm whether it can be dropped.
df_annotations["num_subjects_one_hot"] = df_annotations["num_subjects_clean"].apply(get_one_hot_num_subjects)

In [None]:
def get_one_hot_num_arms(x):
  """One-hot encode an arm count into 5 slots, where slot i stands for i + 1 arms.

  Parameters
  ----------
  x : number or missing value
      Arm count, expected to be already capped at 5 (see get_num_arms_clean).

  Returns
  -------
  list of int
      Five integers with a single 1 at index ``int(x - 1)``. All zeros when `x`
      is None/NaN or when the computed index would be negative (for example an
      arm count of 0).

  Raises
  ------
  IndexError
      If `x` is greater than 5, i.e. the value was not capped beforehand.
  """
  one_hot = [0] * 5
  if x is None or pd.isna(x):
    return one_hot
  slot = int(x - 1)
  # A negative index would wrap around: an arm count of 0 gives -1 and would
  # silently mark the last ("5 or more arms") slot. Treat it as unlabelled.
  if slot >= 0:
    one_hot[slot] = 1
  return one_hot
df_annotations["num_arms_one_hot"] = df_annotations["num_arms_clean"].apply(get_one_hot_num_arms)

In [None]:
def get_one_hot_phase(x):
  """One-hot encode a cleaned phase code.

  Parameters
  ----------
  x : str or missing value
      A key of `phase_clean_map`, or None/NaN when the phase is unknown.

  Returns
  -------
  list of int
      One slot per entry of `phase_clean_map`; the slot for `x` is 1 and the
      rest are 0. A missing `x` yields all zeros.

  Raises
  ------
  KeyError
      If `x` is present but is not a key of `phase_clean_map`.
  """
  one_hot = [0] * len(phase_clean_map)
  # pd.isna covers None as well as NaN, matching the check in get_one_hot_num_arms.
  # A bare `is None` test lets NaN through to the dict lookup, which raises KeyError.
  if pd.isna(x):
    return one_hot
  one_hot[phase_clean_map[x]] = 1
  return one_hot
df_annotations["phase_one_hot"] = df_annotations["phase_clean"].apply(get_one_hot_phase)

Concatenate the three one-hot vectors (phase, number of arms, number of subjects) and the `has_sap` flag into a single multi-label target column

In [None]:
concatenated_one_hot = []
# Build one flat target vector per row. Layout, in order:
#   phase one-hot | num-arms one-hot | num-subjects one-hot | has_sap value
# The has_sap value is appended as-is (not one-hot encoded) as the final element.
for i in range(len(df_annotations)):
  concatenated = list(df_annotations.phase_one_hot.iloc[i]) + \
  list(df_annotations.num_arms_one_hot.iloc[i]) + \
  list(df_annotations.num_subjects_one_hot.iloc[i]) + [df_annotations.has_sap.iloc[i]]
  concatenated_one_hot.append(concatenated)
df_annotations["concatenated_one_hot"] = concatenated_one_hot

In [None]:
np.sum(np.asarray([np.asarray(x) for x in df_annotations["concatenated_one_hot"]]), axis=1).mean()

In [None]:
np.sum(np.asarray([np.asarray(x) for x in df_annotations["concatenated_one_hot"]]), axis=0)

In [None]:
# Derive the label count from the encoders instead of from `concatenated`, a loop
# variable leaked by the cell that builds the target vectors. That name exists only
# if the loop ran at least once in this kernel session.
# Layout: phase classes + 5 arm classes + subject-band classes + 1 has_sap flag.
num_classes = len(phase_clean_map) + 5 + len(num_subjects_clean_map) + 1
print (f"There are {num_classes} classes in this multi-label classifier")

In [None]:
df_train = df_annotations[df_annotations.train_val == "train"]
df_val = df_annotations[df_annotations.train_val == "val"]

In [None]:
# Training rows that have at least one of the three ground-truth labels
# (subject band, arm count, phase).
# NOTE(review): the cell that builds TRAINING_DATA iterates over df_train, not this
# filtered frame - confirm which one was intended.
df_train_got_some_ground_truths = df_train[~df_train.num_subjects_clean.isna() | ~df_train.num_arms_clean.isna() | ~df_train.phase_clean.isna()]

In [None]:
len(df_train_got_some_ground_truths), len(df_train)

# Begin spaCy

In [None]:
# Build the spaCy training pairs. Each item is [text, cats], where cats maps every
# class index (as a string, "0" .. str(num_classes - 1)) to that row's target value:
#     ["some protocol text", {"0": 0, "1": 1, ...}]

# Longest text kept per document.
# NOTE(review): presumably chosen to match spaCy's default nlp.max_length of
# 1,000,000 characters - confirm.
MAX_TEXT_CHARS = 1000000

TRAINING_DATA = []
for idx in range(len(df_train)):
    # Read the labels from df_train, the same frame the text comes from. Positional
    # row idx of df_annotations is a different document unless the training rows
    # happen to be the leading rows of df_annotations.
    label_vector = df_train["concatenated_one_hot"].iloc[idx]
    cats = {str(a): label_vector[a] for a in range(num_classes)}

    # Coerce to str as the validation cell does, so a non-string value (e.g. NaN)
    # does not break len().
    text = str(df_train.text.iloc[idx])
    if len(text) > MAX_TEXT_CHARS:
        text = text[:MAX_TEXT_CHARS]

    TRAINING_DATA.append([text, cats])

In [None]:
import spacy
# Example pairs a Doc with its gold annotations; the config string is spaCy's
# default model configuration for the textcat_multilabel component.
from spacy.training import Example
from spacy.pipeline.textcat_multilabel import multi_label_default_config
from thinc.api import Config
import random

# Number of full passes over TRAINING_DATA, and examples per update step.
N_ITERATIONS = 100
BATCH_SIZE = 4

# Default multi-label model config (described in the original notebook as a
# text ensemble with attention).
config = Config().from_str(multi_label_default_config)

# Blank English pipeline with a single multi-label text classifier.
nlp = spacy.blank("en")
category = nlp.add_pipe("textcat_multilabel", last=True, config=config)
# One label per position of the concatenated target vector, named by its index.
for a in range(num_classes):
    category.add_label(str(a))

# Initialise the model weights. Language.begin_training is the deprecated name
# for this step; Language.initialize is its spaCy v3 replacement.
nlp.initialize()

for itn in range(N_ITERATIONS):
    # Reshuffle on every pass so the batches differ between passes.
    random.shuffle(TRAINING_DATA)
    losses = {}

    for batch in spacy.util.minibatch(TRAINING_DATA, size=BATCH_SIZE):
        # spaCy v3 trains on Example objects rather than (text, annotation) tuples.
        examples = [
            Example.from_dict(nlp.make_doc(text), {"cats": cats})
            for text, cats in batch
        ]
        nlp.update(examples, losses=losses)
    # Print the loss accumulated over every 20th pass.
    if itn % 20 == 0:
        print(losses)


In [None]:
# Convert the text column to str once, outside the loop. Calling .apply(str) inside
# the loop would rebuild the whole column for every row.
val_texts = df_val.text.apply(str)

predictions = []
for text in val_texts:
    # The training texts are truncated, so truncate here too: nlp() raises on
    # input longer than nlp.max_length.
    doc = nlp(text[:nlp.max_length])
    # doc.cats maps each label name (the class index as a string) to its score.
    predictions.append(doc.cats)

In [None]:
# Turn each {label: score} dict into a score list ordered by class index.
# Two fixes: num_classes is already an int (len() of it raises TypeError), and the
# labels were registered as strings, so the dict must be indexed with str(a).
pred_proba = [
    [cats[str(a)] for a in range(num_classes)]
    for cats in predictions
]

In [None]:
# Each score vector is laid out as:
#   [ phase classes | 5 arm classes | subject-band classes | has_sap ]
# Every task is decoded independently by taking the arg-max within its own slice.
n_phase = len(phase_lookup)
arms_end = n_phase + 5

y_pred_phase = []
y_pred_num_arms = []
y_pred_num_subjects = []
y_pred_sap = []
for probas_this_instance in pred_proba:
  probas_phase = probas_this_instance[:n_phase]
  y_pred_phase.append(phase_lookup[int(np.argmax(probas_phase))])

  probas_arms = probas_this_instance[n_phase:arms_end]
  # Arm slot i stands for i + 1 arms.
  y_pred_num_arms.append(1 + int(np.argmax(probas_arms)))

  probas_subjects = probas_this_instance[arms_end:-1]
  y_pred_num_subjects.append(num_subjects_lookup[int(np.argmax(probas_subjects))])

  # The last element is the single has_sap score, thresholded at 0.5.
  y_pred_sap.append(probas_this_instance[-1] > 0.5)

# df_val was produced by boolean-filtering df_annotations. assign() returns a new
# frame instead of writing into that slice, which avoids SettingWithCopyWarning
# and keeps this cell safe to re-run.
df_val = df_val.assign(
  y_pred_phase=y_pred_phase,
  y_pred_num_arms=y_pred_num_arms,
  y_pred_num_subjects=y_pred_num_subjects,
  y_pred_sap=y_pred_sap,
)

## Phase

In [None]:
acc = accuracy_score(df_val.phase_clean.apply(str), df_val["y_pred_phase"])
print (f"Phase accuracy {acc}")

In [None]:
ConfusionMatrixDisplay.from_predictions(df_val.phase_clean.apply(str), df_val["y_pred_phase"])
plt.xticks(rotation=90)
;

## Number of arms

In [None]:
acc = accuracy_score(df_val.num_arms_clean.apply(float).apply(str), df_val["y_pred_num_arms"].apply(float).apply(str))
print (f"Num arms accuracy {acc}")

In [None]:
ConfusionMatrixDisplay.from_predictions(df_val.num_arms_clean.apply(float).apply(str), df_val["y_pred_num_arms"].apply(float).apply(str))
plt.xticks(rotation=90)
;

## Subjects

In [None]:
acc = accuracy_score(df_val.num_subjects_clean, df_val["y_pred_num_subjects"])
print (f"Subjects accuracy {acc}")

In [None]:
ConfusionMatrixDisplay.from_predictions(df_val.num_subjects_clean, df_val["y_pred_num_subjects"], labels=num_subjects_list)
plt.xticks(rotation=90)
;

In [None]:
# Relaxed accuracy: a prediction counts as correct when its size band is the true
# band or one of its immediate neighbours (class indices differ by at most 1).
band_pairs = zip(df_val["num_subjects_clean"], df_val["y_pred_num_subjects"])
num_correct = sum(
  int(abs(num_subjects_clean_map[truth] - num_subjects_clean_map[guess]) <= 1)
  for truth, guess in band_pairs
)
print ("Accuracy including adjacent groups", num_correct/len(df_val))

## SAP

In [None]:
acc = accuracy_score(df_val.has_sap, df_val["y_pred_sap"])
print (f"SAP accuracy {acc}")

In [None]:
ConfusionMatrixDisplay.from_predictions(df_val.has_sap, df_val["y_pred_sap"])
plt.xticks(rotation=90)
;