In [1]:
import pandas as pd
import re
import os
import sys
import pickle as pkl
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import json

# Load the annotation table; later cells read its `file`, `num_arms` and `train_val` columns.
df_annotations = pd.read_csv("../../data/ctgov/annotations/all_annotations.csv")


# Get data

In [2]:
# Load the protocol texts: a mapping indexed by file name whose values are joined
# with " " in the next cell (i.e. an iterable of page strings per file).
# NOTE(review): the file is named .pkl.gz but is read with plain open() rather than gzip —
# confirm that it is not actually gzip-compressed.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../../data/ctgov/protocols.pkl.gz", "rb") as f:
    file_to_pages = pkl.load(f)

In [3]:
# Attach the full protocol text to every annotation row by joining that
# file's pages with a single space.
texts = [
    " ".join(file_to_pages[protocol_file])
    for protocol_file in df_annotations.file
]
df_annotations["text"] = texts

In [4]:
# The page mapping is no longer needed once the joined text is stored in df_annotations["text"].
del file_to_pages

In [5]:
# Keep only the rows for which the number of arms has been annotated.
df_annotations_num_arms_known = df_annotations.loc[~df_annotations["num_arms"].isna()]

In [6]:
# Work on a random subset to keep the experiment fast.
# A fixed seed makes the subset (and every metric computed from it) reproducible,
# and the size is capped at the number of available rows so the cell does not
# raise when fewer than SAMPLE_SIZE annotated rows exist.
SAMPLE_SIZE = 1000
RANDOM_SEED = 42
df_annotations_num_arms_known = df_annotations_num_arms_known.sample(
    n=min(SAMPLE_SIZE, len(df_annotations_num_arms_known)),
    random_state=RANDOM_SEED,
)

In [7]:
def get_num_arms_clean(num):
    """Cap the number of arms at 5: values of 5 or more become 5, anything else is returned unchanged."""
    return 5 if num >= 5 else num
# Ground-truth label used for training/evaluation: the annotated number of arms, capped at 5.
capped_num_arms = df_annotations_num_arms_known["num_arms"].apply(get_num_arms_clean)
df_annotations_num_arms_known["num_arms_clean"] = capped_num_arms

# Train and evaluate the number of arms extractor

In [8]:
# Make the project's front_end package importable so its tokeniser helpers can be used here.
sys.path.append("../../front_end")
from util.page_tokeniser import tokenise_pages, tokenise_text, tokenise_text_and_lowercase

In [9]:
import spacy
from spacy.matcher import Matcher
import spacy

# Load the small English pipeline with the listed components disabled;
# in this notebook only nlp.vocab is used (to build Docs and the Matcher).
nlp = spacy.load('en_core_web_sm', disable=['ner', 'tagger', 'parser', 'lemmatizer'])



In [10]:
# Map number words (and a few quantifier words) to the number of arms they denote.
# "both" implies two arms; "single" (as in "single-armed") implies one arm.
word2num = {
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    'both': 2,
    'single': 1,
}

In [11]:
# Snapshot of the keys of word2num at this point. The following cell adds digit strings
# ("1".."19") to word2num, so this list contains only the keys defined before that.
number_words = list(word2num)

In [12]:
# Also accept the numbers 1..19 written as digit strings.
word2num.update({str(value): value for value in range(1, 20)})

In [13]:
# Keys of word2num whose value is 3 or more (note: 3 itself is included despite the name).
numbers_above_3 = [word for word, value in word2num.items() if value >= 3]

In [14]:
# Rule-based matcher for explicit statements of the number of arms.
ARM_MODIFIERS = ["treatment", "study", "dose"]
ARM_NOUNS = ["arm", "arms", "group", "groups", "subgroup", "subgroups", "cohort", "cohorts"]

# "<number word> <modifier> <arm noun>", e.g. "two treatment arms".
# The modifier is mandatory, so a bare "<number> arms" is not matched by this pattern.
modified_arm_pattern = [
    {"LOWER": {"IN": number_words}},
    {"LOWER": {"IN": ARM_MODIFIERS}},
    {"LOWER": {"IN": ARM_NOUNS}},
]

# "<any key of word2num> [-] armed", e.g. "three - armed"; the hyphen token is optional.
armed_pattern = [
    {"LOWER": {"IN": list(word2num)}},
    {"LOWER": "-", "OP": "?"},
    {"LOWER": {"IN": ["armed"]}},
]

patterns = [modified_arm_pattern, armed_pattern]
matcher = Matcher(nlp.vocab)
matcher.add("arms", patterns)

In [15]:
# Sanity check: run the matcher on the two-token document "5 arms".
# The recorded output is [] (no match).
doc = spacy.tokens.doc.Doc(
            nlp.vocab, words=["5", "arms"])
matcher(doc)

[]

In [16]:
#tokenised_pages = [["5", "arms"]]
# if True:
def process(self, tokenised_pages: list) -> dict:
    """
    Identify the number of arms of the trial from explicit phrases in the text.

    Every page is lowercased and scanned with the module-level ``matcher``. Matches are
    grouped by their phrase text, so repeated mentions of the same phrase count towards
    one candidate. A prediction is only made when exactly one distinct phrase was found
    in the whole document.

    :param self: Unused; callers in this notebook pass None.
    :param tokenised_pages: List of lists of tokens of each page.
    :return: A dict with two keys: "prediction" (the number of arms, capped at 5, or 0 when
        no phrase or more than one distinct phrase was found) and "pages" (a map from matched
        phrase text to the list of page indices it was found on, one entry per occurrence,
        most frequently mentioned phrase first).
    """
    phrase_to_pages = {}
    phrase_to_tokens = {}

    for page_number, page_tokens in enumerate(tokenised_pages):
        lowercased_tokens = [token.lower() for token in page_tokens]
        doc = spacy.tokens.doc.Doc(nlp.vocab, words=lowercased_tokens)
        for _, start, end in matcher(doc):
            span = doc[start:end]
            # Key by the phrase text: Span objects hash/compare by document and position,
            # so using them as keys would never group repeated mentions together.
            phrase = span.text
            phrase_to_pages.setdefault(phrase, []).append(page_number)
            phrase_to_tokens.setdefault(phrase, [token.text for token in span])

    ranked_phrases = sorted(phrase_to_pages.items(), key=lambda item: len(item[1]), reverse=True)

    prediction = 0
    # Only trust the rule when the document is unambiguous (a single distinct phrase).
    if len(ranked_phrases) == 1:
        for token_text in phrase_to_tokens[ranked_phrases[0][0]]:
            if token_text in word2num:
                prediction = word2num[token_text]
                break
    # Cap at 5 to match the label space produced by get_num_arms_clean.
    if prediction > 5:
        prediction = 5

    return {"prediction": prediction, "pages": dict(ranked_phrases)}

In [17]:
# Validation split. Take an explicit copy because later cells add prediction columns to
# df_val; assigning into a filtered selection triggers pandas' SettingWithCopyWarning.
df_val = df_annotations_num_arms_known[df_annotations_num_arms_known.train_val == "val"].copy()

In [18]:
# Training split: rows whose train_val column equals "train".
df_train = df_annotations_num_arms_known.loc[df_annotations_num_arms_known["train_val"] == "train"]

In [19]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, auc, roc_curve
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.pipeline import make_pipeline

In [None]:
# NOTE: not used in this cell — the feature set is fixed by the explicit vocabulary below.
NUM_FEATURES = 1500

# English + French stop words, with "both" and "all" taken out of the stop list.
# NOTE: `stops` is not passed to the vectoriser in this cell.
stops = set(stopwords.words('english')).union(set(stopwords.words('french')))
stops.remove("both")
stops.remove("all")

# Hand-picked vocabulary of words related to arms, randomisation, blinding and comparison.
ARMS_VOCABULARY = {
    "arm", "group", "subgroup", "cohort", "arms", "groups", "subgroups", "cohorts",
    "randomise", "randomize", "randomisation", "randomization", "randomised", "randomized",
    "placebo", "unblinded", "unblinding", "blinded", "blinding", "blind", "compare", "double",
    "controlled", "control", "differences", "vs", "outcomes", "hypothesis", "experimental", "compared",
    "effects", "variables", "ratio", "versus", "outcome", "monotherapy", "polytherapy", "proprietary",
    "criterion", "healthy", "remission", "separately", "separate", "separated", "assay", "dosing", "dose", "doses",
    "treatment", "treatments", "study", "studies", "either", "both",
}

# Tokens are runs of two or more lowercase ASCII letters (text is lowercased first);
# digits and punctuation such as "95%" are therefore never tokenised.
vectoriser = CountVectorizer(
    lowercase=True,
    token_pattern=r'[a-z][a-z]+',
    vocabulary=ARMS_VOCABULARY,
)
transformer = TfidfTransformer()

# Complement Naive Bayes on TF-IDF weighted counts of the vocabulary words.
nb = ComplementNB()
model = make_pipeline(vectoriser, transformer, nb)

In [None]:
# Class balance of the (capped) number-of-arms label in the training split.
df_train.num_arms_clean.value_counts()

In [None]:
# Fit the vectoriser -> TF-IDF -> Naive Bayes pipeline on the training texts and capped labels.
model.fit(df_train["text"], df_train["num_arms_clean"])

In [None]:
# Naive Bayes prediction for every validation document.
nb_predictions = model.predict(df_val["text"])
df_val["pred_num_arms_nb"] = nb_predictions

In [None]:
# Distribution of the Naive Bayes predictions on the validation split.
df_val.pred_num_arms_nb.value_counts()

In [None]:
def get_num_arms(text):
    """Return the rule-based number-of-arms prediction for a document, treating the whole text as a single page."""
    single_page = tokenise_text_and_lowercase(text)
    return process(None, [single_page])["prediction"]

In [None]:
def get_pages(text):
    """Return the "pages" entry of the rule-based result for a document, treating the whole text as a single page."""
    single_page = tokenise_text_and_lowercase(text)
    return process(None, [single_page])["pages"]

In [None]:
# Rule-based (spaCy matcher) prediction for every validation document; 0 means "no prediction".
df_val["pred_num_arms_spacy"] = [get_num_arms(document_text) for document_text in df_val["text"]]

In [None]:
# Distribution of the rule-based predictions (0 = the rule made no prediction).
df_val["pred_num_arms_spacy"].value_counts()

In [None]:
# Combined prediction: use the rule-based value when it made one (> 0),
# otherwise fall back to the Naive Bayes prediction.
pred_num_arms = [
    rule_pred if rule_pred > 0 else nb_pred
    for rule_pred, nb_pred in zip(df_val["pred_num_arms_spacy"], df_val["pred_num_arms_nb"])
]
df_val["pred_num_arms"] = pred_num_arms
pred_num_arms = df_val["pred_num_arms"]

In [None]:
# Store, per validation document, the phrases the rule-based extractor matched.
df_val["pages"] = [get_pages(document_text) for document_text in df_val["text"]]

In [None]:
# Inspect the combined predictions.
pred_num_arms

In [None]:
# Distribution of the combined predictions.
pred_num_arms.value_counts()

In [None]:
# Ground truth for evaluation: the capped annotated number of arms of the validation split.
num_arms_ground_truths = df_val["num_arms_clean"]

In [None]:
# Overall accuracy of the combined prediction on the validation split.
acc = accuracy_score(y_true=num_arms_ground_truths, y_pred=pred_num_arms)
print(f"Num arms accuracy {acc}")

In [None]:
# Confusion matrix of ground truth vs. combined prediction.
ConfusionMatrixDisplay.from_predictions(num_arms_ground_truths, pred_num_arms)
# The trailing semicolon must sit on the same line as the last expression to suppress its
# repr in Jupyter; a ";" alone on its own line is not a valid Python statement.
plt.xticks(rotation=90);

In [None]:
# Accuracy restricted to the rows whose combined prediction is positive.
# NOTE(review): pred_num_arms already falls back to the Naive Bayes prediction whenever the
# rule-based prediction is 0, so this filter may select every row — confirm whether
# filtering on pred_num_arms_spacy > 0 was intended.
accuracy_score(num_arms_ground_truths[pred_num_arms > 0], pred_num_arms[pred_num_arms > 0])

In [None]:
# Display the validation frame with all prediction columns
# (it also contains the full `text` column, so the rendered output can be large).
df_val

In [None]:
# Rank the vocabulary words by how strongly each one, taken in isolation, pushes the
# Naive Bayes model towards each class.
fake_document = " ".join(vectoriser.vocabulary_)
vectorised_document = vectoriser.transform([fake_document])
transformed_document = transformer.transform(vectorised_document)
num_terms = transformed_document.shape[1]

# Reverse lookup built once, instead of scanning the whole vocabulary for every rank.
index_to_word = {index: word for word, index in vectoriser.vocabulary_.items()}

# One synthetic document per term: row i contains only term i, with the weight it has
# in the fake document. The log-probabilities do not depend on the class being printed,
# so they are computed once for all terms.
single_term_documents = np.zeros((num_terms, num_terms))
for i in range(num_terms):
    single_term_documents[i, i] = transformed_document[0, i]
log_probas = nb.predict_log_proba(single_term_documents)

# Iterate over the classes the model actually learned (rather than assuming there are 5)
# and print the class label, not its column index.
for prediction_idx, class_label in enumerate(nb.classes_):
    print(f"Strongest predictors for class {class_label}\n")
    probas = log_probas[:, prediction_idx]
    for ctr, j in enumerate(np.argsort(-probas)):
        print(f"{ctr}\t{index_to_word[int(j)]}")