<a href="https://colab.research.google.com/github/fastdatascience/clinical_trial_risk/blob/fixes_nov_2022/train/ctgov/PrepareDataForPhaseSampleSizeAndNumArms_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import os
import sys
import pickle as pkl
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import json

# df_annotations = pd.read_csv("all_annotations.csv")


In [None]:
# Mount Google Drive at /content/drive; the data files read and written by the
# cells below live under /content/drive/MyDrive/data.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Annotation table; the filtering cell further down reads its `file` and `phase`
# columns and adds a `text` column to it.
df_annotations = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/data/all_annotations.csv",
)

In [None]:
# Report the version of the interpreter this kernel is actually running.
# A shell escape (`!python --version`) asks whichever `python` executable comes
# first on the shell PATH, which is not guaranteed to be the kernel's interpreter.
print("Python", sys.version.split()[0])

Python 3.7.15


# Get data: load the protocol page texts and extract keyword-centred excerpts

In [None]:
# Load the pickled mapping from protocol file name to that protocol's list of page texts.
# NOTE(review): unpickling can execute arbitrary code - only load this file if it is trusted.
# NOTE(review): the name ends in ".gz" but the file is read without gzip; confirm it is
# really stored uncompressed.
with open("/content/drive/MyDrive/data/protocols.pkl.gz", mode="rb") as f:
    file_to_pages = pkl.load(f)

In [None]:
import spacy
# Blank English pipeline, bound to `nlp`. The filtering cell further down calls it on
# every page and reads each token's text, trailing whitespace and `like_num` flag.
nlp = spacy.blank("en")

In [None]:
import operator

# `ctr` (trigger word -> number of times it was accepted) is filled in by the
# keyword-filtering cell further down the notebook, so on a fresh kernel
# (Restart & Run All) it does not exist yet when this cell runs. Fall back to an
# empty mapping instead of raising NameError; re-run this cell after the filtering
# cell to see the real counts, listed from rarest to most frequent.
keyword_counts = globals().get("ctr", {})
keyword_counts_sorted = sorted(keyword_counts.items(), key=operator.itemgetter(1))
keyword_counts_sorted

[('simulates', 1),
 ('simulating', 2),
 ('simulate', 2),
 ('pairs', 2),
 ('enrols', 2),
 ('infected', 3),
 ('armed', 3),
 ('recruits', 6),
 ('accruing', 7),
 ('persons', 10),
 ('accrue', 25),
 ('sampled', 28),
 ('individuals', 30),
 ('simulations', 30),
 ('pts', 31),
 ('males', 32),
 ('simulated', 32),
 ('powered', 43),
 ('people', 46),
 ('recruiting', 48),
 ('enrol', 50),
 ('men', 53),
 ('accrued', 57),
 ('simulation', 58),
 ('scenarios', 59),
 ('females', 61),
 ('recruit', 67),
 ('phases', 92),
 ('female', 125),
 ('healthy', 138),
 ('cases', 144),
 ('enrolment', 145),
 ('enrolling', 153),
 ('recruited', 155),
 ('male', 168),
 ('women', 198),
 ('participants', 214),
 ('n', 294),
 ('enroll', 319),
 ('select', 360),
 ('achieve', 434),
 ('accrual', 445),
 ('cohorts', 464),
 ('recruitment', 596),
 ('sampling', 623),
 ('arms', 657),
 ('selection', 715),
 ('power', 767),
 ('groups', 1251),
 ('cohort', 1421),
 ('enrolled', 1851),
 ('subjects', 2011),
 ('target', 2072),
 ('overall', 2127),
 (

In [None]:
# Spelled-out quantities and their numeric values. The filtering cell below only
# tests membership (is the word before "patients", "subjects", ... a quantity?),
# but the values are kept correct so the table can also be used for conversion.
word2num = {
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
    'thirteen': 13,
    'fourteen': 14,
    'fifteen': 15,
    'sixteen': 16,
    'seventeen': 17,
    'eighteen': 18,
    'nineteen': 19,
    # Quantity words that are not numerals.
    'both': 2,
    'single': 1,  # was 2: "single" denotes one, not two
    'twenty': 20,
    'thirty': 30,
    'forty': 40,
    'fifty': 50,
    'sixty': 60,
    'seventy': 70,
    'eighty': 80,
    'ninety': 90,
    'hundred': 100,
    'thousand': 1000,
}

In [None]:
from collections import Counter

# Half-width of the context kept around an accepted trigger word. The slice
# [i - 15, i + 15) covers 15 tokens before the trigger, the trigger itself and
# 14 tokens after it.
CONTEXT_WINDOW = 15
# Character cap applied to the whole-document fallback text.
MAX_FALLBACK_CHARS = 100000

# Nouns that only count as triggers when one of the two preceding tokens is a
# quantity ("120 patients", "twenty healthy subjects").
POPULATION_WORDS = frozenset({
    "participants", "subjects", "people", "persons", "healthy", "infected",
    "pairs", "individuals", "women", "men", "patients", "males", "females",
    "male", "female", "cases", "pts",
})

# Every word (lower-cased) that can pull its surrounding context into the excerpt.
TRIGGER_WORDS = POPULATION_WORDS | frozenset({
    "phase", "phases", "arm", "arms", "armed", "cohort", "cohorts", "group", "groups",
    "sample", "sampling", "sampled", "samples",
    "enroll", "enrol", "enrols", "enrolled", "enrolling", "enrolment",
    "recruit", "recruiting", "recruited", "recruits", "recruitment",
    "target", "accrual", "accruing", "accrue", "accrued",
    "power", "powered",
    "simulate", "simulates", "simulated", "simulating", "simulation", "simulations",
    "scenarios",
    "n",
    "overall", "total",
    "select", "selection", "approximately", "achieve",
})

# A bare "n" is only a trigger when directly followed by one of these ("n = 40").
N_COMPARATORS = frozenset({"=", ">", "<", "≥"})


def _is_number_like(token):
    """Return True if `token` is flagged `like_num` by spaCy or is a key of `word2num`.

    `None` stands for "no such neighbour" (start of the page) and is not a number.
    """
    return token is not None and (token.like_num or token.text.lower() in word2num)


def _is_trigger(doc, position):
    """Return True if the token at `position` should pull its context into the excerpt."""
    word = doc[position].text.lower()
    if word not in TRIGGER_WORDS:
        return False

    if word == "n":
        # Previously an "n" that was the last token of a page was accepted even
        # though nothing follows it; it now needs a comparator like any other "n".
        following = doc[position + 1] if position + 1 < len(doc) else None
        return following is not None and following.text in N_COMPARATORS

    if word in POPULATION_WORDS:
        # Previously the numeric check was skipped for the first two tokens of a
        # page, so a population noun there was always accepted. Missing neighbours
        # are now simply treated as "not a number".
        previous = doc[position - 1] if position >= 1 else None
        before_previous = doc[position - 2] if position >= 2 else None
        return _is_number_like(previous) or _is_number_like(before_previous)

    return True


def _extract_page_excerpt(page, counter):
    """Tokenise `page` with `nlp` and return the text inside any trigger word's window.

    Every accepted trigger is tallied in `counter` under its lower-cased form.
    Returns "" when the page contains no accepted trigger.
    """
    doc = nlp(page)
    keep = [False] * len(doc)
    for position in range(len(doc)):
        if not _is_trigger(doc, position):
            continue
        counter[doc[position].text.lower()] += 1
        start = max(0, position - CONTEXT_WINDOW)
        stop = min(len(doc), position + CONTEXT_WINDOW)
        keep[start:stop] = [True] * (stop - start)
    # Re-attach each token's trailing whitespace so the excerpt reads like the source.
    return "".join(
        token.text + token.whitespace_ for token, kept in zip(doc, keep) if kept
    )


# `ctr` counts accepted trigger words over all protocols; `texts` collects one
# filtered text per row of `df_annotations`, in row order.
ctr = Counter()
texts = []

for file_name, phase in zip(df_annotations["file"], df_annotations["phase"]):
    pages = file_to_pages[file_name]
    text = "".join(_extract_page_excerpt(page, ctr) for page in pages)

    if text == "":
        # No trigger anywhere in the protocol: fall back to the (capped) full text.
        print("nothing found", file_name, phase)
        text = " ".join(pages)[:MAX_FALLBACK_CHARS]
    else:
        print("found", file_name, phase, len(text))

    texts.append(text)

df_annotations["text"] = texts

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
found 58_NCT00072358_Prot_SAP_000.pdf Phase 2 14708
found 58_NCT00104858_Prot_SAP_000.pdf Phase 2 26885
found 58_NCT00367458_Prot_SAP_000.pdf Phase 2 12454
found 58_NCT00656058_Prot_ICF_000.pdf Phase 2 1087
found 58_NCT00936858_Prot_SAP_000.pdf Phase 2 23251
found 58_NCT01086358_Prot_SAP_000.pdf Phase 4 4125
found 58_NCT01111058_Prot_SAP_000.pdf Phase 2 19353
found 58_NCT01198158_Prot_SAP_000.pdf Phase 3 39983
found 58_NCT01306058_Prot_SAP_ICF_000.pdf Phase 1/Phase 2 57494
found 58_NCT01386658_Prot_004.pdf Phase 3 36590
found 58_NCT01597258_Prot_000.pdf nan 3514
found 58_NCT01650558_Prot_SAP_000.pdf Not Applicable 26147
found 58_NCT01701258_Prot_SAP_000.pdf Phase 1 20437
found 58_NCT01702558_Prot_000.pdf Phase 2 443265
found 58_NCT01706458_Prot_SAP_000.pdf Phase 2 34295
found 58_NCT01711658_Prot_SAP_000.pdf Phase 2 30191
found 58_NCT01740258_Prot_SAP_000.pdf Phase 2 19876
found 58_NCT01768858_Prot_SAP_000.pdf nan 7682
fou

In [None]:
# Sanity check: number of characters in the filtered text of the first row.
len(df_annotations["text"].iloc[0])

51228

In [None]:
# Save the annotations together with the new `text` column in the working directory.
# pandas infers the bz2 compression from the ".bz2" suffix.
df_annotations.to_csv(
    path_or_buf="filtered_for_phase_arms_subjects_02.csv.bz2",
)

In [None]:
# Write the same table to Google Drive next to the input data.
# pandas infers the bz2 compression from the ".bz2" suffix.
df_annotations.to_csv(
    path_or_buf="/content/drive/MyDrive/data/filtered_for_phase_arms_subjects_02.csv.bz2",
)