# Clinical Study categorization with scispacy.

Import the required libraries.

In [264]:
import csv
import multiprocessing
import pprint
import re
import spacy
import scispacy
import sys

from collections import defaultdict, Counter
from stopwords import STOPWORDS
from tqdm import tqdm_notebook
from multiprocessing import Pool

from scispacy.abbreviation import AbbreviationDetector
from scispacy.umls_linking import UmlsEntityLinker


Set up scispacy with additional stopwords, abbreviation detection and UMLS entity linking.

In [227]:
# Load the `en_core_sci_lg` model as the base pipeline.
nlp = spacy.load("en_core_sci_lg")

# Extend the pipeline's default stop-word set in place with the imported
# STOPWORDS collection.
nlp.Defaults.stop_words |= STOPWORDS

# Add abbreviation detection to the pipeline before the linker, which is
# configured below with resolve_abbreviations=True.
abbreviation_pipe = AbbreviationDetector(nlp)
nlp.add_pipe(abbreviation_pipe)

# Add the UMLS entity linker as the last pipeline component.
# NOTE(review): k and max_entities_per_mention presumably bound the candidate
# search and the number of concepts kept per mention -- confirm against the
# scispacy documentation.
linker = UmlsEntityLinker(resolve_abbreviations=True,
                         k=10,
                         max_entities_per_mention=3)
nlp.add_pipe(linker)

Store the stop words as dictionary keys so that membership checks are hash lookups.

In [249]:
# Key a dict by every stop word of the pipeline (values are empty
# placeholders) so that `word in stopwords` is a hash lookup.
stopwords = dict.fromkeys(nlp.Defaults.stop_words, '')

Define import functions.

In [250]:
def replace_csv_missing(row):
    """Flatten one CSV row into a single cleaned text string.

    Columns 0, 1 and 8 are kept verbatim; columns 2, 3, 6 and 7 are split on
    whitespace and re-joined with ", ". The combined text is then cleaned by
    removing the literal "missing", replacing hyphens with spaces, replacing
    parenthesised whitespace-free tokens (e.g. "(abc)") with a space, and
    collapsing repeated whitespace and doubled commas.

    :var row: sequence of strings with at least nine fields.

    :returns: the cleaned text.
    :raises IndexError: if ``row`` has fewer than nine fields.
    """
    free_text = ' '.join((row[0], row[1], row[8]))
    term_lists = ', '.join(', '.join(row[i].split()) for i in (2, 3, 6, 7))
    text = free_text + ' ' + term_lists

    # Plain literal substitutions do not need the regex engine.
    text = text.replace('missing', '').replace('-', ' ')
    # Raw strings keep the backslashes for the regex engine instead of
    # relying on invalid string escapes such as '\(' and '\S'.
    text = re.sub(r'\(\S*\)', ' ', text)
    text = re.sub(r' \s+', ' ', text)
    return text.replace(',,', ',')

In [251]:
def import_csv_files(file, max_rows=1001):
    """Read a CSV file and clean each row into a single text string.

    Rows are parsed with ``csv.reader`` and then passed through
    ``replace_csv_missing`` in a pool of worker processes, one per CPU.

    :var file: path of the CSV file to read.
    :var max_rows: maximum number of rows to read from the top of the file
        (any header row counts towards it). The default of 1001 is the
        development size limit; pass ``None`` to read the whole file.

    :returns: list of cleaned strings, one per row read, in file order.
    """
    from itertools import islice

    # newline='' lets the csv module handle line endings itself, so newlines
    # embedded in quoted fields are parsed correctly.
    with open(file, 'r', newline='') as csvinfile:
        csv_reader = csv.reader(csvinfile, delimiter=',')
        all_rows = list(islice(csv_reader, max_rows))
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        transformed_rows = pool.map(replace_csv_missing, all_rows)
    return transformed_rows
    

Import data.

In [252]:
# Load the trial rows as cleaned text strings; the [1:] slice discards the
# CSV header row.
search_terms = import_csv_files('../all_trials_text.csv')[1:]

In [253]:
# Display how many texts were loaded (the header row is already dropped).
len(search_terms)

1000

## We'll need a place to store our index for fast lookup. [Amazon DynamoDB](https://aws.amazon.com/blogs/aws/amazon-dynamodb-internet-scale-data-storage-the-nosql-way/) looks promising.

Below is a JSON-like structure that we should be able to upload to a database easily.

In [254]:
def make_entry(entry: str, index: int, dictionary: defaultdict) -> None:
    """Add the look-up terms found in one text to the index.

    Terms are the lower-cased lemmas of the entities the pipeline finds in
    the text, minus stop words. Each term's relevancy score is its share of
    all counted terms in this text, rounded to four decimals.

    :var entry: text from which terms are to be extracted.
    :var index: index of text
    :var dictionary: dictionary to update in place; dictionary[term] receives
        {index: relevancy score}.
    """
    lemmas = (ent.lemma_.lower() for ent in nlp(entry).ents)
    term_counts = Counter(lemma for lemma in lemmas if lemma not in stopwords)
    n_terms = sum(term_counts.values())
    for term, occurrences in term_counts.items():
        score = round(occurrences / n_terms, 4)
        dictionary[term].update({index: score})

In [265]:
def create_index(corpus: list) -> defaultdict:
    """Create a dict index of look-up terms.

    Each look-up term maps to a dict of document index -> relevancy score.
    A document's index is its position in ``corpus``.

    :var corpus: list of strings from which to extract look-up terms.

    :returns dictionary: dict of dicts "look-up term": {index: relevancy score},
    """
    dictionary = defaultdict(dict)
    # Wrap the corpus itself rather than enumerate(corpus): an enumerate
    # object has no length, so the progress bar could not show a total.
    for i, text in enumerate(tqdm_notebook(corpus)):
        make_entry(text, i, dictionary)
    return dictionary

In [None]:
# Build the term index over the loaded texts; the final expression displays
# the number of distinct look-up terms.
index = create_index(search_terms)
len(index)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [None]:
# For display only: keep just the look-up terms that end in 'a', then print
# how many there are followed by the entries themselves.
filtered_index = {term: postings for term, postings in index.items()
                  if term.endswith('a')}
print(len(filtered_index))
pprint.pprint(filtered_index)