In [43]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz

Defaulting to user installation because normal site-packages is not writeable
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz (14.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.8/14.8 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting spacy<3.8.0,>=3.7.4 (from en_core_sci_sm==0.5.4)
  Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy<3.8.0,>=3.7.4->en_core_sci_sm==0.5.4)
  Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manyli

In [41]:
# NLP + Data analysis tools
import re

import negspacy
import numpy as np
import pandas as pd
import spacy
from negspacy.negation import Negex  # importing registers the "negex" factory
from negspacy.termsets import termset
from scispacy.abbreviation import AbbreviationDetector  # registers "abbreviation_detector"
from scispacy.linking import EntityLinker  # registers "scispacy_linker"
from sklearn.feature_extraction.text import TfidfVectorizer

# helper functions

In [14]:
def clean_text(text):
    """Lower-case ``text`` and collapse newlines, underscores and whitespace.

    Runs of newlines, runs of underscores and runs of any whitespace are each
    replaced by a single space, in that order, and the result is stripped of
    leading/trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    cleaned = text.lower()
    # Whitespace is collapsed last so the spaces introduced by the first two
    # substitutions are merged with any neighbouring whitespace.
    for pattern in (r'\n+', r'_+', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

In [23]:
def prepare_text(text, header_chars=140):
    """Strip the generic note header and the de-identification placeholders.

    Args:
        text: Raw note text (a ``str``).
        header_chars: Number of leading characters to discard as the generic
            beginning of the note. Defaults to 140, the value previously
            hard-coded in this function.

    Returns:
        ``text`` without its first ``header_chars`` characters and with every
        ``"___"`` placeholder removed. A note shorter than ``header_chars``
        yields an empty string.
    """
    body = text[header_chars:]  # Remove generic beginning
    # The placeholder is a fixed literal, so a plain replace is enough
    # (non-overlapping, left to right: "____" becomes "_").
    return body.replace("___", "")  # Remove deidentified info

In [None]:
def tag_negations(text):
    """Lower-case ``text`` token by token, prefixing negated tokens with ``NEG_``.

    negspaCy registers its ``negex`` flag on entity spans (``doc.ents``), not
    on individual tokens, so reading ``token._.negex`` raises for an
    unregistered extension. The flag is therefore read per entity and applied
    to every token inside a negated entity.

    Relies on the module-level ``nlp`` pipeline, which must already contain
    the ``negex`` component.

    Args:
        text: The note text to process.

    Returns:
        The tokens of ``text``, lower-cased and joined by single spaces, with
        ``NEG_`` prepended to each token belonging to a negated entity.
    """
    doc = nlp(text)
    # Document-level indices of all tokens that sit inside a negated entity.
    negated = {tok.i for ent in doc.ents if ent._.negex for tok in ent}
    return " ".join(
        f"NEG_{tok.text.lower()}" if tok.i in negated else tok.text.lower()
        for tok in doc
    )


In [28]:
def normalize_concepts(text):
    """Replace linked entities in ``text`` with their canonical concept names.

    scispaCy's entity linker stores its candidates on entity spans
    (``ent._.kb_ents``), not on tokens, so the lookup is done per entity. A
    linked entity is emitted once as its canonical name (spaces replaced by
    underscores, lower-cased); an entity without candidates, and every token
    outside an entity, is emitted as its lower-cased text.

    Relies on the module-level ``nlp`` pipeline and ``linker`` component.

    Args:
        text: The note text to process.

    Returns:
        The normalised pieces joined by single spaces.
    """
    doc = nlp(text)
    cui_to_entity = linker.kb.cui_to_entity
    pieces = []
    cursor = 0  # index of the first token not yet emitted
    for ent in doc.ents:
        # Tokens between the previous entity and this one pass through as-is.
        pieces.extend(tok.text.lower() for tok in doc[cursor:ent.start])
        candidates = ent._.kb_ents
        if candidates:
            # Each candidate is a (CUI, score) pair; use the first one, as the
            # original code did.
            cui = candidates[0][0]
            canonical = cui_to_entity[cui].canonical_name.replace(" ", "_")
            pieces.append(canonical.lower())
        else:
            pieces.extend(tok.text.lower() for tok in ent)
        cursor = ent.end
    pieces.extend(tok.text.lower() for tok in doc[cursor:])
    return " ".join(pieces)

## explore notes

In [19]:
# Read the OSA bigram vocabulary, the HF bigram vocabulary and the cohort
# notes table, in that order, into `osa`, `hf` and `cohort`.
osa, hf, cohort = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/global/cfs/cdirs/m1532/Projects_MVP/Anthony/sorted_vocab/OSA_HF_BLUEBERT_VOCAB/osa_bluebert_bigrams.csv',
        '/global/cfs/cdirs/m1532/Projects_MVP/Anthony/sorted_vocab/OSA_HF_BLUEBERT_VOCAB/hf_bluebert_bigrams.csv',
        '/global/cfs/projectdirs/m1532/Projects_MVP/_members/itunuad/Cohort/cohort_osa_hf_notes.csv',
    )
)

In [4]:
# Print the shape first, then leave the preview as the cell's last expression
# so the notebook actually renders it (a bare expression that is not last is
# silently discarded).
print(cohort.shape)
cohort.head(10)

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...
5,10000117-DS-21,10000117,22927623,DS,21,2181-11-15 00:00:00,2181-11-15 15:04:00,\nName: ___ Unit No: ___\n...
6,10000117-DS-22,10000117,27988844,DS,22,2183-09-21 00:00:00,2183-09-29 16:23:00,\nName: ___ Unit No: ___\n...
7,10000248-DS-10,10000248,20600184,DS,10,2192-11-30 00:00:00,2192-11-30 19:49:00,\nName: ___ Unit No: ...
8,10000560-DS-15,10000560,28979390,DS,15,2189-10-17 00:00:00,2189-10-17 13:47:00,\nName: ___ Unit No: _...
9,10000764-DS-11,10000764,27897940,DS,11,2132-10-19 00:00:00,2132-10-19 18:50:00,\nName: ___ Unit No: ___\n \...


In [24]:
# Derive the cleaned note text and drop the raw column. The guard keeps the
# cell re-runnable: once 'text' has been dropped, a second run would
# otherwise fail with a KeyError.
if 'text' in cohort.columns:
    cohort = (
        cohort
        .assign(new_text=cohort['text'].apply(prepare_text))
        .drop(columns=['text'])
    )
cohort.head(15)

    Unnamed: 0  subject_id   hadm_id   osa     hf  group  \
0            0    13340824  26708815  True  False      0   
1            1    10002013  24848509  True  False      0   
2            2    10002167  29383904  True  False      0   
3            3    10002221  21008195  True  False      0   
4            4    10003019  20030125  True  False      0   
5            5    10003019  20277210  True  False      0   
6            6    10003019  20962108  True  False      0   
7            7    10003019  21223482  True  False      0   
8            8    10003019  21616816  True  False      0   
9            9    10003019  22774359  True  False      0   
10          10    10003019  24646702  True  False      0   
11          11    10003019  25179393  True  False      0   
12          12    10003019  25573783  True  False      0   
13          13    10003019  27683372  True  False      0   
14          14    10004401  21085166  True   True      1   

                                       

In [25]:
# Per-note length features: characters and whitespace-separated words.
note_text = cohort['new_text']
cohort['char_count'] = note_text.str.len()
cohort['word_count'] = note_text.str.split().str.len()

# Descriptive statistics of both length features for each cohort group.
length_summary = cohort.groupby('group')[['char_count', 'word_count']].describe()
print(length_summary)


      char_count                                                       \
           count          mean          std    min       25%      50%   
group                                                                   
0        21929.0  10394.366410  4158.424611  649.0   7526.00   9703.0   
1         7963.0  13118.335175  4597.234016  830.0  10017.00  12444.0   
2        41516.0  12444.380986  4484.731146  443.0   9418.75  11838.0   

                        word_count                                          \
           75%      max      count         mean         std    min     25%   
group                                                                        
0      12548.0  46736.0    21929.0  1568.343518  617.771712   82.0  1139.0   
1      15426.0  58729.0     7963.0  1986.278915  677.129739  115.0  1526.0   
2      14694.0  53534.0    41516.0  1880.109476  659.970981   55.0  1435.0   

                               
          50%     75%     max  
group                       

In [30]:
# Three preprocessing approaches are to be tested: UMLS concept linking,
# negation handling, and information-density filtering.
# List the columns currently present on `cohort` before applying them.
cohort.columns

Index(['Unnamed: 0', 'subject_id', 'hadm_id', 'osa', 'hf', 'group', 'new_text',
       'char_count', 'word_count'],
      dtype='object')

In [45]:
# Load scispaCy with UMLS linker.
# spaCy 3 adds components by their registered factory name, not by instance;
# the factories become available through the scispaCy imports at the top.
nlp = spacy.load("en_core_sci_sm")
# resolve_abbreviations only has an effect when an abbreviation detector runs
# earlier in the pipeline.
nlp.add_pipe("abbreviation_detector")
nlp.add_pipe(
    "scispacy_linker",
    config={"resolve_abbreviations": True, "linker_name": "umls"},
)
# Keep a handle on the component: normalize_concepts reads its knowledge base.
linker = nlp.get_pipe("scispacy_linker")

OSError: [E050] Can't find model 'en_core_sci_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [42]:
# Register negspaCy's NegEx component with the clinical term set. The factory
# expects the pattern dictionary itself rather than the term-set name, and
# add_pipe raises if the component is already present, so guard for re-runs.
if "negex" not in nlp.pipe_names:
    nlp.add_pipe(
        "negex",
        config={"neg_termset": termset("en_clinical").get_patterns()},
    )

# 'norm_text' (concept-normalised note text) is not created anywhere earlier in
# this notebook; build it here if missing so the cell does not hit a KeyError.
if 'norm_text' not in cohort.columns:
    cohort['norm_text'] = cohort['new_text'].apply(normalize_concepts)
cohort['neg_text'] = cohort['norm_text'].apply(tag_negations)

NameError: name 'nlp' is not defined