In [1]:
import pandas as pd
import re 
import nltk 
import sqlite3
import numpy as np
import ast
import string 

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
conn = sqlite3.connect('lowva\lowva.db')
conn.text_factory = str
cur = conn.cursor()

dfcohort=pd.read_sql_query('''select outcome.pat_deid, note_deid, enc_date, lowvadate from outcome, notes where outcome.pat_deid=notes.pat_deid''', conn)
dfcohort["lowvadate"]=pd.to_datetime(dfcohort["lowvadate"])
dfcohort["enc_date"]=pd.to_datetime(dfcohort["enc_date"])

In [2]:
dfterms=pd.read_csv("lowva/lowva-clampoutput.csv")
dfterms.columns = map(str.lower, dfterms.columns)
dfterms=dfterms[["note_deid","cui", "assertion", "section"]]

In [3]:
dfterms.head()

Unnamed: 0,note_deid,cui,assertion,section
0,28528746.0,C0154863,present,other
1,28528746.0,C0154830,present,plan
2,645212689.0,C0746467,present,current medications
3,645212689.0,C0003969,present,current medications
4,645212689.0,C0306488,present,current medications


sectionlist=['allergy',
 'ap',
 'assessment',
 'attestation',
 'call',
 'chief complaint',
 'current medications',
 'encounter diagnoses',
 'exam',
 'eye exam',
 'eye medications',
 'family history',
 'findings',
 'hpi',
 'informed consent',
 'instruction',
 'interpretation',
 'medications',
 'ocular history',
 'other',
 'past surgical history',
 'plan',
 'pmh',
 'problem list',
 'risk',
 'ros',
 'sle',
 'social history',
 'visual acuity']

In [4]:
#delete terms that are in these irrelevant sections
dfterms=dfterms[~dfterms['section'].isin(['call', 'attestation', 'informed consent', 'instruction', 'risk', 'ros'])]

## check which are the most common terms

In [40]:
#check which are the most common terms types 
pd.Series(dfterms["cui"].value_counts(), name="counts")[0:500]

C0582103    48224
C0031809    23907
C0042812    23852
C0746467    23352
C0013227    20934
            ...  
C0339505      630
C0592218      630
C3544111      626
C1720208      624
C0023901      623
Name: counts, Length: 500, dtype: int64

In [50]:
umls=pd.read_table('umls-2021AA-mrconso/MRCONSO.RRF', sep='|', encoding='latin-1', index_col=False, 
                   names=["cui", "lat", "ts", "lui", "stt", "sui", "ispref", "aui", "saui", "scui", "sdui", 
                           "sab", "tty", "code", "desc", "srl", "suppress"], skiprows=1)[["cui", "lat", "desc"]]

  if (await self.run_code(code, result,  async_=asy)):


In [51]:
umls = umls.set_index("cui")
umls.head()

Unnamed: 0_level_0,lat,desc
cui,Unnamed: 1_level_1,Unnamed: 2_level_1
C0000005,ENG,(131)I-MAA
C0000005,FRE,MacroagrÃ©gats d'albumine marquÃ©e Ã l'iode 131
C0000005,FRE,MAA-I 131
C0000005,FRE,MacroagrÃ©gats d'albumine humaine marquÃ©e Ã ...
C0000039,CZE,"1,2-dipalmitoylfosfatidylcholin"


In [55]:
umls = umls[umls["lat"]=="ENG"]
umls["desc"]=umls["desc"].str.lower()
umls=umls.drop_duplicates()
umls.head()

Unnamed: 0_level_0,lat,desc
cui,Unnamed: 1_level_1,Unnamed: 2_level_1
C0000005,ENG,(131)i-maa
C0000039,ENG,"1,2-dipalmitoylphosphatidylcholine"
C0000039,ENG,"1,2 dipalmitoylphosphatidylcholine"
C0000039,ENG,"1,2-dihexadecyl-sn-glycerophosphocholine"
C0000039,ENG,"1,2 dihexadecyl sn glycerophosphocholine"


In [58]:
dftopcuis=pd.merge(pd.Series(dfterms["cui"].value_counts()[0:500], name="counts"), umls["desc"], left_index=True, right_index=True, how="left")

In [59]:
dftopcuis=dftopcuis.reset_index()
dftopcuis=dftopcuis.drop_duplicates(["index"])
dftopcuis=dftopcuis.sort_values(by="counts", ascending=False)

In [62]:
dftopcuis.to_csv("lowva/lowva-clamptopcuis.csv", index=False)

# process terms for our cohort

In [15]:
dfcohort=dfcohort[dfcohort["enc_date"]<dfcohort["lowvadate"]]
dfcohort.head()

Unnamed: 0,pat_deid,note_deid,enc_date,lowvadate
0,1174,28528746.0,2009-01-12,2009-09-17
1,1174,28528751.0,2009-01-12,2009-09-17
6,2262,193611450.0,2012-05-10,2012-05-15
25,3178,227867100.0,2012-10-24,2016-11-11
26,3178,227870944.0,2012-10-24,2016-11-11


In [16]:
dfterms=pd.merge(dfcohort,dfterms,on='note_deid')[["pat_deid", "cui", "assertion", "section"]]

In [17]:
dfterms.loc[dfterms.assertion == 'present', 'assertion'] = 'pos'
dfterms.loc[dfterms.assertion == 'absent', 'assertion'] = 'neg'

dfterms["fh"]= 'pt'
dfterms.loc[dfterms.section == 'family history', 'fh'] = 'f'

dfterms['pivotvalue'] = 1

dfterms.head()

Unnamed: 0,pat_deid,cui,assertion,section,fh,pivotvalue
0,1174,C0154863,pos,other,pt,1
1,1174,C0154830,pos,plan,pt,1
2,1174,C0582103,pos,exam,pt,1
3,1174,C0021888,pos,exam,pt,1
4,1174,C0348014,pos,exam,pt,1


In [19]:
dftermswide=dfterms.pivot_table(values='pivotvalue', index=['pat_deid'], columns=['cui', 'assertion', 'fh'], fill_value=0)

dftermswide.columns = [''.join((char for char in str(col) if char.isalnum())).strip() for col in dftermswide.columns.values]
dftermswide.columns

Index(['C0000723pospt', 'C0000729pospt', 'C0000734negpt', 'C0000734pospt',
       'C0000735pospt', 'C0000737negpt', 'C0000737pospt', 'C0000741pospt',
       'C0000765pospt', 'C0000768negpt',
       ...
       'C4280600negpt', 'C4280600posf', 'C4280600pospt', 'C4280682pospt',
       'C4280806pospt', 'C4280810pospt', 'C4280891pospt', 'C4280929pospt',
       'C4280983pospt', 'C4281066pospt'],
      dtype='object', length=31639)

In [20]:
from sklearn.feature_selection import VarianceThreshold
selector=VarianceThreshold(.99 * (1 - .99))
selector.fit_transform(np.array(dftermswide)).shape

def variance_threshold_selector(data, threshold=0.5):
    selector = VarianceThreshold(threshold)
    selector.fit(data)
    return data[data.columns[selector.get_support(indices=True)]]

dftermsfiltered=variance_threshold_selector(dftermswide, .99 * (1 - .99))

(3068, 1663)

In [21]:
dftermsfiltered.reset_index(inplace=True)

In [22]:
dftermsfiltered.head()

Unnamed: 0,pat_deid,C0000970pospt,C0000981pospt,C0001367pospt,C0001425pospt,C0001645pospt,C0001807pospt,C0001863pospt,C0002418pospt,C0002475pospt,...,C4083748pospt,C4227446pospt,C4227990pospt,C4229480pospt,C4229663pospt,C4229741pospt,C4230069pospt,C4231531pospt,C4238961pospt,C4257080pospt
0,1174,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2262,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3178,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4062,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6938,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
dftermsfiltered.to_csv('lowva/lowva-termsfeatures.csv', index=False)

In [24]:
conn.close()