<a href="https://colab.research.google.com/github/francoisjaulin/activelearning/blob/master/DAT_ICU_Active_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%set_env GOOGLE_CLOUD_PROJECT chc-mimic-analysis


from google.colab import auth
auth.authenticate_user()
!touch ~/.bigqueryrc



In [0]:
import pandas as pd

# Silence some log spam from this library.`
import logging
logging.getLogger('googleapiclient').setLevel(logging.CRITICAL)

Query Measurements Per Person Per Visit

Note that due many of the lab visits do not have a visit_occurrence_id

In [0]:
def query(query_str, input_arg):
  query_str = query_str.format(input_arg)
  df = pd.read_gbq(query_str, project_id='chc-mimic-analysis', verbose=False, dialect='standard')
  return df

In [0]:
# Add other measurements needed for prediction.

measurement_names = [
    '"Troponin I.cardiac [Mass/volume] in Serum or Plasma"',
    '"Central venous pressure (CVP)"',
]

person_id = 32790
measurement = '"Troponin I.cardiac [Mass/volume] in Serum or Plasma"'

query_str = '''
  SELECT
    person_id,
    visit_occurrence_id,
    c_measurement_concept_name,
    value_as_number,
    unit_source_value
  FROM (
    SELECT
      s.*,
      t.concept_id AS c_measurement_concept_id,
      t.concept_name AS c_measurement_concept_name
    FROM
      `chc-mimic-data.mimic3_omop.measurement` AS s
    INNER JOIN
      `chc-mimic-data.vocab.concept` AS t
    ON
      s.measurement_concept_id = t.concept_id
    WHERE
      unit_source_value IS NOT NULL 
      AND t.concept_name = {0}
      )
  ORDER BY person_id, visit_occurrence_id, c_measurement_concept_name
'''

df = query(query_str, measurement_names[0])



In [0]:
# Group By Person Occurrence ID

df['person_occurrence_id'] = df.person_id.astype(str).str.cat(df.visit_occurrence_id.astype(str), sep=':')
df_features = df.groupby("person_occurrence_id")["value_as_number"].max().reset_index(name = "measurement_max_value")
df_features.head(20)

Unnamed: 0,person_occurrence_id,measurement_max_value
0,10004:279204,20.0
1,1000:258326,45.0
2,10011:232110,16.0
3,10012:277998,17.0
4,10013:264446,31.0
5,10015:230062,23.0
6,10027:286020,22.0
7,10028:211875,16.0
8,10029:226055,43.0
9,10032:267090,17.0


In [0]:
# Install library and build model
!pip install sklearn-pandas

Collecting sklearn-pandas
  Downloading sklearn_pandas-1.6.0-py2.py3-none-any.whl
Installing collected packages: sklearn-pandas
Successfully installed sklearn-pandas-1.6.0


In [0]:
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import GaussianRandomProjection
from sklearn.preprocessing import MinMaxScaler
import numpy as np

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [0]:
# Add Feature Transformation.
#mapper = DataFrameMapper([
#   ('measurement_max_value', None)
#   ])
#output = mapper.fit_transform(df_features)

output = df_features.measurement_max_value
output = output.reshape(-1, 1)
output = np.nan_to_num(output)

# Select Initial Labels. The name is <person_id>:<visit_occurrence_id>

label= {'10004:279204' : 1}
count=len(df_features.person_occurrence_id)

print('number of samples %d' % count)

labels=np.zeros(count)

# Make the labels from the positives found.
for i in range(0, count):
  nid = df_features.person_occurrence_id[i]
  if nid in label:
   labels[i] = label[nid]
    
model = LogisticRegression(max_iter=1)
model.fit(output, labels)
predictions = model.predict_proba(output)

# Find the next sample for active learning
# This policy finds the most uncertain.
#candidate = -1
#candidate_score = 100
#for i in range(0, count):
#  if df.note_id[i] not in label:
#     score = abs(predictions[i][0] - 0.5)
#     if score < candidate_score:
#       candidate_score = score
#       candidate = i

# This policy finds the largest score
candidate = -1
candidate_score = 0
for i in range(0, count):
  if df.person_occurrence_id[i] not in label:
     score = predictions[i][0]
     if score > candidate_score:
       print(candidate_score)
       candidate_score = score
       candidate = i

print (labels[:10])
print (predictions[:10])
print ('next candidate is note_id %s with score %d' % (df_features.person_occurrence_id[candidate], candidate_score))

  This is separate from the ipykernel package so we can avoid doing imports until


number of samples 12349
0
0.8193982730652659
0.9676535893541223
0.9758764231332402
1.0
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[[0.81939827 0.18060173]
 [0.96765359 0.03234641]
 [0.77038756 0.22961244]
 [0.78345999 0.21654001]
 [0.91230887 0.08769113]
 [0.8505101  0.1494899 ]
 [0.84066223 0.15933777]
 [0.77038756 0.22961244]
 [0.96258258 0.03741742]
 [0.78345999 0.21654001]]
next candidate is note_id 10168:278075 with score 0


pd.read_gbq('select count(*) from `chc-mimic-data.mimic3_aphp.person`', project_id='chc-mimic-analysis', verbose=False, dialect='standard')

In [0]:
#df = pd.read_gbq('SELECT n.* FROM `chc-mimic-data.mimic3_aphp.note` as n inner join `chc-mimic-data.mimic3_aphp.person` as p ON n.person_id = p.person_id AND p.year_of_birth < 2000', project_id='chc-mimic-analysis', verbose=False, dialect='standard')
df = pd.read_gbq('SELECT * FROM `chc-mimic-data.mimic3_aphp.note` as n LIMIT 10000', project_id='chc-mimic-analysis', verbose=False, dialect='standard')




In [0]:
df.head(3)

Unnamed: 0,note_id,person_id,note_date,note_datetime,note_type_concept_id,note_class_concept_id,note_title,note_text,encoding_concept_id,language_concept_id,provider_id,visit_occurrence_id,note_source_value,visit_detail_id
0,55916380,62109874,2200-02-08,NaT,44814642,0,Report,Sinus tachycardia\nLow limb lead QRS voltages\...,0,40639385,,,ECG,
1,55774791,62090573,2125-11-13,NaT,44814642,0,Report,Baseline artiact. Sinus rhythm. Late R wave...,0,40639385,,,ECG,
2,55787713,62109307,2115-05-05,NaT,44814642,0,Report,Normal sinus rhythm. T wave inversions in lea...,0,40639385,,,ECG,


In [0]:

# Everything that contains tachycardia
df2 = df[df['note_text'].str.contains("tachycardia")]
coarse_label =  { id:1 for id in df2.note_id }
print (len(coarse_label))
df2.head(3)

1761


Unnamed: 0,note_id,person_id,note_date,note_datetime,note_type_concept_id,note_class_concept_id,note_title,note_text,encoding_concept_id,language_concept_id,provider_id,visit_occurrence_id,note_source_value,visit_detail_id
0,55916380,62109874,2200-02-08,NaT,44814642,0,Report,Sinus tachycardia\nLow limb lead QRS voltages\...,0,40639385,,,ECG,
10,55871339,62089531,2131-08-30,NaT,44814642,0,Report,Atrial fibrillation with a rapid ventricular r...,0,40639385,,,ECG,
12,55879509,62109320,2177-05-14,NaT,44814642,0,Report,Moderate baseline artifact. It is difficult t...,0,40639385,,,ECG,


In [0]:
!pip install sklearn-pandas



In [0]:
from sklearn_pandas import DataFrameMapper
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.random_projection import GaussianRandomProjection
import numpy as np


# Create the feacture vector for the notes.
vectorizer = CountVectorizer(min_df=2)
tfidf_mapper = DataFrameMapper([
   ('note_text', [vectorizer,TfidfTransformer()] )
   ])
tfidf = tfidf_mapper.fit_transform(df)
# Make the labels.
#label_manual = {55768615 : 0, 55905280 : 0, 55889502 : 0}
#label = dict( list(coarse_label.items()) + list(label_manual.items()) )
#label = {55916380 : 1}
label = {55916380 : 1, 55907059 : 0, 55898675 : 0, 55895646 : 0, 55913717 : 0, 55841543 : 0}

count = len(df.note_id)

labels = np.zeros(count)

# Make the labels from the positives found.
for i in range(0, count):
 nid = df.note_id[i]
 if nid in label:
   labels[i] = label[nid]
  
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Train a model for the active learning.
#model = SGDClassifier(loss='hinge', penalty='none',
#                     alpha=0.00, random_state=42,
#                     learning_rate='constant',
#                     eta0=1.0, fit_intercept=False, class_weight='balanced')
model = LogisticRegression(max_iter=1)
#model = RandomForestClassifier(max_depth=8, min_samples_leaf=2)
model.fit(tfidf, labels)
predictions = model.predict_proba(tfidf)

# Print the top 10 vocabulary
import operator
coefs = model.coef_[0]
index_coef = sorted( enumerate(coefs), key=operator.itemgetter(1))
model_size = len(index_coef)
print (index_coef[-1])
index_token = { index:token for token, index in vectorizer.vocabulary_.items() }
largest_index_list, largest_coef_list = zip( *index_coef )
largest_index_list = largest_index_list[-10:]
print (largest_index_list)
print( largest_coef_list[-10:])
print ([index_token[index] for index in largest_index_list])

# Find the next sample for active learning
# This policy finds the most uncertain.
#candidate = -1
#candidate_score = 100
#for i in range(0, count):
#  if df.note_id[i] not in label:
#     score = abs(predictions[i][0] - 0.5)
#     if score < candidate_score:
#       candidate_score = score
#       candidate = i

# This policy finds the largest score
candidate = -1
candidate_score = 0
for i in range(0, count):
  if df.note_id[i] not in label:
     score = predictions[i][0]
     if score > candidate_score:
       candidate_score = score
       candidate = i
      
print (labels[:10])
print (predictions[:10])
print ('next candidate is note_id %d with score %s' % (df.note_id[candidate], candidate_score))
print (df.note_text[candidate])



(448, -7.154425381177548e-05)
(658, 334, 569, 376, 831, 327, 344, 980, 299, 448)
(-8.799776675255798e-05, -8.57926249759199e-05, -8.516203181179935e-05, -8.397755224623552e-05, -8.317146365079811e-05, -8.276237924810359e-05, -8.155700938934391e-05, -8.147290562491681e-05, -7.44114906286239e-05, -7.154425381177548e-05)
['fulfilled', 'artery', 'easily', 'beginning', 'measured', 'approximate', 'association', 'phase', 'altogether', 'conducting']
[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[[0.88084945 0.11915055]
 [0.87967547 0.12032453]
 [0.88182431 0.11817569]
 [0.87607135 0.12392865]
 [0.87983968 0.12016032]
 [0.8790813  0.1209187 ]
 [0.89168967 0.10831033]
 [0.87824083 0.12175917]
 [0.88156309 0.11843691]
 [0.87625436 0.12374564]]
next candidate is note_id 55810120 with score 0.895021237935905
Artifact is present.  Sinus rhythm.  Right bundle-branch block.  There are
non-diagnostic Q waves in the inferior leads.  Non-specific ST-T wave changes.
Compared to the previous tracing there is no signific

In [0]:
# Print the top 10 vocabulary
import operator
coefs = model.coef_[0]
index_coef = sorted( enumerate(coefs), key=operator.itemgetter(1))
model_size = len(index_coef)
print (index_coef[-1])
index_token = { index:token for token, index in vectorizer.vocabulary_.items() }
largest_index_list, largest_coef_list = zip( *index_coef )
largest_index_list = largest_index_list[-10:]
print (largest_index_list)
print( largest_coef_list[-10:])
print ([index_token[index] for index in largest_index_list])



(663, 3.078645526813331e-05)
(2631, 3644, 3790, 1740, 1397, 5567, 5496, 2717, 410, 663)
(-6.338658248330362e-05, -6.338658248330362e-05, -6.338658248330362e-05, -5.7403106166100744e-05, -5.675202462560105e-05, -5.16340370512641e-05, -5.11760842866709e-05, -2.61057350408056e-05, 2.1411983988304607e-05, 3.078645526813331e-05)
['freeca', 'metas', 'myelos', 'connected', 'bump', 'ten', 'syrup', 'glasses', '304', '719']


In [0]:
vectorizer.vocabulary_


{'age': 889,
 'over': 4092,
 '90': 743,
 'man': 3556,
 'with': 6114,
 'cad': 1416,
 'cabg': 1414,
 'avr': 1173,
 '58': 587,
 'years': 6165,
 'prior': 4467,
 'and': 975,
 'mi': 3657,
 '2158': 298,
 'as': 1092,
 'well': 6077,
 'gerd': 2701,
 'prostate': 4515,
 'ca': 1413,
 'gib': 2708,
 'initially': 3148,
 'admitted': 863,
 'from': 2642,
 'rehab': 4757,
 'to': 5669,
 'osh': 4072,
 'on': 4019,
 '14': 128,
 'weakness': 6062,
 'lethargy': 3399,
 'found': 2619,
 'have': 2816,
 'fevers': 2509,
 'mrsa': 3755,
 'bacteremia': 1190,
 'epidural': 2338,
 'ess': 2358,
 'at': 1127,
 't4': 5508,
 'cord': 1801,
 'compression': 1711,
 't2': 5506,
 't3': 5507,
 'diskitis': 2105,
 'osteomyelitis': 4076,
 'hospital': 2929,
 'chf': 1550,
 'requiring': 4836,
 'diuresis': 2130,
 'cxr': 1892,
 'consistent': 1749,
 'pulmonary': 4559,
 'edema': 2254,
 'per': 4223,
 'he': 2830,
 'had': 2798,
 'been': 1241,
 'relatively': 4767,
 'independent': 3092,
 'adls': 856,
 'until': 5868,
 '10': 37,
 'when': 6089,
 'was': 6