# Part D - Identifying Obese Patients


In [0]:
# First off - load all the silly python libraries we are going to need
import pandas as pd
import numpy as np
import os

from google.colab import auth
from google.cloud import bigquery
from google.colab import files

In [0]:
auth.authenticate_user()  # Authenticate this Colab session so that it is allowed to access BigQuery

In [0]:
# Google Cloud project that the queries are run under; it is also exported
# through the environment variable GOOGLE_CLOUD_PROJECT.
project_id='hst-953-2018'
os.environ["GOOGLE_CLOUD_PROJECT"]=project_id


def run_query(query):
    """Execute a SQL query on BigQuery and return the result as a pandas DataFrame.

    Args:
        query: Text of the SQL statement to run. Legacy SQL is switched off
            in the job configuration, so the statement is read as standard SQL.

    Returns:
        The rows produced by the query, as a pandas DataFrame.
    """
    # NOTE: the old `verbose=False` argument is intentionally not passed.
    # It was deprecated in pandas-gbq 0.4.0 and later removed from
    # pandas.read_gbq, so passing it raises TypeError on current pandas.
    job_config = {'query': {'useLegacySql': False}}
    return pd.io.gbq.read_gbq(query, project_id=project_id, configuration=job_config)

In [0]:
# Now load the data. In general you'd load the whole set of notes but that would take
# several minutes, so for this example we only use a subset: the query below keeps
# just the rows whose CATEGORY is 'Discharge summary'.
# Alternative kept from the original notebook (reads the notes from a CSV file instead):
#notes = pd.read_csv('D.csv')
notes = run_query('''
    SELECT *
    FROM `physionet-data.mimiciii_notes.noteevents`
    WHERE CATEGORY = 'Discharge summary'
    ''')


In [0]:
# Show the first few rows of the loaded notes as a quick sanity check
notes.head()

In [0]:
# Load the gold standard: every column of the obese_gold table.
# Alternative kept from the original notebook (reads the gold standard from a CSV file instead):
#gold = pd.read_csv('obese-gold.csv')
gold = run_query('''
    SELECT *  
    FROM `hst-953-2018.NLP_workshop.obese_gold` 
    ''')


In [0]:
# Here is the list of terms we are going to consider "good".
# Each term is later looked up as a plain, case-sensitive substring of the note text.
terms = ['obese']

In [0]:
# Walk over every note and keep the SUBJECT_ID of each note whose TEXT contains
# at least one of the search terms (plain, case-sensitive substring test).
# One entry is stored per matching note, so a patient with several matching
# notes appears several times.
matches = [
    note['SUBJECT_ID']
    for _, note in notes.iterrows()
    if any(term in note['TEXT'] for term in terms)
]

match_count = str(len(matches))
print("Found " + match_count + " matching notes.")

Found 4934 matching notes.


In [0]:
# For the patients in those notes, set "obese" to true (1) in the results
myscores = gold.copy()
myscores['obese_1'] = 0 # This sets them all to unknown

# Flag every matched patient with a single vectorized membership test, rather
# than rescanning the whole column once per (possibly repeated) match.
myscores.loc[myscores["subject_id"].isin(matches), 'obese_1'] = 1


In [0]:
# Compute your score.
# Each prediction in myscores is compared with the gold label stored under the
# same index: 0 is counted as skipped, 1 / -1 are tallied as true or false
# positives / negatives, and the final score is correct minus wrong answers.

skipped = truepositive = falsepositive = truenegative = falsenegative = 0

for idx, predicted in myscores['obese_1'].items():
    # No answer was given for this patient
    if predicted == 0:
        skipped += 1
        continue

    actual = gold.loc[idx]['obese_1']
    if predicted == 1 and actual == 1:
        truepositive += 1
    elif predicted == -1 and actual == -1:
        truenegative += 1
    elif predicted == 1 and actual == -1:
        falsepositive += 1
    elif predicted == -1 and actual == 1:
        falsenegative += 1

report = [
    ("Skipped:\t", skipped),
    ("True Pos:\t", truepositive),
    ("True Neg:\t", truenegative),
    ("False Pos:\t", falsepositive),
    ("False Neg:\t", falsenegative),
    ("SCORE:\t\t", truepositive + truenegative - falsepositive - falsenegative),
]
for label, value in report:
    print (label + str(value))