# EHR Data Profiler
## Run the next cell to make all the imports, which include Pandas and the EHR data analysis functions:

In [None]:
# pandas reads each CSV table in this notebook into a dataframe.
import pandas as pd
# NOTE(review): plt is not referenced directly by the cells visible here —
# presumably kept for ad-hoc plotting; confirm before removing.
import matplotlib.pyplot as plt
# Wildcard import supplies the profiler helpers called below (check_dups, missingness,
# catbar, numstats, dateline, flow_stats, lab_stats) and the documented text_search.
from lib.ehr_dp_lib import *

## The following cells in this notebook are auto-generated from the data in the 'Data' folder
## For each table, a Pandas dataframe is created to hold that table's data

### Below is a list of the EHR Data Profiler functions, arguments, and descriptions:

- **check_dups( *dataframe name* )**:  Checks for duplicate rows in the dataframe. If none are found it outputs 'No duplicates'; otherwise it returns a dataframe of the duplicate rows.


- **missingness( *dataframe name* )**: Returns a dataframe of the number of null values per column.


- **catbar( *dataframe name, column name, graph=(True or False)*)**: \[Generated on *categorical* data type only\] Returns a dataframe of counts of all the groups of categories in the specific column in the dataframe. When the `graph` argument is set to `True`, it returns a bar graph.


- **numstats( *dataframe name, column name* )**: \[Generated on *number* data type only\] Returns a dataframe of descriptive statistics (ie. mean, max, min, median, quartiles) for the column data.


- **dateline( *dataframe name, column name* )**: \[Generated on *date* data type only\] Returns a line graph of the frequency of specific dates along an x-axis of time.


- **flow_stats( *flowsheet dataframe* )**: \[Generated only if Flowsheet_Vitals.csv table in Data folder\] Returns a dataframe of descriptive statistics for common vital sign types (ie. Height, Weight, Temperature, Sp02, Pulse, BMI, Respirations).


- **lab_stats( *lab dataframe, top=(10 or greater)* )**: \[Generated only if Labs.csv table in Data folder\] Returns a dataframe of descriptive statistics for top lab procedures in dataset. The `top` argument can be adjusted to capture more lab procedures.

## Using TEXT_SEARCH

### Another useful function included is 'text_search'. It is a useful way to search specific columns in dataframes for text and return only those rows that contain the text.

- **text_search( *dataframe name, column name, text to search, ignore case=(True by default can also be set to False)* )**


### Example:
If you wanted to search Patient Demographics data for patients whose 'ETHNICITY' contains the text 'latino' using text_search:
`text_search(patient_demographics_df, 'ETHNICITY', 'latino')`

Result:
![latino_search.PNG](lib/latino_search.PNG)


## Combining TEXT_SEARCH with other functions:
### You can also combine functions to get a specific analytical calculation.

### Example:
Suppose you wanted the counts of each `SEX` category (e.g. Male, Female) among the patients returned by the previous 'latino' search. First, assign the result of `text_search` to a new variable, in this case `latino_pats`, then pass that variable to `catbar`:

```python
latino_pats = text_search(patient_demographics_df, 'ETHNICITY', 'latino')
catbar(latino_pats, 'SEX', graph=True)
```

Result:
![latino_gender_search.PNG](lib/latino_gender_search.PNG)


## PATIENT_DEMOGRAPHICS

In [None]:
# Load the Patient_Demographics table from the 'Data' folder into a dataframe.
patient_demographics_df = pd.read_csv('Data/Patient_Demographics.csv')
# Bare last expression so the notebook renders the dataframe as this cell's output.
# NOTE(review): the output shows row-level patient data — clear outputs before sharing.
patient_demographics_df

In [None]:
# Duplicate-row check on the demographics table: outputs 'No duplicates' or returns the duplicated rows.
check_dups(patient_demographics_df)

In [None]:
# Null-value count for each column of the demographics table.
missingness(patient_demographics_df)

In [None]:
# Counts of each LANGUAGE category in the demographics table.
catbar(patient_demographics_df, 'LANGUAGE', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each SEX category in the demographics table.
catbar(patient_demographics_df, 'SEX', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each MARITAL_STATUS category in the demographics table.
catbar(patient_demographics_df, 'MARITAL_STATUS', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each ETHNICITY category in the demographics table.
catbar(patient_demographics_df, 'ETHNICITY', graph=False) ## Set graph=True for Bar graph

In [None]:
# Descriptive statistics (mean, max, min, median, quartiles) for the numeric AGE column.
numstats(patient_demographics_df, 'AGE')

In [None]:
# Counts of each RELIGION category in the demographics table.
catbar(patient_demographics_df, 'RELIGION', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each RACE category in the demographics table.
catbar(patient_demographics_df, 'RACE', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each SEXUAL_ORIENTATION category in the demographics table.
catbar(patient_demographics_df, 'SEXUAL_ORIENTATION', graph=False) ## Set graph=True for Bar graph

## ENCOUNTERS

In [None]:
# Load the Encounters table from the 'Data' folder into a dataframe.
encounters_df = pd.read_csv('Data/Encounters.csv')
# Bare last expression so the notebook renders the dataframe as this cell's output.
# NOTE(review): the output shows row-level EHR records — clear outputs before sharing.
encounters_df

In [None]:
# Duplicate-row check on the encounters table: outputs 'No duplicates' or returns the duplicated rows.
check_dups(encounters_df)

In [None]:
# Null-value count for each column of the encounters table.
missingness(encounters_df)

In [None]:
# Line graph of how often each ENCOUNTER_DATE value occurs, plotted along a time axis.
dateline(encounters_df, 'ENCOUNTER_DATE')

In [None]:
# Descriptive statistics (mean, max, min, median, quartiles) for the numeric ENCOUNTER_AGE column.
numstats(encounters_df, 'ENCOUNTER_AGE')

In [None]:
# Counts of each EPIC_ENCOUNTER_TYPE category in the encounters table.
catbar(encounters_df, 'EPIC_ENCOUNTER_TYPE', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each IP_VISIT_TYPE category in the encounters table.
catbar(encounters_df, 'IP_VISIT_TYPE', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each EPIC_DEPARTMENT_NAME category in the encounters table.
catbar(encounters_df, 'EPIC_DEPARTMENT_NAME', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each HOSP_DISCHARGE_DISPOSITION category in the encounters table.
catbar(encounters_df, 'HOSP_DISCHARGE_DISPOSITION', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each ED_DISPOSITION category in the encounters table.
catbar(encounters_df, 'ED_DISPOSITION', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each DEPARTMENT_SPECIALTY category in the encounters table.
catbar(encounters_df, 'DEPARTMENT_SPECIALTY', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each LOCATION category in the encounters table.
catbar(encounters_df, 'LOCATION', graph=False) ## Set graph=True for Bar graph

## ENCOUNTER_DIAGNOSES

In [None]:
# Load the Encounter_Diagnoses table from the 'Data' folder into a dataframe.
encounter_diagnoses_df = pd.read_csv('Data/Encounter_Diagnoses.csv')
# Bare last expression so the notebook renders the dataframe as this cell's output.
# NOTE(review): the output shows row-level EHR records — clear outputs before sharing.
encounter_diagnoses_df

In [None]:
# Duplicate-row check on the encounter diagnoses table: outputs 'No duplicates' or returns the duplicated rows.
check_dups(encounter_diagnoses_df)

In [None]:
# Null-value count for each column of the encounter diagnoses table.
missingness(encounter_diagnoses_df)

In [None]:
# Line graph of how often each DIAGNOSIS_DATE value occurs, plotted along a time axis.
dateline(encounter_diagnoses_df, 'DIAGNOSIS_DATE')

In [None]:
# Counts of each PRESENT_ON_ADMISSION category in the encounter diagnoses table.
catbar(encounter_diagnoses_df, 'PRESENT_ON_ADMISSION', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each ADMISSION_DIAGNOSIS_FLAG category in the encounter diagnoses table.
catbar(encounter_diagnoses_df, 'ADMISSION_DIAGNOSIS_FLAG', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each HOSPITAL_FINAL_DIAGNOSIS category in the encounter diagnoses table.
catbar(encounter_diagnoses_df, 'HOSPITAL_FINAL_DIAGNOSIS', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each PRIMARY_DIAGNOSIS_FLAG category in the encounter diagnoses table.
catbar(encounter_diagnoses_df, 'PRIMARY_DIAGNOSIS_FLAG', graph=False) ## Set graph=True for Bar graph

## PROCEDURES

In [None]:
# Load the Procedures table from the 'Data' folder into a dataframe.
procedures_df = pd.read_csv('Data/Procedures.csv')
# Bare last expression so the notebook renders the dataframe as this cell's output.
# NOTE(review): the output shows row-level EHR records — clear outputs before sharing.
procedures_df

In [None]:
# Duplicate-row check on the procedures table: outputs 'No duplicates' or returns the duplicated rows.
check_dups(procedures_df)

In [None]:
# Null-value count for each column of the procedures table.
missingness(procedures_df)

In [None]:
# Line graph of how often each PROCEDURE_DATE value occurs, plotted along a time axis.
dateline(procedures_df, 'PROCEDURE_DATE')

In [None]:
# Counts of each PROCEDURE_DESCRIPTION category in the procedures table.
catbar(procedures_df, 'PROCEDURE_DESCRIPTION', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each PROCEDURE_CODE category in the procedures table.
catbar(procedures_df, 'PROCEDURE_CODE', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each PROCEDURE_TYPE category in the procedures table.
catbar(procedures_df, 'PROCEDURE_TYPE', graph=False) ## Set graph=True for Bar graph

## FLOWSHEET_VITALS

In [None]:
# Load the Flowsheet_Vitals table from the 'Data' folder into a dataframe.
flowsheet_vitals_df = pd.read_csv('Data/Flowsheet_Vitals.csv')
# Bare last expression so the notebook renders the dataframe as this cell's output.
# NOTE(review): the output shows row-level EHR records — clear outputs before sharing.
flowsheet_vitals_df

In [None]:
# Duplicate-row check on the flowsheet vitals table: outputs 'No duplicates' or returns the duplicated rows.
check_dups(flowsheet_vitals_df)

In [None]:
# Null-value count for each column of the flowsheet vitals table.
missingness(flowsheet_vitals_df)

In [None]:
# Line graph of how often each VITAL_SIGN_TAKEN_TIME value occurs, plotted along a time axis.
dateline(flowsheet_vitals_df, 'VITAL_SIGN_TAKEN_TIME')

In [None]:
# Counts of each VITAL_SIGN_TYPE category in the flowsheet vitals table.
catbar(flowsheet_vitals_df, 'VITAL_SIGN_TYPE', graph=False) ## Set graph=True for Bar graph

In [None]:
# Descriptive statistics for the common vital sign types in the flowsheet vitals table.
flow_stats(flowsheet_vitals_df)

## LABS

In [None]:
# Load the Labs table from the 'Data' folder into a dataframe.
labs_df = pd.read_csv('Data/Labs.csv')
# Bare last expression so the notebook renders the dataframe as this cell's output.
# NOTE(review): the output shows row-level EHR records — clear outputs before sharing.
labs_df

In [None]:
# Duplicate-row check on the labs table: outputs 'No duplicates' or returns the duplicated rows.
check_dups(labs_df)

In [None]:
# Null-value count for each column of the labs table.
missingness(labs_df)

In [None]:
# Line graph of how often each ORDER_TIME value occurs, plotted along a time axis.
dateline(labs_df, 'ORDER_TIME')

In [None]:
# Counts of each PROCEDURE_CODE category in the labs table.
catbar(labs_df, 'PROCEDURE_CODE', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each COMPONENT_NAME category in the labs table.
catbar(labs_df, 'COMPONENT_NAME', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each PROCEDURE_DESCRIPTION category in the labs table.
catbar(labs_df, 'PROCEDURE_DESCRIPTION', graph=False) ## Set graph=True for Bar graph

In [None]:
# Descriptive statistics for the top 10 lab procedures; increase `top` to capture more.
lab_stats(labs_df, top=10)

## MEDICATIONS

In [None]:
# Load the Medications table from the 'Data' folder into a dataframe.
medications_df = pd.read_csv('Data/Medications.csv')
# Bare last expression so the notebook renders the dataframe as this cell's output.
# NOTE(review): the output shows row-level EHR records — clear outputs before sharing.
medications_df

In [None]:
# Duplicate-row check on the medications table: outputs 'No duplicates' or returns the duplicated rows.
check_dups(medications_df)

In [None]:
# Null-value count for each column of the medications table.
missingness(medications_df)

In [None]:
# Line graph of how often each ORDER_DATE value occurs, plotted along a time axis.
dateline(medications_df, 'ORDER_DATE')

In [None]:
# Counts of each EPIC_MEDICATION_NAME category in the medications table.
catbar(medications_df, 'EPIC_MEDICATION_NAME', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each MEDISPAN_GENERIC_NAME category in the medications table.
catbar(medications_df, 'MEDISPAN_GENERIC_NAME', graph=False) ## Set graph=True for Bar graph

In [None]:
# Counts of each MEDISPAN_CLASS_NAME category in the medications table.
catbar(medications_df, 'MEDISPAN_CLASS_NAME', graph=False) ## Set graph=True for Bar graph