# DATASET for Q1

In [78]:
import pandas as pd
import numpy as np
import datetime

In [79]:
#import all datasets
allergiesCSV = pd.read_csv("../data/allergies.csv")
observationsCSV = pd.read_csv("../data/observations.csv")
acsCSV = pd.read_csv("../data/ACS.csv")
careplansCSV = pd.read_csv("../data/careplans.csv")
encountersCSV = pd.read_csv("../data/encounters.csv")
immunizationsCSV = pd.read_csv("../data/immunizations.csv")
procedutesCSV = pd.read_csv("../data/procedures.csv")
zipCSV = pd.read_csv("../data/zip_to_zcta_2019.csv")
conditionsCSV = pd.read_csv("../data/conditions.csv")
medicationsCSV = pd.read_csv("../data/medications.csv")
patientsCSV = pd.read_csv("../data/patients.csv")

In [80]:
#we only need data from MA, cleaning for necessary data
massZipCSV = zipCSV[zipCSV.STATE == "MA"]                                 #contains only MA ZCTA 
zipAndIncome = massZipCSV.merge(acsCSV,left_on='ZCTA', right_on='GEO.id2')
zipAndIncome = zipAndIncome.replace(['2,500-',"***",'-','**'],-1)         #necessary for converting to int so it can operated on


In [81]:
zipDrop = ['ZIP_CODE','PO_NAME','STATE','ZIP_TYPE','zip_join_type','GEO.id','GEO.id2','GEO.display-label','HC01_EST_VC13','HC01_MOE_VC13','HC02_EST_VC13','HC02_MOE_VC13','HC03_EST_VC13','HC03_MOE_VC13','HC04_EST_VC13','HC04_MOE_VC13']
zaiMerge = zipAndIncome.copy().drop(zipDrop,axis=1)

#Values are imported as strings. Converted to ints to run regression
zaiMerge['HouseholdIncome'] = zipAndIncome['HC01_EST_VC13'].astype(int)
zaiMerge['HouseholdMOE'] = zipAndIncome['HC01_MOE_VC13'].astype(int)       #zaiMerge is a table with only ZCTA, Household Median Income, & its MOE

#zaiMerge['Family'] = zipAndIncome[['HC02_EST_VC13','HC02_MOE_VC13']].values.tolist()
#zaiMerge['Married'] = zipAndIncome[['HC03_EST_VC13','HC03_MOE_VC13']].values.tolist()
#zaiMerge['Nonfamily'] = zipAndIncome[['HC04_EST_VC13','HC04_MOE_VC13']].values.tolist()  #could be interesting for later use


In [82]:
patientsCSV = patientsCSV.drop(['Unnamed: 0', 'SSN','DRIVERS','PASSPORT','PREFIX','FIRST', 'LAST', 'SUFFIX', 'MAIDEN'],axis=1)    #removed data to clean data and make it visually easier to understand and to ensure privacy
patientsCSV['HOMEZIP'] = patientsCSV['ADDRESS'].map(lambda x:'0'+str(x)[-7:-3])    #lambda fxn grabs zip code. This works since the format an address is fixed

#maps each cell to respective income through matching zipcodes
patientsCSV['INCOME'] = patientsCSV['HOMEZIP'].map(lambda x: -1 if x =='0' or x not in zaiMerge['ZCTA'].unique() else zaiMerge.loc[zaiMerge['ZCTA']== x]['HouseholdIncome'].values[0])
patientsCSV['MOE'] = patientsCSV['HOMEZIP'].map(lambda x: -1 if x =='0' or x not in zaiMerge['ZCTA'].unique() else zaiMerge.loc[zaiMerge['ZCTA']== x]['HouseholdMOE'].values[0])

patientsCSV['DEATHDATE'] = patientsCSV['DEATHDATE'].fillna(0) 



In [83]:
emergencyCodes = [50849002,183460006,183452005,183478001,34285007,183495009,32485007,305408004,305411003]

emergency = pd.read_csv("../data/encounters.csv")
emergency = emergency.drop('Unnamed: 0', axis=1)

#Clean encounters that are too old or not considered an emergency visit
emergency = emergency[emergency['CODE'].isin(emergencyCodes)]
emergency = emergency[emergency['DATE'].between('2008', '2017')]

#last only has most recent encounters for each patient
last = emergency.drop_duplicates("PATIENT",keep='last')
last['DEATHDATE'] = last['PATIENT'].map(lambda x: patientsCSV[patientsCSV["ID"]==x]['DEATHDATE'].values[0])

#Maps date of death for each patient to their encounter
emergency['DEATHDATE'] = emergency['ID'].map(lambda x: last.loc[last['ID']==x]['DEATHDATE'].values[0] if x in last['ID'].unique() else 0)

#I noticed encounters.CSV was missing a lot of reason descriptions that conditions.CSV had,
#so this for loop fills in missing information
for index,row in emergency.iterrows():
    if isinstance(row['REASONDESCRIPTION'],float):                        #only applied to encounters that are not filled. will not refill a cell. 
        date = row['DATE']
        find = conditionsCSV[conditionsCSV['PATIENT'] == row['PATIENT']]  #find has all of rows that pertain to patient and reason narrows it to the particular encounter
        reason = find[find['START'] == date]['DESCRIPTION']               #assuming someone doesn't visit the hospital in the same day for a different reason
        code = find[find['START'] == date]['CODE']                        #also assuming the hospital records all visit and didn't visit double visits if it occured
        try:
            emergency['REASONDESCRIPTION'][index] = reason.values[0]
            emergency['REASONCODE'][index] = code.values[0]
        except:
            emergency['REASONDESCRIPTION'][index] = np.nan
            emergency['REASONCODE'][index] = np.nan





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documenta

In [84]:
#Converted to date_time in order to do calculations on the dates
emergency['DATE']=pd.to_datetime(emergency['DATE'])
emergency['DEATHDATE']=pd.to_datetime(emergency['DEATHDATE'])

#Finds amount of time between emergency visit and death
emergency['DEATHDIFFERENCE'] = emergency['DEATHDATE']-emergency['DATE']
#Dummy variable for all patients who passed away 1 year within visit
emergency['YearDeath'] = emergency['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=365) else 0)
emergency.loc[(emergency['DEATHDIFFERENCE'] < datetime.timedelta(days=0)),'DEATHDIFFERENCE'] = np.nan
#DEATH is a dummy variable for deaths post discharge, this removes deaths that happen on day 0 since that wouldn't be considered post discharge.
emergency['DEATH'] = emergency['DEATHDIFFERENCE'].map(lambda x: 0 if x == datetime.timedelta(days=0) or x!=x else 1)
#I think this is more indicative of mortality related to post-discharge. 
emergency['DEATH100'] = emergency['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=100) else 0)
emergency['DEATH60']  = emergency['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=60) else 0)
emergency['DEATH30']  = emergency['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=30) else 0)


In [85]:
LOW_LINE = 56763.2 #low income threshole
#Adding patient information for each encounter. For regression and analysis
emergency['RACE'] = emergency['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['RACE'].values[0])
emergency['AGE'] = emergency['PATIENT'].map(lambda x: int(patientsCSV.loc[patientsCSV['ID']==x]['AGE'].values[0]))
emergency['HOMEZIP'] = emergency['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['HOMEZIP'].values[0])
emergency['INCOME'] = emergency['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['INCOME'].values[0])
emergency['MOE'] = emergency['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['MOE'].values[0])

#Create dummy variables for race for regression
emergency['BLACK'] = emergency['RACE'].map(lambda x: 1 if x == 'black or african american' or x=='black' else 0)
emergency['HISPANIC'] = emergency['RACE'].map(lambda x: 1 if x == 'hispanic' else 0)
emergency['ASIAN'] = emergency['RACE'].map(lambda x: 1 if x == 'asian' else 0)
emergency['WHITE'] = emergency['RACE'].map(lambda x: 1 if x == 'white' else 0)

#Dummy variable for if patient is considered low income. Low income is 80% of state median
emergency['LOWINCOME'] = emergency['INCOME'].map(lambda x: 1 if x < LOW_LINE else 0)
#Replacing NA was necessary for the above lambda fxn, however NA is necessary for regression since 0 will affect the regression
emergency['INCOME'] = emergency['INCOME'].replace(-1,np.nan)

emergency['SENIOR'] = emergency['AGE'].map(lambda x: 1 if x>=65 else 0)
emergency['MINORITY'] = emergency['RACE'].map(lambda x: 1 if x=='asian' or x=='hispanic' or x=='black' or x=='black or african american' else 0)


In [91]:
emergency.to_csv('../data/EmergencyVisits.csv')

In [96]:
print(emergency.shape[0])
print(encountersCSV.shape[0])

989


# 1. Extra Analysis 

In [86]:
#Matches patient BMI to patient and encounter
BMI = observationsCSV[observationsCSV['DESCRIPTION']=='Body Mass Index']
BMI["VALUE"] = BMI['VALUE'].astype(float)
patients = BMI['PATIENT'].unique() 
emergency['BMI'] = emergency['PATIENT'].map(lambda x: BMI[BMI['PATIENT']==x]['VALUE'].values[0] if x in patients else -1)
                                                         

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [87]:
#Homeless dummy variable
homeless = observationsCSV[observationsCSV['DESCRIPTION']=='Housing status']
patients = homeless['PATIENT'].unique()
emergency['HOMELESS'] = emergency['PATIENT'].map(lambda x: 1 if x in patients else 0)        #Patients with a 'Housing status' description were all homeless



In [88]:
#Variable for Diastolic Blood Pressure
DBP = observationsCSV[observationsCSV['DESCRIPTION']=='Diastolic Blood Pressure']
patient = DBP['PATIENT'].unique()
emergency['DIASTOLICBP'] = emergency['PATIENT'].map(lambda x: float(DBP[DBP['PATIENT']==x]['VALUE'].values[0]) if x in patient else -1)
emergency['BADBP'] = emergency['DIASTOLICBP'].map(lambda x: 1 if x > 80 else 0)



In [89]:
#Variable for Calcium levels
Calcium = observationsCSV[observationsCSV['DESCRIPTION']=='Calcium']
patient = Calcium['PATIENT'].unique()
emergency['CALCIUM'] = emergency['PATIENT'].map(lambda x: float(Calcium[Calcium['PATIENT']==x]['VALUE'].values[0]) if x in patient else -1)


In [90]:
#Finds patients who got the flu shot within 6 months of their most recent visit to the hospital. 
emergency['FLU']=None        #1 if patient recieved flu shot w/in 6 months of visit, 0 otherwise
emergency['FLU_DIFF']=None   #difference between time of flu shot and time of visit, if 0 days then they visited the hospital for the flu shot/happened to get it

flu = immunizationsCSV[immunizationsCSV['CODE'] == 140]   #CODE 140 is the code for influenza immunization                                                     
patients = flu['PATIENT'].unique()

for index, row in emergency.iterrows():       
    checked = 0     #keeps track of if the inner for loop found a matching immunization
    patientID = row['PATIENT']
    if patientID in patients:    #if patient is in the flu dataset, they had an immunization
        date = flu[flu['PATIENT']==patientID]['DATE']   #list of all dates they recieved influenza shot
        for d in date:
            d = datetime.datetime.strptime(d, '%Y-%m-%d')
            diff = row['DATE'] - d
            if datetime.timedelta(days=0) <= diff < datetime.timedelta(days=183):    #an influenza shot lasts around 6 months/183 days
                checked = 1
                emergency['FLU'][index]=1
                emergency['FLU_DIFF'][index]=diff    #can help us determine cause of visit if there is none (i.e. if =0 days, then the visit was for the shot)
        if checked == 0:
            emergency['FLU'][index]=0
            emergency['FLU_DIFF'][index]=-1
    else:
            emergency['FLU'][index]=0
            emergency['FLU_DIFF'][index]=-1
            
#I found that a lot of the encounter dates in encounters.csv coincided with 
#dates (found in immunizations.csv) that the patient got a flu shot. They all 
#coincided with Outpatient Encounters without ReasonDescriptions. So I assume 
#the reason they went was for the flu shot and not some hidden sickness/emergency

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# 2. Rates

## 2.1 Post Discharge Mortality Rate

In [24]:
seniors = emergency[emergency['SENIOR']==1]
notSeniors = emergency[emergency['SENIOR']==0]
minority = emergency[emergency['MINORITY']==1]
notMinority = emergency[emergency['MINORITY']==0]
hispanic = emergency[emergency['HISPANIC']==1]
notHispanic = emergency[emergency['HISPANIC']==0]
asian = emergency[emergency['ASIAN']==1]
notAsian = emergency[emergency['ASIAN']==0]
lowInc = emergency[emergency['LOWINCOME']==1]
notLowInc = emergency[emergency['LOWINCOME']==0]

#post-discharge 100 day mortalty rate for seniors
seniorMortRate100 = seniors[seniors['DEATH100']==1].shape[0]/seniors.shape[0]      #3.3%
notMortRate100 = notSeniors[notSeniors['DEATH100']==1].shape[0]/notSeniors.shape[0]    #0%

#post-discharge 1 year mortalty rate for seniors
seniorMortRate365 = seniors[seniors['YearDeath']==1].shape[0]/seniors.shape[0]       #7.1%
notMortRate365 = notSeniors[notSeniors['YearDeath']==1].shape[0]/notSeniors.shape[0]   #0.9%

#post-discharge 1 year mortalty rate for minorities
minorityMortRate = minority[minority['YearDeath']==1].shape[0]/minority.shape[0]       #2.1%
notMinorityMortRate = notMinority[notMinority['YearDeath']==1].shape[0]/notMinority.shape[0]  #1.1%

#post-discharge 100 day mortalty rate for hispanic patients
hispanicMortRate100 = hispanic[hispanic['DEATH100']==1].shape[0]/hispanic.shape[0]       #0.67%
notHispanicMortRate100 = notHispanic[notHispanic['DEATH100']==1].shape[0]/notHispanic.shape[0]   #0.41%

#post-discharge 1 year mortalty rate for hispanic patients
hispanicMortRate = hispanic[hispanic['YearDeath']==1].shape[0]/hispanic.shape[0]       #2%
notHispanicMortRate = notHispanic[notHispanic['YearDeath']==1].shape[0]/notHispanic.shape[0]   #2.1%

#post-discharge 100 day mortalty rate for asian patients
asianMortRate100 = asian[asian['DEATH100']==1].shape[0]/asian.shape[0]       #1.6%
notAsianMortRate100 = notAsian[notAsian['DEATH100']==1].shape[0]/notAsian.shape[0]   #0.54%

#post-discharge 1 year mortalty rate for asian patients
asianMortRate = asian[asian['YearDeath']==1].shape[0]/asian.shape[0]       #6.5%
notAsianMortRate = notAsian[notAsian['YearDeath']==1].shape[0]/notAsian.shape[0]   #1.7%

#post-discharge 100 day mortalty rate for low income patients
lowIncMortRate100 = lowInc[lowInc['DEATH100']==1].shape[0]/lowInc.shape[0]       #.49
notLowIncMortRate100 = notLowInc[notLowInc['DEATH100']==1].shape[0]/notLowInc.shape[0]   #0.69%

#post-discharge 1 year mortalty rate for low income patients
lowIncMortRate = lowInc[lowInc['YearDeath']==1].shape[0]/lowInc.shape[0]       #2%
notLowIncMortRate = notLowInc[notLowInc['YearDeath']==1].shape[0]/notLowInc.shape[0]   #2.1%


## 2.2 % of Minority, Low Income Patients who are Seniors

In [26]:
minority = emergency[emergency['MINORITY']==1]
minorityLow = minority[minority['LOWINCOME']==1]
fatalMinority = minorityLow[minorityLow['YearDeath']==1]
fatalMinority30 = minorityLow[minorityLow['DEATH30']==1]
senior = fatalMinority[fatalMinority['SENIOR']==1]
senior30 = fatalMinority30[fatalMinority30['SENIOR']==1]

percentSenior = senior.shape[0]/fatalMinority.shape[0]  #57.1% of minority, low income patients who died within 1 year of discharge were elderly
percentSenior30 = senior30.shape[0]/fatalMinority30.shape[0] #100% of minority, low income patients who passed away within 30 days of discharge were elderly


## 2.3 Characteristics of patients who passed away w/in 1 year of discharge

In [34]:
mortality = emergency[emergency['YearDeath']==1]

hispanicMort = mortality[mortality['HISPANIC']==1].shape[0]/mortality.shape[0]  #75% of patients who passed away w/in 1 year of discharge were hispanic
hispanicRatio = emergency[emergency['HISPANIC']==1].shape[0]/emergency.shape[0] #75.5%

seniorMort = mortality[mortality['SENIOR']==1].shape[0]/mortality.shape[0]      #65% of patients who passed away w/in 1 year of discharge were seniors
seniorRatio = emergency[emergency['SENIOR']==1].shape[0]/emergency.shape[0]     #While 65% of YearDeath patients were seniors, only 18.5% of emergency encounter patients are seniors 

incomeRatio = emergency[emergency['LOWINCOME']==1].shape[0]/emergency.shape[0]  #41.4%
incomeMort = mortality[mortality['LOWINCOME']==1].shape[0]/mortality.shape[0]   #40%

#6.2% of encounters are Asian, while 20% of mortalities were Asian
asianRatio = emergency[emergency['ASIAN']==1].shape[0]/emergency.shape[0]       #6.2%
asianMort = mortality[mortality['ASIAN']==1].shape[0]/mortality.shape[0]        #20%

#8.7% of encounters were black, yet 0% of 1 year discharge mortalities were black
blackRatio = emergency[emergency['BLACK']==1].shape[0]/emergency.shape[0]       #8.7%
blackMort = mortality[mortality['BLACK']==1].shape[0]/mortality.shape[0]        #0%

#90.4% of encounters were minority and even more (95%) of mortalities were minority
minorityRatio = emergency[emergency['MINORITY']==1].shape[0]/emergency.shape[0] #90.4%
minorityMort = mortality[mortality['MINORITY']==1].shape[0]/mortality.shape[0]  #95%


## 2.4 Characteristics of patients who passed away w/in 30 days of discharge

In [41]:
mortality = emergency[emergency['DEATH30']==1]

#Similar proportion of mortalities to population of Hispanics
hispanicMort = mortality[mortality['HISPANIC']==1].shape[0]/mortality.shape[0]  #75% of patients who passed away w/in 1 year of discharge were hispanic
hispanicRatio = emergency[emergency['HISPANIC']==1].shape[0]/emergency.shape[0] #75.5%

#While 18.5% of encounters were with seniors, 100% of 30day post discharge mortalities were seniors
seniorMort = mortality[mortality['SENIOR']==1].shape[0]/mortality.shape[0]      #100% 
seniorRatio = emergency[emergency['SENIOR']==1].shape[0]/emergency.shape[0]     #18.5%

#More than expected low income encounters passed away within 30 days of discharge
incomeRatio = emergency[emergency['LOWINCOME']==1].shape[0]/emergency.shape[0]  #41.4%
incomeMort = mortality[mortality['LOWINCOME']==1].shape[0]/mortality.shape[0]   #50%

#More asian patient mortalities than expected (6.2% vs 25%)
asianRatio = emergency[emergency['ASIAN']==1].shape[0]/emergency.shape[0]       #6.2%
asianMort = mortality[mortality['ASIAN']==1].shape[0]/mortality.shape[0]        #25%

#Less black patient mortalities than expected (8.7% vs 0%)
blackRatio = emergency[emergency['BLACK']==1].shape[0]/emergency.shape[0]       #8.7%
blackMort = mortality[mortality['BLACK']==1].shape[0]/mortality.shape[0]        #0%

#All 30-day post discharge mortalities were minorities
minorityRatio = emergency[emergency['MINORITY']==1].shape[0]/emergency.shape[0] #90.4%
minorityMort = mortality[mortality['MINORITY']==1].shape[0]/mortality.shape[0]  #100%


## 2.5 Characteristics of patients who passed away w/in 60 days of discharge

In [44]:
mortality = emergency[emergency['DEATH60']==1]

hispanicMort = mortality[mortality['HISPANIC']==1].shape[0]/mortality.shape[0]  #75% of patients who passed away w/in 1 year of discharge were hispanic
hispanicRatio = emergency[emergency['HISPANIC']==1].shape[0]/emergency.shape[0] #75.5%

#100% of 60day post discharge mortalitis were seniors, they only make up 18.5% of the encounters 
seniorMort = mortality[mortality['SENIOR']==1].shape[0]/mortality.shape[0]      #100% of patients who passed away w/in 1 year of discharge were seniors
seniorRatio = emergency[emergency['SENIOR']==1].shape[0]/emergency.shape[0]     #While 18.5% of YearDeath patients were seniors, only 31.3% of patients are seniors 

incomeRatio = emergency[emergency['LOWINCOME']==1].shape[0]/emergency.shape[0]  #41.4%
incomeMort = mortality[mortality['LOWINCOME']==1].shape[0]/mortality.shape[0]   #50%

#More asian patient mortalities than expected
asianRatio = emergency[emergency['ASIAN']==1].shape[0]/emergency.shape[0]       #6.2%
asianMort = mortality[mortality['ASIAN']==1].shape[0]/mortality.shape[0]        #25%

#Less black patient mortalities than expected
blackRatio = emergency[emergency['BLACK']==1].shape[0]/emergency.shape[0]       #8.7%
blackMort = mortality[mortality['BLACK']==1].shape[0]/mortality.shape[0]        #0% 

#All 60-day mortalities were minorities
minorityRatio = emergency[emergency['MINORITY']==1].shape[0]/emergency.shape[0] #90.4%
minorityMort = mortality[mortality['MINORITY']==1].shape[0]/mortality.shape[0]  #100%



## 3. Extra Information (based off of entire dataset (2008-16), not just emergencies)

In [52]:
encounters = pd.read_csv("../data/encounters.csv")
encounters = encounters.drop('Unnamed: 0', axis=1)

#Clean encounters that are too old 
encounters = encounters[encounters['DATE'].between('2008', '2017')]
encounters = encounters[encounters['CODE']!=308646001]

#last only has most recent encounters for each patient
last = encounters.drop_duplicates("PATIENT",keep='last')
last['DEATHDATE'] = last['PATIENT'].map(lambda x: patientsCSV[patientsCSV["ID"]==x]['DEATHDATE'].values[0])

#Maps date of death for each patient to their encounter
encounters['DEATHDATE'] = encounters['ID'].map(lambda x: last.loc[last['ID']==x]['DEATHDATE'].values[0] if x in last['ID'].unique() else 0)

#I noticed encounters.CSV was missing a lot of reason descriptions that conditions.CSV had,
#so this for loop fills in missing information
for index,row in encounters.iterrows():
    if isinstance(row['REASONDESCRIPTION'],float):                        #only applied to encounters that are not filled. will not refill a cell. 
        date = row['DATE']
        find = conditionsCSV[conditionsCSV['PATIENT'] == row['PATIENT']]  #find has all of rows that pertain to patient and reason narrows it to the particular encounter
        reason = find[find['START'] == date]['DESCRIPTION']               #assuming someone doesn't visit the hospital in the same day for a different reason
        code = find[find['START'] == date]['CODE']                        #also assuming the hospital records all visit and didn't visit double visits if it occured
        try:
            encounters['REASONDESCRIPTION'][index] = reason.values[0]
            encounters['REASONCODE'][index] = code.values[0]
        except:
            encounters['REASONDESCRIPTION'][index] = np.nan
            encounters['REASONCODE'][index] = np.nan
            
#Converted to date_time in order to do calculations on the dates
encounters['DATE']=pd.to_datetime(encounters['DATE'])
encounters['DEATHDATE']=pd.to_datetime(encounters['DEATHDATE'])

#Finds amount of time between emergency visit and death
encounters['DEATHDIFFERENCE'] = encounters['DEATHDATE']-encounters['DATE']
#Dummy variable for all patients who passed away 1 year within visit
encounters['YearDeath'] = encounters['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=365) else 0)
encounters.loc[(encounters['DEATHDIFFERENCE'] < datetime.timedelta(days=0)),'DEATHDIFFERENCE'] = np.nan
#DEATH is a dummy variable for deaths post discharge, this removes deaths that happen on day 0 since that wouldn't be considered post discharge.
encounters['DEATH'] = encounters['DEATHDIFFERENCE'].map(lambda x: 0 if x == datetime.timedelta(days=0) or x!=x else 1)
#I think this is more indicative of mortality related to post-discharge. 
encounters['DEATH100'] = encounters['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=100) else 0)
encounters['DEATH60']  = encounters['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=60) else 0)
encounters['DEATH30']  = encounters['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=30) else 0)

LOW_LINE = 56763.2 #low income threshole
#Adding patient information for each encounter. For regression and analysis
encounters['RACE'] = encounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['RACE'].values[0])
encounters['AGE'] = encounters['PATIENT'].map(lambda x: int(patientsCSV.loc[patientsCSV['ID']==x]['AGE'].values[0]))
encounters['HOMEZIP'] = encounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['HOMEZIP'].values[0])
encounters['INCOME'] = encounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['INCOME'].values[0])

#Dummy variable for if patient is considered low income. Low income is 80% of state median
encounters['LOWINCOME'] = encounters['INCOME'].map(lambda x: 1 if x < LOW_LINE else 0)
#Replacing NA was necessary for the above lambda fxn, however NA is necessary for regression since 0 will affect the regression
encounters['INCOME'] = encounters['INCOME'].replace(-1,np.nan)

encounters['SENIOR'] = encounters['AGE'].map(lambda x: 1 if x>=65 else 0)
encounters['MINORITY'] = encounters['RACE'].map(lambda x: 1 if x=='asian' or x=='hispanic' or x=='black' or x=='black or african american' else 0)

#Matches patient BMI to patient and encounter
BMI = observationsCSV[observationsCSV['DESCRIPTION']=='Body Mass Index']
BMI["VALUE"] = BMI['VALUE'].astype(float)
patients = BMI['PATIENT'].unique() 
encounters['BMI'] = encounters['PATIENT'].map(lambda x: BMI[BMI['PATIENT']==x]['VALUE'].values[0] if x in patients else -1)
                                                         
#Homeless dummy variable
homeless = observationsCSV[observationsCSV['DESCRIPTION']=='Housing status']
patients = homeless['PATIENT'].unique()
encounters['HOMELESS'] = encounters['PATIENT'].map(lambda x: 1 if x in patients else 0)        #Patients with a 'Housing status' description were all homeless

#Variable for Diastolic Blood Pressure
DBP = observationsCSV[observationsCSV['DESCRIPTION']=='Diastolic Blood Pressure']
patient = DBP['PATIENT'].unique()
encounters['DIASTOLICBP'] = encounters['PATIENT'].map(lambda x: float(DBP[DBP['PATIENT']==x]['VALUE'].values[0]) if x in patient else -1)
encounters['BADBP'] = encounters['DIASTOLICBP'].map(lambda x: 1 if x > 80 else 0)

#Variable for Calcium levels
Calcium = observationsCSV[observationsCSV['DESCRIPTION']=='Calcium']
patient = Calcium['PATIENT'].unique()
encounters['CALCIUM'] = encounters['PATIENT'].map(lambda x: float(Calcium[Calcium['PATIENT']==x]['VALUE'].values[0]) if x in patient else -1)

#Finds patients who got the flu shot within 6 months of their most recent visit to the hospital. 
encounters['FLU']=None        #1 if patient recieved flu shot w/in 6 months of visit, 0 otherwise
encounters['FLU_DIFF']=None   #difference between time of flu shot and time of visit, if 0 days then they visited the hospital for the flu shot/happened to get it

flu = immunizationsCSV[immunizationsCSV['CODE'] == 140]   #CODE 140 is the code for influenza immunization                                                     
patients = flu['PATIENT'].unique()

for index, row in encounters.iterrows():       
    checked = 0     #keeps track of if the inner for loop found a matching immunization
    patientID = row['PATIENT']
    if patientID in patients:    #if patient is in the flu dataset, they had an immunization
        date = flu[flu['PATIENT']==patientID]['DATE']   #list of all dates they recieved influenza shot
        for d in date:
            d = datetime.datetime.strptime(d, '%Y-%m-%d')
            diff = row['DATE'] - d
            if datetime.timedelta(days=0) <= diff < datetime.timedelta(days=183):    #an influenza shot lasts around 6 months/183 days
                checked = 1
                encounters['FLU'][index]=1
                encounters['FLU_DIFF'][index]=diff    #can help us determine cause of visit if there is none (i.e. if =0 days, then the visit was for the shot)
        if checked == 0:
            encounters['FLU'][index]=0
            encounters['FLU_DIFF'][index]=-1
    else:
            encounters['FLU'][index]=0
            encounters['FLU_DIFF'][index]=-1
            
#I found that a lot of the encounter dates in encounters.csv coincided with 
#dates (found in immunizations.csv) that the patient got a flu shot. They all 
#coincided with Outpatient Encounters without ReasonDescriptions. So I assume 
#the reason they went was for the flu shot and not some hidden sickness/emergency

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [51]:
encounters.to_csv("En.csv")

#### 46% of senior patients who passed away w/in 30 days of their visit did not visit for care. They visited for a flu shot.
#### 39% of senior patients who passed away w/in 30 days of their visit passed away due to Pneumonia/Viral Illness.

In [56]:
seniorPatients = encounters[encounters['SENIOR']==1]
seniorPatients = seniorPatients.drop_duplicates('PATIENT',keep='last')
totalSeniors = seniorPatients.shape[0]
Death30 = seniorPatients[seniorPatients['DEATH30']==1]

print(str(Death30[Death30['FLU_DIFF']==datetime.timedelta(days=0)].shape[0]/Death30.shape[0] * 100) + "% of senior patients who passed away within 30 days of their hospital visit visited the hospital for a flu shot")
print(str((Death30[Death30['REASONDESCRIPTION']=='Pneumonia'].shape[0]+Death30[Death30['REASONDESCRIPTION']=='Viral sinusitis (disorder)'].shape[0])/Death30.shape[0]*100) + "% of senior patients who passed away within 30 days of visit, visited for Pneumonia/Viral Illness")

Death30







46.15384615384615% of senior patients who passed away within 30 days of their hospital visit visited the hospital for a flu shot
38.46153846153847% of senior patients who passed away within 30 days of visit, visited for Pneumonia/Viral Illness


Unnamed: 0,ID,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION,DEATHDATE,DEATHDIFFERENCE,YearDeath,...,LOWINCOME,SENIOR,MINORITY,BMI,HOMELESS,DIASTOLICBP,BADBP,CALCIUM,FLU,FLU_DIFF
669,46b94251-1039-42a9-ae54-3ca75f25e75c,2012-07-07,c383f814-9ee4-4ca8-a0ff-ee1369f8d2ee,185349003,Outpatient Encounter,,,2012-08-02,26 days,1,...,0,1,1,37.4,0,72.0,0,-1.0,1,0 days 00:00:00
1924,d8ed4921-783c-4e10-bbc9-e8dbd5913c20,2013-04-09,6d66c6a6-8b0d-47c0-a573-b78ec8ceec63,185349003,Outpatient Encounter,,,2013-05-01,22 days,1,...,0,1,1,39.06,0,116.0,1,9.45,1,0 days 00:00:00
2149,b297ac62-23e4-4fe7-8f72-8277722eaf52,2009-02-25,481373a7-79df-429d-b3da-e971116c1df1,185349003,Outpatient Encounter,,,2009-03-18,21 days,1,...,1,1,1,30.24,0,111.0,1,10.09,1,0 days 00:00:00
3407,ca457bec-18f4-4ce0-81f9-0800b346428d,2016-07-13,bc6fbe62-116e-424f-943c-bae29fa9f319,185345009,Encounter for symptom,444814009.0,Viral sinusitis (disorder),2016-07-22,9 days,1,...,1,1,0,36.44,0,114.0,1,9.19,0,-1
4180,b8a9dcac-1b61-49ae-9e06-4c793be6e29d,2009-04-16,bc2f58a2-ea29-45e7-93c4-99c03cbfcc4c,185349003,Outpatient Encounter,,,2009-05-11,25 days,1,...,0,1,0,35.59,0,111.0,1,9.51,1,0 days 00:00:00
5761,508aebb9-5b6c-42fc-990d-3f399b7ba386,2011-11-02,90e7f959-f0ec-4e9e-bce8-ca0a7d0e6a1f,185349003,Outpatient Encounter,,,2011-11-12,10 days,1,...,0,1,1,31.78,0,75.0,0,9.3,1,0 days 00:00:00
6063,e7d1b9ab-a5c4-4d0b-ac86-f6582496195e,2014-01-25,cdbe1927-b954-4f37-9294-ac0ca277d147,34285007,Hospital admission,233604007.0,Pneumonia,2014-02-06,12 days,1,...,1,1,1,25.15,0,80.0,0,8.65,1,121 days 00:00:00
7120,977f504f-1ec5-45c7-aa16-b0010a62745f,2015-02-02,8b3cf0c5-affd-4cbb-a397-e63484ba4d47,34285007,Hospital admission,233604007.0,Pneumonia,2015-02-15,13 days,1,...,0,1,1,37.09,0,97.0,1,9.27,1,101 days 00:00:00
10994,bb71193a-8aa3-499a-8d18-223e50978588,2014-09-22,d871e7f4-b18d-4e4a-8fbb-f1d2c101efc5,185349003,Outpatient Encounter,,,2014-10-01,9 days,1,...,0,1,1,25.89,0,71.0,0,-1.0,1,0 days 00:00:00
11942,beb50d63-b424-4b33-8315-9c1e4a7ad479,2016-07-02,e6e63adb-9c2a-4f7a-ade9-3b89ff862a03,34285007,Hospital admission,233604007.0,Pneumonia,2016-07-10,8 days,1,...,0,1,1,35.13,0,111.0,1,9.04,0,-1


#### 57% of senior patients who passed away w/in 60 days of their hospital visit visited the hospital for a flu shot. An additional 32% passed away after visiting for Pneumonia/Viral Sinusitis.

In [59]:
Death60 = seniorPatients[seniorPatients['DEATH60']==1]

print(str(Death60[Death60['FLU_DIFF']==datetime.timedelta(days=0)].shape[0]/Death60.shape[0] * 100) + "% of senior patients who passed away within 60 days of their hospital visit visited the hospital for a flu shot")
print(str(6/Death60.shape[0]*100) +"% of senior patients passed away after visiting the hospital for a virus")
Death60

57.89473684210527% of senior patients who passed away within 60 days of their hospital visit visited the hospital for a flu shot
31.57894736842105% of senior patients passed away after visiting the hospital for a virus


Unnamed: 0,ID,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION,DEATHDATE,DEATHDIFFERENCE,YearDeath,...,LOWINCOME,SENIOR,MINORITY,BMI,HOMELESS,DIASTOLICBP,BADBP,CALCIUM,FLU,FLU_DIFF
669,46b94251-1039-42a9-ae54-3ca75f25e75c,2012-07-07,c383f814-9ee4-4ca8-a0ff-ee1369f8d2ee,185349003,Outpatient Encounter,,,2012-08-02,26 days,1,...,0,1,1,37.4,0,72.0,0,-1.0,1,0 days 00:00:00
1924,d8ed4921-783c-4e10-bbc9-e8dbd5913c20,2013-04-09,6d66c6a6-8b0d-47c0-a573-b78ec8ceec63,185349003,Outpatient Encounter,,,2013-05-01,22 days,1,...,0,1,1,39.06,0,116.0,1,9.45,1,0 days 00:00:00
2149,b297ac62-23e4-4fe7-8f72-8277722eaf52,2009-02-25,481373a7-79df-429d-b3da-e971116c1df1,185349003,Outpatient Encounter,,,2009-03-18,21 days,1,...,1,1,1,30.24,0,111.0,1,10.09,1,0 days 00:00:00
3334,b57f2d3d-e9fc-4d21-be1e-e9b2f84fc714,2014-06-15,18a225df-378e-419a-8aff-bc03ab654103,185349003,Outpatient Encounter,,,2014-07-26,41 days,1,...,0,1,0,25.35,0,87.0,1,8.55,1,0 days 00:00:00
3407,ca457bec-18f4-4ce0-81f9-0800b346428d,2016-07-13,bc6fbe62-116e-424f-943c-bae29fa9f319,185345009,Encounter for symptom,444814009.0,Viral sinusitis (disorder),2016-07-22,9 days,1,...,1,1,0,36.44,0,114.0,1,9.19,0,-1
4180,b8a9dcac-1b61-49ae-9e06-4c793be6e29d,2009-04-16,bc2f58a2-ea29-45e7-93c4-99c03cbfcc4c,185349003,Outpatient Encounter,,,2009-05-11,25 days,1,...,0,1,0,35.59,0,111.0,1,9.51,1,0 days 00:00:00
4530,12b94e93-09fe-4dcc-98d0-ac67ce07e674,2009-04-03,a77d3e00-6f1a-4bfd-9333-3439be45b15f,185349003,Outpatient Encounter,,,2009-05-15,42 days,1,...,0,1,1,31.78,0,86.0,1,-1.0,1,0 days 00:00:00
5761,508aebb9-5b6c-42fc-990d-3f399b7ba386,2011-11-02,90e7f959-f0ec-4e9e-bce8-ca0a7d0e6a1f,185349003,Outpatient Encounter,,,2011-11-12,10 days,1,...,0,1,1,31.78,0,75.0,0,9.3,1,0 days 00:00:00
6063,e7d1b9ab-a5c4-4d0b-ac86-f6582496195e,2014-01-25,cdbe1927-b954-4f37-9294-ac0ca277d147,34285007,Hospital admission,233604007.0,Pneumonia,2014-02-06,12 days,1,...,1,1,1,25.15,0,80.0,0,8.65,1,121 days 00:00:00
6821,04e0b540-cfaf-4875-895f-8cdb3235a48e,2008-06-20,6ddea441-48d4-4ef1-8213-04ca25c78497,185349003,Outpatient Encounter,,,2008-07-24,34 days,1,...,0,1,1,27.58,0,88.0,1,-1.0,1,0 days 00:00:00


#### Finding areas/zipcodes that might benefit most/first from the program

In [501]:
yeardeath = encounters[encounters['DEATH']==1]
yeardeath = yeardeath.drop_duplicates('PATIENT',keep='last')

locations = yeardeath.groupby('HOMEZIP').count()
locations.sort_values('PATIENT').tail(10)

emergency_drop = encounters.drop_duplicates('PATIENT', keep='last')

m = emergency_drop[emergency_drop['MINORITY']==1]
m = m.groupby("HOMEZIP").count()
top = m.sort_values('PATIENT').tail(10)

#### 83.6% of YearDeath==1 seniors were not w/in the healthy BMI range for seniors. 

In [60]:
yearSeniors = seniorPatients[seniorPatients['YearDeath']==1]

totalYrSeniors = yearSeniors.shape[0]
under = yearSeniors[yearSeniors['BMI']<25].shape[0]
over =yearSeniors[yearSeniors['BMI']>27].shape[0]
print(str((under+over)/totalYrSeniors *100) +"% of 1 year mortality senior patients were not within the healthy BMI range for seniors")
print(str(under/totalYrSeniors*100)+"% of 1 year mortality senior patients were under the healthy BMI range for seniors")
print(str(over/totalYrSeniors*100)+"% of 1 year mortality senior patients were over the healthy BMI range for seniors")


yearSeniors = seniorPatients[seniorPatients['YearDeath']==1]

totalYrSeniors = yearSeniors.shape[0]
under = yearSeniors[yearSeniors['BMI']<25].shape[0]
over =yearSeniors[yearSeniors['BMI']>27].shape[0]
print(str((under+over)/totalYrSeniors *100) +"% of 1 year mortality senior patients were not within the healthy BMI range for seniors")
print(str(under/totalYrSeniors*100)+"% of 1 year mortality senior patients were under the healthy BMI range for seniors")
print(str(over/totalYrSeniors*100)+"% of 1 year mortality senior patients were over the healthy BMI range for seniors")




83.56164383561644% of 1 year mortality senior patients were not within the healthy BMI range for seniors
17.80821917808219% of 1 year mortality senior patients were under the healthy BMI range for seniors
65.75342465753424% of 1 year mortality senior patients were over the healthy BMI range for seniors
83.56164383561644% of 1 year mortality senior patients were not within the healthy BMI range for seniors
17.80821917808219% of 1 year mortality senior patients were under the healthy BMI range for seniors
65.75342465753424% of 1 year mortality senior patients were over the healthy BMI range for seniors


#### 66.9% of YearDeath==1 seniors did not have normal diastolic blood pressure.

In [62]:
print(str(yearSeniors[yearSeniors['BADBP']==1].shape[0]/totalYrSeniors*100)+"% of 1 year mortality seniors did not have normal diastolic blood pressure")
print(str(seniorPatients[seniorPatients['BADBP']==1].shape[0]/totalSeniors*100)+"% of the entire senior patient population does not have normal diastolic blood pressure")
print(str(yearSeniors[yearSeniors['DIASTOLICBP']>=90].shape[0]/totalYrSeniors*100) +"% of 1 year mortality seniors had high diastolic blood pressure")
print(str(seniorPatients[seniorPatients['DIASTOLICBP']>=90].shape[0]/totalSeniors*100) +"% of the entire senior patient population has high diastolic blood pressure")









64.38356164383562% of 1 year mortality seniors did not have normal diastolic blood pressure
62.447257383966246% of the entire senior patient population does not have normal diastolic blood pressure
35.61643835616438% of 1 year mortality seniors had high diastolic blood pressure
33.755274261603375% of the entire senior patient population has high diastolic blood pressure


#### 11/15 YearDeaths for minority, senior patients were related to curable viruses or injuries sustained from falling.

In [70]:
e = encounters[encounters['YearDeath']==1]
g = e[e['MINORITY']==1]
g = g[g['SENIOR']==1]
h = g.drop_duplicates('PATIENT',keep='last')
g = h.groupby('REASONDESCRIPTION').count()

print("8/14 of known reasons for passing was due to curable viruses and sicknesses")
print("3/14 of known reasons were related to injuries from falling")

8/14 of known reasons for passing was due to curable viruses and sicknesses
3/14 of known reasons were related to injuries from falling


#### The most recent hospital visit for 9/16 senior, minority, patients who passed away w/in 60 days of their visits went to the hospital for a flu shot

In [74]:
e = encounters[encounters['DEATH60']==1]
g = e[e['MINORITY']==1]
g = g[g['SENIOR']==1]
h = g.drop_duplicates('PATIENT',keep='last')
h.groupby('REASONDESCRIPTION').count()
print("9/16 senior, minority, patients who passed away w/in 30 days visited the hospital for flu shots")

9/16 senior, minority, patients who passed away w/in 30 days visited the hospital for flu shots


#### 92.8% of Outpatient Encounters were influenza immunizations/included an influenza immunization

In [76]:
e_drop = encounters.drop_duplicates("PATIENT", keep='last')
outpatient = e_drop[e_drop['CODE']==185349003]
outFluSize = outpatient[outpatient["FLU_DIFF"]==datetime.timedelta(days=0)].shape[0]
print(str(outFluSize/outpatient.shape[0]*100)+'% of Outpatient Encounters were/included an influenza immunization')

92.81767955801105% of Outpatient Encounters were/included an influenza immunization


#### Avg. Household Income for a Homeless Patient = 76,309.20. That's 34.4% more than the low income threshold.

In [95]:
homeless = observationsCSV[observationsCSV['DESCRIPTION']=='Housing status']

homeless = homeless.drop_duplicates('PATIENT', keep='last')
homeless['INCOME'] = homeless['PATIENT'].map(lambda x: patientsCSV[patientsCSV['ID']==x]["INCOME"].values[0])
avg = homeless['INCOME'].sum()/25


print('The average estimated household income for a homeless patient is $' +str(avg) +' which is ' +str((avg-LOW_LINE)/LOW_LINE*100)+"% more than the low income threshold")

                                                        
                                                                   
                                                                   

The average estimated household income for a homeless patient is $76309.28 which is 34.43442230177298% more than the low income threshold


Unnamed: 0.1,Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,VALUE,UNITS,INCOME
202,202,2013-12-16,96b24072-e1fe-49cd-a22a-6dfb92c3994c,ab703308-e57a-4aa9-b8ef-d0fb07b60ab4,71802-3,Housing status,Patient is homeless,{nominal},102219
823,823,2014-07-04,38364c57-80ce-4749-aed6-878cdff95379,ddde7e19-206f-455f-99c0-98f73def8965,71802-3,Housing status,Patient is homeless,{nominal},60755
1465,1465,2007-11-04,abf99602-7cb7-49db-9251-0499968c472f,fa874ebb-3cf7-4140-a168-806c98720029,71802-3,Housing status,Patient is homeless,{nominal},-1
3986,3986,2008-05-20,9943efb3-15d2-4225-ba54-0cb4d4478f6a,a1b7f3f5-cbac-4b0a-a371-65f2ec0a4f1f,71802-3,Housing status,Patient is homeless,{nominal},102301
4256,4256,2015-02-21,41ec5505-df3f-4e0c-9e4c-2cc7e152031e,931ff5a5-7e2b-4f98-8f26-23aa8584b9af,71802-3,Housing status,Patient is homeless,{nominal},-1
10881,10881,2013-04-16,63b2d7b3-c597-4ede-b477-9b0515799cc8,eea684d8-febf-4099-8533-a56663c87960,71802-3,Housing status,Patient is homeless,{nominal},56714
12216,12216,2010-12-29,bc8af009-ec20-409e-813d-fba6a952a2ab,c2ae4110-443b-4cca-baf4-775ffd4689be,71802-3,Housing status,Patient is homeless,{nominal},-1
16128,16128,2016-11-29,24de5840-c471-4436-93ef-3e5b3e905353,691a9847-3c9e-457b-9ac6-6bf1874898bc,71802-3,Housing status,Patient is homeless,{nominal},47115
16216,16216,2009-07-28,d2e9efc1-431e-4736-8823-e86c16dcb141,c93595bd-c911-4c10-8255-4466b34b4b78,71802-3,Housing status,Patient is homeless,{nominal},-1
17213,17213,2011-11-24,af9cd97a-f11a-469a-8781-84e88ac83774,bbc7bd6d-3897-4f93-a1e0-abfa645ec208,71802-3,Housing status,Patient is homeless,{nominal},108815


In [508]:
emergency['BMIUp'] = emergency['BMI'].map(lambda x: 1 if x > 27 else 0)
emergency['BMIDown'] = emergency['BMI'].map(lambda x: 1 if x < 25 else 0)
emergency['BMIOverall'] = emergency['BMI'].map(lambda x: 1 if x>27 or x<25 else 0)


I am a computer science major and have experience coding in Python and Javascript and using Stata. I am currently in Econometrics and becoming better at analyzing data with statistics and equations (vs. how I usually analyze data by questioning it, comparing it to the world around me, sometimes following my gut, asking for outside opinions...)