# DATASET for Graph

In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
#import all datasets
allergiesCSV = pd.read_csv("../data/allergies.csv")
observationsCSV = pd.read_csv("../data/observations.csv")
acsCSV = pd.read_csv("../data/ACS.csv")
careplansCSV = pd.read_csv("../data/careplans.csv")
encountersCSV = pd.read_csv("../data/encounters.csv")
immunizationsCSV = pd.read_csv("../data/immunizations.csv")
procedutesCSV = pd.read_csv("../data/procedures.csv")
zipCSV = pd.read_csv("../data/zip_to_zcta_2019.csv")
conditionsCSV = pd.read_csv("../data/conditions.csv")
medicationsCSV = pd.read_csv("../data/medications.csv")
patientsCSV = pd.read_csv("../data/patients.csv")

In [4]:
#all data from MA, so cleaning for necessary data
massZipCSV = zipCSV[zipCSV.STATE == "MA"]                                 #contains only MA ZCTA 
zipAndIncome = massZipCSV.merge(acsCSV,left_on='ZCTA', right_on='GEO.id2')
zipAndIncome = zipAndIncome.replace(['2,500-',"***",'-','**'],-1)         #necessary for converting to int so it can operated on


In [5]:
zipDrop = ['ZIP_CODE','PO_NAME','STATE','ZIP_TYPE','zip_join_type','GEO.id','GEO.id2','GEO.display-label','HC01_EST_VC13','HC01_MOE_VC13','HC02_EST_VC13','HC02_MOE_VC13','HC03_EST_VC13','HC03_MOE_VC13','HC04_EST_VC13','HC04_MOE_VC13']
zaiMerge = zipAndIncome.copy().drop(zipDrop,axis=1)

#Values are imported as strings. Converted to ints to run regression
zaiMerge['HouseholdIncome'] = zipAndIncome['HC01_EST_VC13'].astype(int)
zaiMerge['HouseholdMOE'] = zipAndIncome['HC01_MOE_VC13'].astype(int)       #zaiMerge is a table with only ZCTA, Household Median Income, & its MOE

#zaiMerge['Family'] = zipAndIncome[['HC02_EST_VC13','HC02_MOE_VC13']].values.tolist()
#zaiMerge['Married'] = zipAndIncome[['HC03_EST_VC13','HC03_MOE_VC13']].values.tolist()
#zaiMerge['Nonfamily'] = zipAndIncome[['HC04_EST_VC13','HC04_MOE_VC13']].values.tolist()  #could be interesting for later use


In [6]:
patientsCSV = patientsCSV.drop(['Unnamed: 0', 'SSN','DRIVERS','PASSPORT','PREFIX','FIRST', 'LAST', 'SUFFIX', 'MAIDEN'],axis=1)    #removed data to clean data and make it visually easier to understand and to ensure privacy
patientsCSV['HOMEZIP'] = patientsCSV['ADDRESS'].map(lambda x:'0'+str(x)[-7:-3])    #lambda fxn grabs zip code. This works since the format an address is fixed

#maps each cell to respective income through matching zipcodes
patientsCSV['INCOME'] = patientsCSV['HOMEZIP'].map(lambda x: -1 if x =='0' or x not in zaiMerge['ZCTA'].unique() else zaiMerge.loc[zaiMerge['ZCTA']== x]['HouseholdIncome'].values[0])
patientsCSV['MOE'] = patientsCSV['HOMEZIP'].map(lambda x: -1 if x =='0' or x not in zaiMerge['ZCTA'].unique() else zaiMerge.loc[zaiMerge['ZCTA']== x]['HouseholdMOE'].values[0])

patientsCSV['DEATHDATE'] = patientsCSV['DEATHDATE'].fillna(0) 



In [26]:
emergency = pd.read_csv("../data/encounters.csv")
emergency = emergency.drop('Unnamed: 0', axis=1)
emergency = emergency[emergency['DATE'].between('2008', '2017')]
emergency = emergency[emergency['CODE']!=308646001]
last = emergency.drop_duplicates("PATIENT",keep='last')
last['DEATHDATE'] = last['PATIENT'].map(lambda x: patientsCSV[patientsCSV["ID"]==x]['DEATHDATE'].values[0])

emergency['RACE'] = emergency['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['RACE'].values[0])

#Maps date of death for each patient to their encounter
emergency['DEATHDATE'] = emergency['ID'].map(lambda x: last.loc[last['ID']==x]['DEATHDATE'].values[0] if x in last['ID'].unique() else 0)

#Create dummy variable to use for regression
emergency['DEATH'] = emergency['DEATHDATE'].map(lambda x: 0 if x == 0 else 1)

#I noticed encounters.CSV was missing a lot of reason descriptions that conditions.CSV had,
#so this for loop fills in missing information
for index,row in emergency.iterrows():
    if isinstance(row['REASONDESCRIPTION'],float):                        #only applied to encounters that are not filled. will not refill a cell. 
        date = row['DATE']
        find = conditionsCSV[conditionsCSV['PATIENT'] == row['PATIENT']]  #find has all of rows that pertain to patient and reason narrows it to the particular encounter
        reason = find[find['START'] == date]['DESCRIPTION']               #assuming someone doesn't visit the hospital in the same day for a different reason
        code = find[find['START'] == date]['CODE']                        #also assuming the hospital records all visit and didn't visit double visits if it occured
        try:
            emergency['REASONDESCRIPTION'][index] = reason.values[0]
            emergency['REASONCODE'][index] = code.values[0]
        except:
            emergency['REASONDESCRIPTION'][index] = np.nan
            emergency['REASONCODE'][index] = np.nan




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.p

In [27]:

#Converted to date_time in order to do calculations on the dates
emergency['DATE']=pd.to_datetime(emergency['DATE'])
emergency['DEATHDATE']=pd.to_datetime(emergency['DEATHDATE'])

#Finds amount of time between emergency visit and death
emergency['DEATHDIFFERENCE'] = emergency['DEATHDATE']-emergency['DATE']
emergency.loc[(emergency['DEATHDIFFERENCE'] < datetime.timedelta(days=0)),'DEATHDIFFERENCE'] = np.nan
#Dummy variable for all patients who passed away 1 year within visit
emergency['YearDeath'] = emergency['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=365) else 0)


In [28]:
LOW_LINE = 62533.6
#Adding patient information for each encounter. For regression and analysis
emergency['AGE'] = emergency['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['AGE'].values[0])
emergency['HOMEZIP'] = emergency['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['HOMEZIP'].values[0])
emergency['INCOME'] = emergency['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['INCOME'].values[0])
emergency['MOE'] = emergency['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['MOE'].values[0])

#Dummy variable for if patient is considered low income. Low income is 80% of state median
emergency['LOWINCOME'] = emergency['INCOME'].map(lambda x: 1 if x < LOW_LINE and x==x else 0)
#Replacing NA was necessary for the above lambda fxn, however NA is necessary for regression since 0 will affect the regression
emergency['INCOME'] = emergency['INCOME'].replace(-1,np.nan)
emergency['SENIOR'] = emergency['AGE'].map(lambda x: 1 if x>=65 else 0)
emergency['GENDER'] = emergency['PATIENT'].map(lambda x: patientsCSV[patientsCSV['ID']==x]['GENDER'].values[0])



In [16]:
#Finds patients who got the flu shot within 6 months of their most recent visit to the hospital. 
emergency['FLU']=None        #1 if patient recieved flu shot w/in 6 months of visit, 0 otherwise
emergency['FLU_DIFF']=None   #difference between time of flu shot and time of visit, if 0 days then they visited the hospital for the flu shot/happened to get it

flu = immunizationsCSV[immunizationsCSV['CODE'] == 140]   #CODE 140 is the code for influenza immunization                                                     
patients = flu['PATIENT'].unique()

for index, row in emergency.iterrows():       
    checked = 0     #keeps track of if the inner for loop found a matching immunization
    patientID = row['PATIENT']
    if patientID in patients:    #if patient is in the flu dataset, they had an immunization
        date = flu[flu['PATIENT']==patientID]['DATE']   #list of all dates they recieved influenza shot
        for d in date:
            d = datetime.datetime.strptime(d, '%Y-%m-%d')
            diff = row['DATE'] - d
            if datetime.timedelta(days=0) <= diff < datetime.timedelta(days=183):    #an influenza shot lasts around 6 months/183 days
                checked = 1
                emergency['FLU'][index]=1
                emergency['FLU_DIFF'][index]=diff    #can help us determine cause of visit if there is none (i.e. if =0 days, then the visit was for the shot)
        if checked == 0:
            emergency['FLU'][index]=0
            emergency['FLU_DIFF'][index]=-1
    else:
            emergency['FLU'][index]=0
            emergency['FLU_DIFF'][index]=-1
            
#I found that a lot of the encounter dates in encounters.csv coincided with 
#dates (found in immunizations.csv) that the patient got a flu shot. They all 
#coincided with Outpatient Encounters without ReasonDescriptions. So I assume 
#the reason they went was for the flu shot and not some hidden sickness/emergency

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [29]:
emergency.to_csv('../data/graph1.csv')

# Extra Analysis 

In [27]:
care = careplansCSV.drop_duplicates("PATIENT", keep='last')
print(care.shape[0], conditionsCSV.shape[0])

1222 7040


In [49]:
#Creates dummy variable for patients who did/did not recieve flu immunization
flu = immunizationsCSV[immunizationsCSV['CODE'] == 140]
patients = flu['PATIENT'].unique()
emergency['FLU'] = emergency['PATIENT'].map(lambda x: 1 if x in patients else 0)
patientsCSV['FLU'] = patientsCSV['ID'].map(lambda x: 1 if x in patients else 0)

In [None]:
#Matches patient BMI to patient
BMI = observationsCSV[observationsCSV['DESCRIPTION']=='Body Mass Index'].astype(int)
patients = BMI['PATIENT'].unique() 
emergency['BMI'] = emergency['PATIENT'].map(lambda x: BMI[BMI['PATIENT']==x]['VALUE'].values[0] if x in patients else -1)
                                                         

In [29]:
#Homeless dummy variable
homeless = observationsCSV[observationsCSV['DESCRIPTION']=='Housing status']
patients = homeless['PATIENT'].unique()
emergency['HOMELESS'] = emergency['PATIENT'].map(lambda x: 1 if x in patients else 0)        #Patients with a 'Housing status' description were all homeless


In [30]:
emergency.to_csv('../data/EmergencyVisitsX.csv')

In [113]:
#Creates dummy variable for patients who did/did not recieve flu immunization
pneu = immunizationsCSV[immunizationsCSV['CODE'].isin([133,33])]
patients = pneu['PATIENT'].unique()
encountersCSV['PNEU'] = encountersCSV['PATIENT'].map(lambda x: 1 if x in patients else 0)
#Creates dummy variable for patients who did/did not recieve flu immunization
flu = immunizationsCSV[immunizationsCSV['CODE'] == 140]
patients = flu['PATIENT'].unique()
encountersCSV['FLU'] = encountersCSV['PATIENT'].map(lambda x: 1 if x in patients else 0)


In [222]:
encountersCSV = pd.read_csv("../data/encounters.csv")

encountersCSV = encountersCSV[encountersCSV['DATE'].between('2008', '2017')]
encountersCSV['DEATHDATE'] = encountersCSV['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['DEATHDATE'].values[0])

encountersCSV['DATE']=pd.to_datetime(encountersCSV['DATE'])
encountersCSV['DEATHDATE']=pd.to_datetime(encountersCSV['DEATHDATE'])

#Finds amount of time between emergency visit and death
encountersCSV['DEATHDIFFERENCE'] = encountersCSV['DEATHDATE']-encountersCSV['DATE']
encountersCSV.loc[(encountersCSV['DEATHDIFFERENCE'] < datetime.timedelta(days=0)),'DEATHDIFFERENCE'] = np.nan

encountersCSV['DEATH'] = encountersCSV['DEATHDIFFERENCE'].map(lambda x: 0 if x == datetime.timedelta(days=0) or x!=x else 1)

fatalEncounters = encountersCSV[encountersCSV['DEATH']==1]

fatalEncounters['AGE'] = fatalEncounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['AGE'].values[0])
fatalEncounters['RACE']=fatalEncounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['RACE'].values[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [223]:
for index,row in fatalEncounters.iterrows():
    if isinstance(row['REASONDESCRIPTION'],float):                        #only applied to encounters that are not filled. will not refill a cell. 
        date = row['DATE']
        find = conditionsCSV[conditionsCSV['PATIENT'] == row['PATIENT']]  #find has all of rows that pertain to patient and reason narrows it to the particular encounter
        reason = find[find['START'] == date]['DESCRIPTION']               #assuming someone doesn't visit the hospital in the same day for a different reason
        code = find[find['START'] == date]['CODE']                        #also assuming the hospital records all visit and didn't visit double visits if it occured
        try:
            fatalEncounters['REASONDESCRIPTION'][index] = reason.values[0]
            fatalEncounters['REASONCODE'][index] = code.values[0]
        except:
            fatalEncounters['REASONDESCRIPTION'][index] = np.nan
            fatalEncounters['REASONCODE'][index] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [224]:
#Matches patient BMI to patient
BMI = observationsCSV[observationsCSV['DESCRIPTION']=='Body Mass Index']
patients = BMI['PATIENT'].unique() 
fatalEncounters['BMI'] = fatalEncounters['PATIENT'].map(lambda x: float(BMI[BMI['PATIENT']==x]['VALUE'].values[0]) if x in patients else -1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [None]:
#Encounters is a dataset for all the most recent encounters

In [239]:
encounters = pd.read_csv("../data/encounters.csv")
encounters = encounters[encounters['DATE'].between('2008', '2017')]
encounters = encounters[encounters['DESCRIPTION']!='Death Certification']
encounters = encounters.drop_duplicates('PATIENT',keep='last')
encounters['DEATHDATE'] = encounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['DEATHDATE'].values[0])


In [240]:
#Converted to date_time in order to do calculations on the dates
encounters['DATE']=pd.to_datetime(encounters['DATE'])
encounters['DEATHDATE']=pd.to_datetime(encounters['DEATHDATE'])

#Finds amount of time between emergency visit and death
encounters['DEATHDIFFERENCE'] = encounters['DEATHDATE']-encounters['DATE']

#Dummy variable for all patients who passed away 1 year within visit
encounters['YearDeath'] = encounters['DEATHDIFFERENCE'].map(lambda x: 1 if datetime.timedelta(days=0)<x<=datetime.timedelta(days=365) else 0)

encounters['RACE'] = encounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['RACE'].values[0])

#Create dummy variables for race for regression
encounters['MINORITY'] = encounters['RACE'].map(lambda x: 1 if x == 'black or african american' or x=='black' or x == 'hispanic' or x == 'asian' else 0)


#Maps date of death for each patient to their encounter
encounters['DEATHDATE'] = encounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['DEATHDATE'].values[0])

#Create dummy variable to use for regression
encounters['DEATH'] = encounters['DEATHDIFFERENCE'].map(lambda x: 0 if x == datetime.timedelta(days=0) or x!=x else 1)

encounters.loc[(encounters['DEATHDIFFERENCE'] < datetime.timedelta(days=0)),'DEATHDIFFERENCE'] = np.nan
#I noticed encounters.CSV was missing a lot of reason descriptions that conditions.CSV had,
#so this for loop fills in missing information
for index,row in encounters.iterrows():
    if isinstance(row['REASONDESCRIPTION'],float):                        #only applied to encounters that are not filled. will not refill a cell. 
        date = row['DATE']
        find = conditionsCSV[conditionsCSV['PATIENT'] == row['PATIENT']]  #find has all of rows that pertain to patient and reason narrows it to the particular encounter
        reason = find[find['START'] == date]['DESCRIPTION']               #assuming someone doesn't visit the hospital in the same day for a different reason
        code = find[find['START'] == date]['CODE']                        #also assuming the hospital records all visit and didn't visit double visits if it occured
        try:
            encounters['REASONDESCRIPTION'][index] = reason.values[0]
            encounters['REASONCODE'][index] = code.values[0]
        except:
            encounters['REASONDESCRIPTION'][index] = np.nan
            encounters['REASONCODE'][index] = np.nan
            
LOW_LINE = 62533.6
#Adding patient information for each encounter. For regression and analysis
encounters['AGE'] = encounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['AGE'].values[0])
encounters['HOMEZIP'] = encounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['HOMEZIP'].values[0])
encounters['INCOME'] = encounters['PATIENT'].map(lambda x: patientsCSV.loc[patientsCSV['ID']==x]['INCOME'].values[0])

#Dummy variable for if patient is considered low income. Low income is 80% of state median
encounters['LOWINCOME'] = encounters['INCOME'].map(lambda x: 1 if x < LOW_LINE and x==x else 0)
#Replacing NA was necessary for the above lambda fxn, however NA is necessary for regression since 0 will affect the regression
encounters['INCOME'] = encounters['INCOME'].replace(-1,np.nan)
encounters['SENIOR'] = encounters['AGE'].map(lambda x: 1 if x>=65 else 0)

#The dataset encounters now only has the latest hospital visit of every patient. Regardless of emergency or not. 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [241]:
#Matches patient BMI to patient
BMI = observationsCSV[observationsCSV['DESCRIPTION']=='Body Mass Index']
patients = BMI['PATIENT'].unique() 
encounters['BMI'] = encounters['PATIENT'].map(lambda x: float(BMI[BMI['PATIENT']==x]['VALUE'].values[0]) if x in patients else -1)


In [289]:
#Finds patients who got the flu shot within 6 months of their most recent visit to the hospital. 
encounters['FLU']=None
encounters['FLU_DIFF']=None

flu = immunizationsCSV[immunizationsCSV['CODE'] == 140]

patients = flu['PATIENT'].unique()

for index, row in encounters.iterrows():       
    checked = 0
    patientID = row['PATIENT']
    if patientID in patients:    #if patient is in the immunizations dataset, they had an immunization
        date = flu[flu['PATIENT']==patientID]['DATE']
        for d in date:
            d = datetime.datetime.strptime(d, '%Y-%m-%d')
            diff = row['DATE'] - d
            if datetime.timedelta(days=0) <= diff < datetime.timedelta(days=183):
                checked = 1
                encounters['FLU'][index]=1
                encounters['FLU_DIFF'][index]=diff
        if checked == 0:
            encounters['FLU'][index]=0
            encounters['FLU_DIFF'][index]=-1
    else:
            encounters['FLU'][index]=0
            encounters['FLU_DIFF'][index]=-1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [292]:
g = encounters[encounters['SENIOR']==1]
g[g['DEATHDIFFERENCE']<=1]

Unnamed: 0.1,Unnamed: 0,ID,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION,DEATHDATE,DEATHDIFFERENCE,...,MINORITY,DEATH,AGE,HOMEZIP,INCOME,LOWINCOME,SENIOR,BMI,FLU,FLU_DIFF
53,53,2d2c4fc1-574c-4fbe-9fbc-bc50779d96f7,2011-04-07,31410948-38be-4990-be5e-a47ab44f33a1,185349003,Outpatient Encounter,,,3/25/12,353 days,...,0,1,96,01864,119933.0,0,1,30.67,1,0 days 00:00:00
106,106,c3ecf43e-51b9-4783-b303-4cf2a2307fc7,2016-02-05,2c884d0f-62a1-4371-becc-36a98cdc4f52,185349003,Outpatient Encounter,,,8/22/16,199 days,...,1,1,96,01604,50426.0,1,1,22.79,1,0 days 00:00:00
258,258,2217aeaa-3074-40e8-9ec3-9dbebfe9d098,2011-05-28,6166101c-e02f-4230-b6c1-533865470674,185349003,Outpatient Encounter,,,10/14/11,139 days,...,0,1,96,01104,28590.0,1,1,26.63,1,0 days 00:00:00
404,404,11f6cead-7101-41ef-bb49-fb09e487835e,2016-11-25,48074b70-4db4-4ab0-b9e8-361bd2ba6216,316744009,Office Visit,26929004.0,Alzheimer's disease (disorder),9/6/17,285 days,...,0,1,82,02714,,1,1,25.06,1,55 days 00:00:00
602,602,4992f3e2-3ca3-4468-b00a-5b1689e22d9e,2008-10-27,c7972562-7f73-49e1-a70e-79ef22eeab42,185345009,Encounter for symptom,43878008.0,Streptococcal sore throat (disorder),2/23/09,119 days,...,1,1,91,02038,108815.0,0,1,22.95,1,108 days 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17583,17583,adeff2e0-fbd0-4a91-ac7c-39c89eaccf6d,2010-02-24,b9b6bf1d-45f7-4d7e-bdeb-e8e5f61c6075,185349003,Outpatient Encounter,,,5/28/10,93 days,...,1,1,67,02038,108815.0,0,1,39.20,1,0 days 00:00:00
18671,18671,a032e928-49b7-4976-8ce1-26e94341846d,2016-11-27,aaa4c718-2f48-4c13-9ad0-d287cf280824,50849002,Emergency Encounter,,,10/22/17,329 days,...,1,1,74,02746,28351.0,1,1,31.93,0,-1
19024,19024,d086cedf-817c-4918-9a4b-699b815cc9fb,2016-11-11,26b5fb68-32a2-47fe-a1e4-12db200d5b68,50849002,Emergency room admission,,,1/25/17,75 days,...,1,1,90,02171,73280.0,0,1,30.72,1,175 days 00:00:00
19144,19144,21f555bd-d264-4f01-ab9f-554c768f2d83,2015-12-15,093c5b43-9b43-40a4-8cb6-ba2292cba7c0,185349003,Outpatient Encounter,,,3/6/16,82 days,...,1,1,98,02141,73322.0,0,1,25.18,1,0 days 00:00:00


In [297]:
encounters[encounters['REASONDESCRIPTION']=='Pneumonia']

Unnamed: 0.1,Unnamed: 0,ID,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION,DEATHDATE,DEATHDIFFERENCE,...,MINORITY,DEATH,AGE,HOMEZIP,INCOME,LOWINCOME,SENIOR,BMI,FLU,FLU_DIFF
6063,6063,e7d1b9ab-a5c4-4d0b-ac86-f6582496195e,2014-01-25,cdbe1927-b954-4f37-9294-ac0ca277d147,34285007,Hospital admission,233604007.0,Pneumonia,2/6/14,12 days,...,1,1,92,2283,,1,1,25.15,1,121 days 00:00:00
7120,7120,977f504f-1ec5-45c7-aa16-b0010a62745f,2015-02-02,8b3cf0c5-affd-4cbb-a397-e63484ba4d47,34285007,Hospital admission,233604007.0,Pneumonia,2/15/15,13 days,...,1,1,78,2379,80709.0,0,1,37.09,1,101 days 00:00:00
11942,11942,beb50d63-b424-4b33-8315-9c1e4a7ad479,2016-07-02,e6e63adb-9c2a-4f7a-ade9-3b89ff862a03,34285007,Hospital admission,233604007.0,Pneumonia,7/10/16,8 days,...,1,1,80,1844,72664.0,0,1,35.13,0,-1
13148,13148,2925ea65-0f13-41e7-9e5e-d6165167f831,2015-12-10,af7c4d94-3524-4c7b-b8c0-fb774c9e5b6d,34285007,Hospital admission,233604007.0,Pneumonia,12/21/15,11 days,...,1,1,84,2334,,1,1,35.71,0,-1


In [301]:
d = encounters[encounters['DEATH']==1]
locations = d.groupby('HOMEZIP').count()
locations.sort_values('PATIENT').tail(10)

Unnamed: 0_level_0,Unnamed: 0,ID,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION,DEATHDATE,DEATHDIFFERENCE,...,RACE,MINORITY,DEATH,AGE,INCOME,LOWINCOME,SENIOR,BMI,FLU,FLU_DIFF
HOMEZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2783,2,2,2,2,2,2,0,0,2,2,...,2,2,2,2,0,2,2,2,2,2
2171,2,2,2,2,2,2,0,0,2,2,...,2,2,2,2,2,2,2,2,2,2
1833,2,2,2,2,2,2,0,0,2,2,...,2,2,2,2,2,2,2,2,2,2
1826,2,2,2,2,2,2,0,0,2,2,...,2,2,2,2,2,2,2,2,2,2
2038,2,2,2,2,2,2,1,1,2,2,...,2,2,2,2,2,2,2,2,2,2
1203,2,2,2,2,2,2,1,1,2,2,...,2,2,2,2,0,2,2,2,2,2
2703,3,3,3,3,3,3,1,1,3,3,...,3,3,3,3,3,3,3,3,3,3
2151,3,3,3,3,3,3,0,0,3,3,...,3,3,3,3,3,3,3,3,3,3
2301,3,3,3,3,3,3,1,1,3,3,...,3,3,3,3,3,3,3,3,3,3
2081,4,4,4,4,4,4,1,1,4,4,...,4,4,4,4,4,4,4,4,4,4


In [307]:
m = encounters[encounters['YearDeath']==1]
p = m[m['MINORITY']==1]
o = p[p['SENIOR']==1]
o.groupby('HOMEZIP').count()

Unnamed: 0_level_0,Unnamed: 0,ID,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION,DEATHDATE,DEATHDIFFERENCE,...,RACE,MINORITY,DEATH,AGE,INCOME,LOWINCOME,SENIOR,BMI,FLU,FLU_DIFF
HOMEZIP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1027,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
1056,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
1057,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
1082,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
1128,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1226,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
1266,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
1351,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
1420,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
1505,1,1,1,1,1,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1


In [310]:
len(allergiesCSV['PATIENT'].unique())

157

In [315]:
al = allergiesCSV.drop_duplicates('PATIENT', keep='last')
al['RACE'] = al['PATIENT'].map(lambda x: patientsCSV[patientsCSV['ID']==x]['RACE'].values[0])





al.groupby('RACE').count()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
asian,10,10,1,10,10,10,10
black,12,12,1,12,12,12,12
black or african american,4,4,0,4,4,4,4
hispanic,113,113,4,113,113,113,113
white,18,18,0,18,18,18,18


In [317]:
encountersCSV['REASONDESCRIPTION'].unique()

array([nan, 'Acute bronchitis (disorder)', 'Cystitis', 'Normal pregnancy',
       'Acute viral pharyngitis (disorder)', 'Fracture of rib',
       'Viral sinusitis (disorder)', 'Fracture of forearm',
       'Child attention deficit disorder', 'Sinusitis (disorder)',
       'Fracture of clavicle', 'Concussion with loss of consciousness',
       'Fetus with chromosomal abnormality', 'Otitis media',
       'Closed fracture of hip', 'Streptococcal sore throat (disorder)',
       'Acute bacterial sinusitis (disorder)',
       'Concussion with no loss of consciousness',
       "Alzheimer's disease (disorder)", 'Asthma',
       'Suspected lung cancer (situation)',
       'Small cell carcinoma of lung (disorder)',
       'Primary small cell malignant neoplasm of lung  TNM stage 4 (disorder)',
       'Injury of tendon of the rotator cuff of shoulder',
       'Fracture subluxation of wrist',
       'Localized  primary osteoarthritis of the hand',
       'Osteoarthritis of knee', 'Primary fibromya

In [320]:
encountersCSV[encountersCSV['REASONDESCRIPTION']=='Tubal pregnancy']







Unnamed: 0.1,Unnamed: 0,ID,DATE,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION,DEATHDATE,DEATHDIFFERENCE,DEATH
10789,10789,e78c5045-46e2-4936-9aa1-946756b15b7e,2008-07-13,571853a3-4710-469a-9cad-57f0e5d2f977,305408004,Admission to surgical department,79586000.0,Tubal pregnancy,1970-01-01,NaT,0
10790,10790,837079af-5982-4c4b-8076-b0acb5a82153,2008-07-22,571853a3-4710-469a-9cad-57f0e5d2f977,424619006,Prenatal visit,79586000.0,Tubal pregnancy,1970-01-01,NaT,0
