<b>Project</b>: Population segmentation and transition probability estimation using data on health and health-related social service needs from the US Health and Retirement Study <br>
<b>Project section</b>: Population segmentation <br>
<b>Python version</b>: 3.9.7 <br>
<b>File name</b>: 03_Complicating_Factors.ipynb <br>
<b>Data required</b>: data_GI.csv <br>
<b>Outcome</b>: Allocates a Complicating Factor (CF) status to each respondent <br>
<b>Author</b>: Lize Duminy<br>
<b>Date</b>: 2023.03.19 

# Instructions for use

1. This script requires the dataset __data_GI.csv__, generated by running __02_Global_Impressions_Segmentation.ipynb__.
2. To execute this script, replace the filepath assigned to the variable __global_path__ (currently _C:/Users/LizeDuminy/data/HRS/data/_) with the filepath of your designated data folder. The assignment is located in section __1.1. USER INPUT REQUIRED: Replace this filepath with the filepath of your designated folder for data__ below. 

# 1. Preparation 

#### Import packages

In [1]:
# Import packages
import pandas as pd # for dealing with dataframes
import numpy as np # for numerical calculations
import os
import os.path

### 1.1. USER INPUT REQUIRED: Replace this filepath with the filepath of your designated folder for data

In [2]:
# Folder that holds the input CSV and receives the output CSV.
# Replace the string below with the path of your own data folder.
global_path = os.path.join(
    "C:/Users/LizeDuminy/data/HRS/data/",
)
print(global_path)

C:/Users/LizeDuminy/data/HRS/data/


### 1.2. Load data

#### Load "data_GI.csv"

In [3]:
# Read "data_GI.csv" from the data folder into the working dataframe and
# report its dimensions as (rows, columns).
filepath = os.path.join(global_path, "data_GI.csv")
data = pd.read_csv(filepath_or_buffer=filepath)
print(data.shape)

(60376, 158)


# 2. Functional Assessment

__Output variable:__

    fun_assess  #functional assessment
 
__Data structure of data.fun_assess:__

     1  Has caregiver dependence
     0  No caregiver dependence
    -3  Unknown
    -6  Deceased

In [4]:
# Row masks for each category. A comparison against NaN evaluates to False,
# so a row with no usable ADL/IADL value matches neither of the first two.
no_deficit = data['adl6h'].eq(0) | data['iadl5a'].eq(0)
has_deficit = data['adl6h'].gt(0) | data['iadl5a'].gt(0)
deceased = data['alive'].eq(5)

# Start from "unknown" (-3) and apply the rules in increasing priority:
# each later rule overwrites the earlier ones on the rows it matches.
#   0  -> at least one of the ADL / IADL scores is zero
#   1  -> at least one ADL or IADL deficit (score above zero)
#  -6  -> deceased
data['fun_assess'] = -3
for rows, code in ((no_deficit, 0), (has_deficit, 1), (deceased, -6)):
    data.loc[rows, 'fun_assess'] = code

print("Summary statistics of data.fun_assess:")
print(data['fun_assess'].value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.fun_assess:
-6     8116
 0    43661
 1     8599
Name: fun_assess, dtype: int64
See data structure above code


# 3. Nursing and rehabilitation type skilled task need

__Output variable:__

    nrs_rhab  #nursing and rehabilitation skills need
 
__Data structure of data.nrs_rhab:__

     1  Has need
     0  Does not have need
    -3  Unknown
    -6  Deceased

In [5]:
# Translation of the raw home-care answer (homcar) into nrs_rhab codes:
# "0.no" -> 0 (no need), "1.yes" -> 1 (has need).
homcar_codes = {"0.no": 0, "1.yes": 1}

# Every row starts as unknown (-3); rows whose answer is neither of the two
# strings above keep that value.
data['nrs_rhab'] = -3
for answer, code in homcar_codes.items():
    data.loc[data['homcar'].eq(answer), 'nrs_rhab'] = code

# Deceased respondents are flagged last so that -6 overrides any answer
data.loc[data['alive'].eq(5), 'nrs_rhab'] = -6

print("Summary statistics of data.nrs_rhab:")
print(data['nrs_rhab'].value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.nrs_rhab:
-6     8116
-3       79
 0    48108
 1     4073
Name: nrs_rhab, dtype: int64
See data structure above code


# 4. Disruptive behavioural issues

__Output variable:__

    disrupt  #disruptive behavioural issues
 
__Data structure of data.disrupt:__

     1  Has issues
     0  Does not have issues
    -3  Unknown
    -6  Deceased

In [6]:
# Translation of the raw answer in the psyche column into disrupt codes:
# "0.no" -> 0 (no issues), "1.yes" -> 1 (has issues).
psyche_codes = {"0.no": 0, "1.yes": 1}

# Every row starts as unknown (-3); rows whose answer is neither of the two
# strings above keep that value.
data["disrupt"] = -3
for answer, code in psyche_codes.items():
    data.loc[data['psyche'].eq(answer), 'disrupt'] = code

# Deceased respondents are flagged last so that -6 overrides any answer
data.loc[data['alive'].eq(5), 'disrupt'] = -6

print("Summary statistics of data.disrupt:")
print(data['disrupt'].value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.disrupt:
-6     8116
 0    43000
 1     9260
Name: disrupt, dtype: int64
See data structure above code


# 5. Frequent transitions between inpatient and outpatient care

__Output variable:__

    freq_trans  #frequent transitions
 
__Data structure of data.freq_trans:__

     1  Experiences frequent transitions between care venues
     0  Does not experience frequent transitions
    -3  Unknown
    -6  Deceased

In [7]:
# hsptim values from wave 8, the earliest wave used, serve as the reference
# distribution for the cut-off.
# NOTE(review): hsptim is presumably the number of hospital stays - confirm
# against the codebook.
wave8_hsptim = data.loc[data['wave'] == 8, 'hsptim']

# Cut-off point: 95th percentile of the reference distribution. Passing a
# scalar quantile returns a plain float, which avoids calling int() on a
# one-element Series (deprecated since pandas 2.1).
cutoff_value = wave8_hsptim.quantile(0.95)
if pd.isna(cutoff_value):
    raise ValueError(
        "Cannot compute the hsptim cut-off: wave 8 has no non-missing hsptim values"
    )

# Truncate towards zero to obtain the integer cut-off
cut_hosp = int(cutoff_value)

# Store the same (single, population-wide) cut-off on every row
data['cut_hosp'] = cut_hosp

#define variable as unknown; rows with a missing hsptim keep this value
data['freq_trans'] = -3

#Identify people who do not transition frequently
data.loc[data['hsptim'] <= cut_hosp, 'freq_trans'] = 0

#Identify people who do transition frequently
data.loc[data['hsptim'] > cut_hosp, 'freq_trans'] = 1

#Identify deceased individuals (applied last so that -6 takes precedence)
data.loc[data['alive'] == 5, 'freq_trans'] = -6

print("Summary statistics of data.freq_trans:")
print(data['freq_trans'].value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.freq_trans:
-6     8116
-3      222
 0    49799
 1     2239
Name: freq_trans, dtype: int64
See data structure above code


# 6. Polypharmacy

__Output variable:__

    polypharm  #5 or more medications prescribed per day
 
__Data structure of data.polypharm:__

     1  Has 5 or more medications prescribed per day
     0  Less than 5 medications prescribed
    -3  Unknown
    -6  Deceased

In [8]:
# Columns holding the individual medication items that feed the count
ppharm = [
    "C006", "C011", "C032", "C037", "C042", "C046", "C050", "C282", "C060",
    "N360", "N361", "N362", "N363", "N364", "N365", "N283",
]

# Per respondent, count how many of these items are coded exactly 1.
# Items that are missing or hold any other code contribute nothing.
# NOTE(review): code 1 presumably means "takes this medication" - confirm
# against the codebook.
data["ppharm_n"] = data[ppharm].eq(1).sum(axis=1)

# Start from unknown (-3).
# NOTE(review): ppharm_n is never missing, so every row is overwritten by one
# of the two rules below and -3 cannot survive.
data['polypharm'] = -3

# Fewer than five items coded 1 -> 0, five or more -> 1
five_or_more = data['ppharm_n'] >= 5
data.loc[~five_or_more, 'polypharm'] = 0
data.loc[five_or_more, 'polypharm'] = 1

# Deceased respondents are flagged last so that -6 takes precedence
data.loc[data['alive'].eq(5), 'polypharm'] = -6

print("Summary statistics of data.polypharm:")
print(data['polypharm'].value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.polypharm:
-6     8116
 0    43330
 1     8930
Name: polypharm, dtype: int64
See data structure above code


# 8. Assign Complicating Factors

### 8.1. Count the number of complications per respondent

__Output variable:__

    CF_count  #Number of complicating factors per respondent
 
__Data structure of data.CF_count:__

     5  5 Complicating Factors
     4  4 Complicating Factors
     3  3 Complicating Factors
     2  2 Complicating Factors
     1  1 Complicating Factor
     0  0 Complicating Factors

In [9]:
# The five complicating-factor indicators derived in the sections above
cf_columns = ["fun_assess", "nrs_rhab", "disrupt", "freq_trans", "polypharm"]

# Number of indicators per row that are coded 1 (factor present)
data["CF_count"] = data[cf_columns].eq(1).sum(axis=1)

print("Summary statistics of data.CF_count:")
print(data["CF_count"].value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.CF_count:
0    40135
1    11737
2     5294
3     2263
4      748
5      199
Name: CF_count, dtype: int64
See data structure above code


### 8.2. Count the number of factors where the complication-level is unknown per individual


__Output variable:__

    CF_unknowns  #Number of unknown complicating factors per respondent
 
__Data structure of data.CF_unknowns:__

     5  5 Unknown complicating Factors
     4  4 Unknown complicating Factors
     3  3 Unknown complicating Factors
     2  2 Unknown complicating Factors
     1  1 Unknown complicating Factor
     0  0 Unknown complicating Factors

In [10]:
# The five complicating-factor indicators derived in the sections above
cf_columns = ["fun_assess", "nrs_rhab", "disrupt", "freq_trans", "polypharm"]

# Number of indicators per row that are coded -3 (status unknown)
data["CF_unknowns"] = data[cf_columns].eq(-3).sum(axis=1)

print("Summary statistics of data.CF_unknowns:")
print(data["CF_unknowns"].value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.CF_unknowns:
0    60139
1      173
2       64
Name: CF_unknowns, dtype: int64
See data structure above code


 ### 8.3. Define Complicating Factor status

__Output variable:__

    CF  #Complicating Factor status
 
__Data structure of data.CF:__

     1  Has at least one complicating factor
     0  No complicating factors
    -3  Unknown
    -6  Deceased

In [11]:
#Assign each respondent an unknown CF status
data["CF"] = -3

#If respondent has one or more confirmed CF, assign 1
#(this holds even when some of the other factors are unknown)
data.loc[data["CF_count"] >= 1, "CF"] = 1

#If respondent has no confirmed CF AND the status for all CFs are known, assign 0
#Rows with no confirmed CF but at least one unknown factor stay at -3
data.loc[(data["CF_count"] == 0) & (data["CF_unknowns"] == 0), "CF"] = 0

#Identify deceased individuals (applied last so that -6 takes precedence)
data.loc[data['alive'] == 5, 'CF'] = -6

#Show distribution of variable
print()
# The label names data.CF, the variable that is actually summarised here
print("Summary statistics of data.CF:")
print(data["CF"].value_counts().sort_index())
print("See data structure above code")


Summary statistics of data.CF_unknowns:
-6     8116
-3       87
 0    31932
 1    20241
Name: CF, dtype: int64
See data structure above code


# 9. GI and CF Segmentation Statistics

In [12]:
# Wave 8: distribution of GI segments by CF status, followed by the number of
# rows whose GI segment or CF status is coded unknown (-3).
in_wave = data['wave'] == 8
for heading, cf_code in (
    ("GI segments of respondents with no CFs in wave 8", 0),
    ("GI segments of respondents with CFs in wave 8", 1),
):
    print(heading)
    print(data.loc[in_wave & (data['CF'] == cf_code), 'GI'].value_counts().sort_index())
    print(" ")

unknown_rows = in_wave & ((data['CF'] == -3) | (data['GI'] == -3))
print("Unknown GI segment or CF status in wave 8")
# count() tallies the non-missing hhidpn values among the selected rows
print(data.loc[unknown_rows, 'hhidpn'].count())

GI segments of respondents with no CFs in wave 8
-3.0       6
 1.0     730
 2.0    1201
 3.0    2174
 4.0     351
 5.0     376
Name: GI, dtype: int64
 
GI segments of respondents with CFs in wave 8
-3.0      3
 1.0     86
 2.0    184
 3.0    938
 4.0    262
 5.0    851
Name: GI, dtype: int64
 
Unknown GI segment or CF status in wave 8
14


In [13]:
# Wave 9: distribution of GI segments by CF status, followed by the number of
# rows whose GI segment or CF status is coded unknown (-3).
in_wave = data['wave'] == 9
for heading, cf_code in (
    ("GI segments of respondents with no CFs in wave 9", 0),
    ("GI segments of respondents with CFs in wave 9", 1),
):
    print(heading)
    print(data.loc[in_wave & (data['CF'] == cf_code), 'GI'].value_counts().sort_index())
    print(" ")

unknown_rows = in_wave & ((data['CF'] == -3) | (data['GI'] == -3))
print("Unknown GI segment or CF status in wave 9")
# count() tallies the non-missing hhidpn values among the selected rows
print(data.loc[unknown_rows, 'hhidpn'].count())

GI segments of respondents with no CFs in wave 9
-3.0       2
 1.0     545
 2.0     962
 3.0    1968
 4.0     297
 5.0     356
Name: GI, dtype: int64
 
GI segments of respondents with CFs in wave 9
1.0     68
2.0    168
3.0    927
4.0    281
5.0    840
Name: GI, dtype: int64
 
Unknown GI segment or CF status in wave 9
9


In [14]:
# Wave 10: distribution of GI segments by CF status, followed by the number of
# rows whose GI segment or CF status is coded unknown (-3).
in_wave = data['wave'] == 10
for heading, cf_code in (
    ("GI segments of respondents with no CFs in wave 10", 0),
    ("GI segments of respondents with CFs in wave 10", 1),
):
    print(heading)
    print(data.loc[in_wave & (data['CF'] == cf_code), 'GI'].value_counts().sort_index())
    print(" ")

unknown_rows = in_wave & ((data['CF'] == -3) | (data['GI'] == -3))
print("Unknown GI segment or CF status in wave 10")
# count() tallies the non-missing hhidpn values among the selected rows
print(data.loc[unknown_rows, 'hhidpn'].count())

GI segments of respondents with no CFs in wave 10
-3.0       6
 1.0    1046
 2.0    1304
 3.0    2272
 4.0     391
 5.0     360
Name: GI, dtype: int64
 
GI segments of respondents with CFs in wave 10
1.0     100
2.0     259
3.0    1358
4.0     352
5.0    1005
Name: GI, dtype: int64
 
Unknown GI segment or CF status in wave 10
18


In [15]:
# Wave 11: distribution of GI segments by CF status, followed by the number of
# rows whose GI segment or CF status is coded unknown (-3).
in_wave = data['wave'] == 11
for heading, cf_code in (
    ("GI segments of respondents with no CFs in wave 11", 0),
    ("GI segments of respondents with CFs in wave 11", 1),
):
    print(heading)
    print(data.loc[in_wave & (data['CF'] == cf_code), 'GI'].value_counts().sort_index())
    print(" ")

unknown_rows = in_wave & ((data['CF'] == -3) | (data['GI'] == -3))
print("Unknown GI segment or CF status in wave 11")
# count() tallies the non-missing hhidpn values among the selected rows
print(data.loc[unknown_rows, 'hhidpn'].count())

GI segments of respondents with no CFs in wave 11
-3.0       5
 1.0     772
 2.0    1142
 3.0    1968
 4.0     582
 5.0     272
Name: GI, dtype: int64
 
GI segments of respondents with CFs in wave 11
-3.0       2
 1.0      86
 2.0     236
 3.0    1348
 4.0     459
 5.0    1059
Name: GI, dtype: int64
 
Unknown GI segment or CF status in wave 11
18


In [16]:
# Wave 12: distribution of GI segments by CF status, followed by the number of
# rows whose GI segment or CF status is coded unknown (-3).
in_wave = data['wave'] == 12
for heading, cf_code in (
    ("GI segments of respondents with no CFs in wave 12", 0),
    ("GI segments of respondents with CFs in wave 12", 1),
):
    print(heading)
    print(data.loc[in_wave & (data['CF'] == cf_code), 'GI'].value_counts().sort_index())
    print(" ")

unknown_rows = in_wave & ((data['CF'] == -3) | (data['GI'] == -3))
print("Unknown GI segment or CF status in wave 12")
# count() tallies the non-missing hhidpn values among the selected rows
print(data.loc[unknown_rows, 'hhidpn'].count())

GI segments of respondents with no CFs in wave 12
-3.0       2
 1.0     667
 2.0     960
 3.0    1914
 4.0     505
 5.0     275
Name: GI, dtype: int64
 
GI segments of respondents with CFs in wave 12
-3.0       4
 1.0      85
 2.0     243
 3.0    1300
 4.0     441
 5.0    1064
Name: GI, dtype: int64
 
Unknown GI segment or CF status in wave 12
20


In [17]:
# Wave 13: distribution of GI segments by CF status, followed by the number of
# rows whose GI segment or CF status is coded unknown (-3).
in_wave = data['wave'] == 13
for heading, cf_code in (
    ("GI segments of respondents with no CFs in wave 13", 0),
    ("GI segments of respondents with CFs in wave 13", 1),
):
    print(heading)
    print(data.loc[in_wave & (data['CF'] == cf_code), 'GI'].value_counts().sort_index())
    print(" ")

unknown_rows = in_wave & ((data['CF'] == -3) | (data['GI'] == -3))
print("Unknown GI segment or CF status in wave 13")
# count() tallies the non-missing hhidpn values among the selected rows
print(data.loc[unknown_rows, 'hhidpn'].count())

GI segments of respondents with no CFs in wave 13
-3.0       4
 1.0     863
 2.0     971
 3.0    2003
 4.0     447
 5.0     259
Name: GI, dtype: int64
 
GI segments of respondents with CFs in wave 13
1.0     114
2.0     248
3.0    1499
4.0     398
5.0    1039
Name: GI, dtype: int64
 
Unknown GI segment or CF status in wave 13
28


In [18]:
# Wave 14: distribution of GI segments by CF status, followed by the number of
# rows whose GI segment or CF status is coded unknown (-3).
in_wave = data['wave'] == 14
for heading, cf_code in (
    ("GI segments of respondents with no CFs in wave 14", 0),
    ("GI segments of respondents with CFs in wave 14", 1),
):
    print(heading)
    print(data.loc[in_wave & (data['CF'] == cf_code), 'GI'].value_counts().sort_index())
    print(" ")

unknown_rows = in_wave & ((data['CF'] == -3) | (data['GI'] == -3))
print("Unknown GI segment or CF status in wave 14")
# count() tallies the non-missing hhidpn values among the selected rows
print(data.loc[unknown_rows, 'hhidpn'].count())

GI segments of respondents with no CFs in wave 14
-3.0       3
 1.0     678
 2.0     859
 3.0    1805
 4.0     403
 5.0     226
Name: GI, dtype: int64
 
GI segments of respondents with CFs in wave 14
1.0      97
2.0     228
3.0    1283
4.0     405
5.0     921
Name: GI, dtype: int64
 
Unknown GI segment or CF status in wave 14
17


# 10. Export to csv

#### Write "data_GI_CF.csv"

Write dataset to be used in 04_Survival_Analysis.ipynb

In [19]:
# Write the dataframe, including the columns added above, to
# "data_GI_CF.csv" in the data folder; the row index is not written.
filepath_data_GI_CF = os.path.join(global_path, "data_GI_CF.csv")
data.to_csv(path_or_buf=filepath_data_GI_CF, index=False)