<b>Project</b>: Population segmentation and transition probability estimation using data on health and health-related social service needs from the US Health and Retirement Study <br>
<b>Project section</b>: Population segmentation <br>

<b>Version</b>: ???Python 3.68.01 <br>

<b>File name</b>: 02_Global_Impressions_Segmentation.ipynb <br>
<b>Data required</b>: dataHRS.csv <br>
<b>Outcome</b>: Allocates a Global Impressions (GI) segment to each respondent <br>

<b>Author</b>: Lize Duminy<br>
<b>Date</b>: 2023.03.19 

# Instructions for use

1. This script requires the dataset __dataHRS.csv__, generated by running __01_Extraction.ipynb__.
2. To execute this script, replace the filepath of the variable __global_path__ in the code below (currently _C:/Users/LizeDuminy/data/HRS/data/_) with the filepath of your designated folder for raw data in section __1.1. USER INPUT REQUIRED: Replace this filepath with the filepath of your designated folder for raw data__ below. 

# 1. Preparation

#### Import packages

In [1]:
#Import packages
import pandas as pd #for dealing with dataframes
import numpy as np  #for numerical calculations
import os
import os.path

#import matplotlib.pyplot as plt # for plotting
#import seaborn as sns # for plotting

### 1.1. USER INPUT REQUIRED: Replace this filepath with the filepath of your designated folder for data

In [2]:
global_path=os.path.join("C:/Users/LizeDuminy/data/HRS/data/")
print(global_path)

C:/Users/LizeDuminy/data/HRS/data/


#### Load "dataHRS.csv"

In [3]:
filepath = os.path.join(global_path , "dataHRS.csv")
data = pd.read_csv(filepath)
print(data.shape)

(60376, 98)


# 2. Construct indicators used in global impressions segmentation 

### 2.1 Non-life-threatening chronic conditions

__Diagnosis variables:__

    'hibp',    #High blood pressure
    'diab',    #Diabetes
    'arthr',   #Arthritis or rheumatism

__Data structure of diagnosis variables:__

    0.no
    1.yes
    3.disp prev record and has cond
    4.disp prev record and no cond

__Output variable:__

    cond_nlt  #non-life-threateing chronic condition
 
__Data structure of data.cond_nlt:__

     1  Has non-life-threateing chronic condition
     0  Does not have non-life-threateing chronic condition
    -3  Unknown
    -6  Deceased

In [4]:
#Evaluate if the answer was "1.yes" to any condition 
data['x1'] = (data.hibp=="1.yes")|(data.diab=="1.yes")|(data.arthr=="1.yes")

#Evaluate if the answer was "3.disp prev record and has cond" to any condition 
data['x3'] = (data.hibp=="3.disp prev record and has cond")|(data['diab']=="3.disp prev record and has cond")|(data['arthr']=="3.disp prev record and has cond")

#Evaluate if the answer was either "0.no" or "4.disp prev record and no cond" per condition 
data['y_hibp'] = (data.hibp=="0.no")|(data.hibp=="4.disp prev record and no cond")
data['y_diab'] = (data['diab']=="0.no")|(data['diab']=="4.disp prev record and no cond")
data['y_arthr'] = (data['arthr']=="0.no")|(data['arthr']=="4.disp prev record and no cond")

#Evaluate if individual answered "no" to all three questions
data['y'] = data['y_hibp'] & data['y_diab'] & data['y_arthr']

#Initialize variable with all entries having an unknown non-life-threatening chronic disease status
data['cond_nlt'] = -3

#Assign non-life-threatening chronic condition if participant answered affirmative to any chronic condition
data.loc[(data.x1|data.x3), 'cond_nlt'] = 1

#Assign non-life-threatening chronic condition if participant reported no to all potential non-life-threatening conditions
data.loc[data.y, 'cond_nlt'] = 0

#Identifie deceased individuals
data.loc[data['alive'] == 5, 'cond_nlt'] = -6

print("Summary statistics of data.cond_nlt:")
print(data.cond_nlt.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.cond_nlt:
-6     8116
-3       29
 0     9169
 1    43062
Name: cond_nlt, dtype: int64
See data structure above code


### 2.2. Life-threatening chronic conditions

__Diagnosis variables:__

    'cancr',   #Cancer (excl. skin cancer)
    'lung',    #Chronic lung disease such as chronic bronchitis or emphysema
    'heart',   #Heart attack, coronary heart disease, angina, congestive heart failure, or other heart problems
    'strok',   #Stroke or cerebral vascular disease

__Data structure of diagnosis variables:__

    0.no
    1.yes
    2.tia/possible strok
    3.disp prev record and has cond
    4.disp prev record and no cond
    
__Output variable:__

    cond_lt  #life-threateing chronic condition
 
__Data structure of cond_lt:__

     1  Has life-threateing chronic condition
     0  Does not have life-threateing chronic condition
    -3  Unknown
    -6  Deceased

In [5]:
#Evaluate if the answer was "1.yes" to any condition 
data['x1'] = (data.cancr=="1.yes")|(data.lung=="1.yes")|(data.heart=="1.yes")|(data.strok=="1.yes")|(data.strok=="2.tia/possible strok")

#Evaluate if the answer was "3.disp prev record and has cond" to any condition 
data['x3'] = (data.cancr=="3.disp prev record and has cond")|(data.lung=="3.disp prev record and has cond")|(data.heart=="3.disp prev record and has cond")|(data.strok=="3.disp prev record and has cond")

#Evaluate if the answer was either "0.no" or "4.disp prev record and no cond" per condition 
data['y_cancr'] = (data.cancr=="0.no")|(data.cancr=="4.disp prev record and no cond")
data['y_lung'] = (data.lung=="0.no")|(data.lung=="4.disp prev record and no cond")
data['y_heart'] = (data.heart=="0.no")|(data.heart=="4.disp prev record and no cond")
data['y_strok'] = (data.strok=="0.no")|(data.strok=="4.disp prev record and no cond")

#Evaluate if individual answered "no" to all four questions
data['y'] = data['y_cancr'] & data['y_lung'] & data['y_heart'] & data['y_strok']

#Initialize variable with all entries having an unknown non-life-threatening chronic disease status
data['cond_lt'] = -3

#Assign non-life-threatening chronic condition if participant answered affirmative to any chronic condition
data.loc[(data.x1|data.x3), 'cond_lt'] = 1

#Assign non-life-threatening chronic condition if participant reported no to all potential non-life-threatening conditions
data.loc[data.y, 'cond_lt'] = 0

#Identifie deceased individuals
data.loc[data['alive'] == 5, 'cond_lt'] = -6

print("Summary statistics of data.cond_lt:")
print(data.cond_lt.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.cond_lt:
-6     8116
-3      117
 0    29980
 1    22163
Name: cond_lt, dtype: int64
See data structure above code


### 2.3. Depression

__Output variable:__

    depr  #depression status

__Data structure of depr:__

     1  Depressed
     0  Not depressed
    -3  Unknown
    -6  Deceased

In [6]:
#Initialize variable with all entries having an unknown depression status
data['depr'] = -3

#Individual is considered depressed with a CES-D of 4 or higher
data.loc[(data.cesd >= 4), 'depr'] = 1

#Individual is considered not depressed with a CES-D of less than 4
data.loc[(data.cesd < 4), 'depr'] = 0

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'depr'] = -6

print("Summary statistics of data.depr:")
print(data.depr.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.depr:
-6     8116
-3        4
 0    44442
 1     7814
Name: depr, dtype: int64
See data structure above code


### 2.4. Self-reported health

__Output variable:__

    sr_health  #self-reported health

__Data structure of sr_health:__

     1  "1.excellent"/"2.very good"
     2  "3.good"
     3  "4.fair"/"5.poor"
    -3  Unknown
    -6  Deceased

In [7]:
#Qualification for 1: "1.excellent"/"2.very good"
data['sr_1'] = (data.shlt=="1.excellent")|(data.shlt=="2.very good")

#Qualification for 2: "3.good"
data['sr_2'] = (data.shlt=="3.good")

#Qualification for 3: "4.fair"/"5.poor"
data['sr_3'] = (data.shlt=="4.fair")|(data.shlt=="5.poor")

#Initialize variable with all entries having an unknown depression status
data['sr_health'] = -3

#Assign 1 to "1.excellent"/"2.very good"
data.loc[data.sr_1, 'sr_health'] = 1

#Assign 2 to "3.good"
data.loc[data.sr_2, 'sr_health'] = 2

#Assign 3 to "4.fair"/"5.poor"
data.loc[data.sr_3, 'sr_health'] = 3

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'sr_health'] = -6

print("Summary statistics of data.sr_health:")
print(data.sr_health.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.sr_health:
-6     8116
-3       40
 1    20149
 2    17280
 3    14791
Name: sr_health, dtype: int64
See data structure above code


### 2.5.  Cognitive Impairment

__Performance-based cognition tests:__
    
    'D174',    #Immediate word recall
    'D184',    #Delayed word recall
    'D142',    #Serial 7 test - 1
    'D143',    #Serial 7 test - 2
    'D144',    #Serial 7 test - 3
    'D145',    #Serial 7 test - 4
    'D146',    #Serial 7 test - 5
    'D155',    #Object naming test: scissors
    'D156',    #Object naming test: cactus

__Output variable:__

    cogn_impaired  #cognitive impairment

__Data structure of cogn_impaired:__

     1  cognitively impaired
     0  not cognitively impaired
    -3  Unknown
    -6  Deceased

In [8]:
#Normalized immediate word recall score 
data['norm_immediate_rec'] = (data.D174-data.D174.min())/(data.D174.max()-data.D174.min())
data.norm_immediate_rec.value_counts().sort_index()

#Normalized delayed word recall score 
data['norm_delayed_rec'] = (data.D184-data.D184.min())/(data.D184.max()-data.D184.min())
data.norm_delayed_rec.value_counts().sort_index()

#Normalized serial 7 score 
data['s_1'] = np.nan 
data['s_2'] = np.nan
data['s_3'] = np.nan
data['s_4'] = np.nan
data['s_5'] = np.nan

data.loc[data['D142']>=0, 's_1'] = 0 
data.loc[data['D143']>=0, 's_2'] = 0
data.loc[data['D144']>=0, 's_3'] = 0
data.loc[data['D145']>=0, 's_4'] = 0
data.loc[data['D146']>=0, 's_5'] = 0

data.loc[data['D142']==93, 's_1'] = 1
data.loc[data['D143']==86, 's_2'] = 1
data.loc[data['D144']==79, 's_3'] = 1
data.loc[data['D145']==72, 's_4'] = 1
data.loc[data['D146']==65, 's_5'] = 1

data['norm_ser7'] = data.s_1 + data.s_2 + data.s_3 + data.s_4 + data.s_5 

data.norm_ser7 = (data.norm_ser7-data.norm_ser7.min())/(data.norm_ser7.max()-data.norm_ser7.min())

#Normalized object naming score
data['on_1'] = np.nan 
data['on_2'] = np.nan

data.loc[data['D155']==5, 'on_1'] = 0 
data.loc[data['D155']==8, 'on_1'] = 0 
data.loc[data['D155']==9, 'on_1'] = 0 

data.loc[data['D156']==5, 'on_2'] = 0
data.loc[data['D156']==8, 'on_2'] = 0
data.loc[data['D156']==9, 'on_2'] = 0

data.loc[data['D155']==1, 'on_1'] = 1 
data.loc[data['D156']==1, 'on_2'] = 1

data['norm_obj_naming'] = data.on_1 + data.on_2

data.norm_obj_naming = (data.norm_obj_naming-data.norm_obj_naming.min())/(data.norm_obj_naming.max()-data.norm_obj_naming.min())

#print(data.norm_obj_naming.value_counts().sort_index())

In [9]:
#Calculate the cognition score per entry
data['cogn_score'] = data[[
     'norm_immediate_rec',
     'norm_delayed_rec',
     'norm_ser7',
     'norm_obj_naming']].mean(axis=1)

In [10]:
#Calculate the cut-off point

#Isolate cognition scores of first wave
cut = data.loc[data['wave']==8, 'cogn_score']

#Calculate cut-off point: 1.5 standard deviations below the mean
cut = cut.mean() - 1.5*cut.std()

print("Cut-off point for normalized cognition score to determine cognitive impairment:")
cut

Cut-off point for normalized cognition score to determine cognitive impairment:


0.3720686418287991

In [11]:
#Assign unknown cognitive impairment status to all individuals
data['cogn_impaired'] = -3

#Cognitively impaired
data.loc[data['cogn_score']<=cut, 'cogn_impaired'] = 1

#Not cognitively impaired
data.loc[data['cogn_score']>cut, 'cogn_impaired'] = 0

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'cogn_impaired'] = -6

print("Summary statistics of data.cogn_impaired:")
print(data.cogn_impaired.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.cogn_impaired:
-6     8116
-3       33
 0    46787
 1     5440
Name: cogn_impaired, dtype: int64
See data structure above code


### 2.6. Frailty: Weight loss

Weight in previous wave minus weight in current wave is ≥10% of weight in wave previous wave or BMI <18.5 kg/m2

__Output variable:__

    frailty_weight_loss  #frailty indicator related to weight loss
 
__Data structure of frailty_weight_loss:__

     1  Meets frailty requirement related to weightloss
     0  Does not meet requirement
    -3  Unknown
    -6  Deceased

In [12]:
#self-reported weight: reduction of 10% or more
data['w_sr'] =  ((data.prev_weight-data.weight)/data.prev_weight)>=0.1
data['w_sr_f'] =  ((data.prev_weight-data.weight)/data.prev_weight)<0.1

#measured weight: reduction of 10% or more
#data['w_m'] =  ((data.prev_pmwght-data.pmwght)/data.prev_pmwght)>=0.1

#self-reported bmi: less than 18.5
data['bmi_sr'] =  (data.bmi)<18.5
data['bmi_sr_f'] =  (data.bmi)>=18.5

#Assign unknown cognitive impairment status to all individuals
data['frailty_weight_loss'] = -3

#Cognitively impaired
data.loc[data.w_sr|data.bmi_sr, 'frailty_weight_loss'] = 1

#Not cognitively impaired
data.loc[data.w_sr_f&data.bmi_sr_f, 'frailty_weight_loss'] = 0

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'frailty_weight_loss'] = -6

data.cogn_impaired.value_counts().sort_index()

print("Summary statistics of data.frailty_weight_loss:")
print(data.frailty_weight_loss.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.frailty_weight_loss:
-6     8116
-3     5600
 0    43361
 1     3299
Name: frailty_weight_loss, dtype: int64
See data structure above code


### 2.7. Frailty: Exhaustion

Yes to either of two CES-D items:
(i) Felt that everything I did was an effort in last week.
(ii) Could not get going in last week.

__Output variable:__

    frailty_exhaustion  #frailty indicator related to exhaustion
 
__Data structure of frailty_exhaustion:__

     1  Meets frailty requirement related to exhaustion
     0  Does not meet requirement
    -3  Unknown
    -6  Deceased

In [13]:
#Assign unknown exhaustion status to all individuals
data['frailty_exhaustion'] = -3

#Exhausted
data.loc[(data.effort=="1.yes")|(data.going=="1.yes"), 'frailty_exhaustion'] = 1

#Not exhausted
data.loc[(data.effort=="0.no")&(data.going=="0.no"), 'frailty_exhaustion'] = 0

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'frailty_exhaustion'] = -6

print("Summary statistics of data.frailty_exhaustion:")
print(data.frailty_exhaustion.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.frailty_exhaustion:
-6     8116
-3      136
 0    34371
 1    17753
Name: frailty_exhaustion, dtype: int64
See data structure above code


### 2.8. Frailty: Low energy expenditure

Frequency of three intensities of activity, lowest quintile (stratified according to sex)

Assumptions:

    An activity lasts 30mins
    
    "1.every day":    7   Activities/week
    "2.>1 per week":  3   Activities/week    
    "3.1 per week":   1   Activity/week   
    "4.l-3 per mon":  0.5 Activity/week    
    "5.never":        0   Activity/week 
    
    light physical activity:  4 kcal/min
    medium physical activity: 5   kcal/min
    vigorous physical activity:  6 kcal/min

__Output variable:__

    frailty_low_energy_exp  #frailty indicator related to energy expenditure
 
__Data structure of frailty_low_energy_exp:__

     1  Meets frailty requirement related to energy expenditure
     0  Does not meet requirement
    -3  Unknown
    -6  Deceased

In [14]:
data['enrg_exp_light'] = np.nan

data.loc[(data.ltactx=="1.every day"), 'enrg_exp_light'] = 7
data.loc[(data.ltactx=="2.>1 per week"), 'enrg_exp_light'] = 3
data.loc[(data.ltactx=="3.1 per week"), 'enrg_exp_light'] = 1
data.loc[(data.ltactx=="4.l-3 per mon"), 'enrg_exp_light'] = 0.5
data.loc[(data.ltactx=="5.never"), 'enrg_exp_light'] = 0

data['enrg_exp_medium'] = np.nan

data.loc[(data.mdactx=="1.every day"), 'enrg_exp_medium'] = 7
data.loc[(data.mdactx=="2.>1 per week"), 'enrg_exp_medium'] = 3
data.loc[(data.mdactx=="3.1 per week"), 'enrg_exp_medium'] = 1
data.loc[(data.mdactx=="4.l-3 per mon"), 'enrg_exp_medium'] = 0.5
data.loc[(data.mdactx=="5.never"), 'enrg_exp_medium'] = 0

data['enrg_exp_vigorous'] = np.nan

data.loc[(data.vgactx=="1.every day"), 'enrg_exp_vigorous'] = 7
data.loc[(data.vgactx=="2.>1 per week"), 'enrg_exp_vigorous'] = 3
data.loc[(data.vgactx=="3.1 per week"), 'enrg_exp_vigorous'] = 1
data.loc[(data.vgactx=="4.l-3 per mon"), 'enrg_exp_vigorous'] = 0.5
data.loc[(data.vgactx=="5.never"), 'enrg_exp_vigorous'] = 0

data.enrg_exp_light = data.enrg_exp_light*4*30
data.enrg_exp_medium = data.enrg_exp_medium*5*30
data.enrg_exp_vigorous = data.enrg_exp_vigorous*6*30

data['enrg_exp'] = data.enrg_exp_light + data.enrg_exp_medium + data.enrg_exp_vigorous

In [15]:
#Calculate the cut-off point

#Isolate energy expenditure scores of first wave
cut_enrg = data.loc[data['wave']==8, ['enrg_exp','gender']]

#Calculate cut-off point: 20th percentile
cut_enrg = (cut_enrg.groupby(['gender']).quantile([.2]))

#Remove the secondary level created
cut_enrg = cut_enrg.groupby(level=0).mean()

#Convert "gender" from an index to a column
cut_enrg = cut_enrg.reset_index(level=0)

#Rename column with cut-off point
cut_enrg = cut_enrg.rename(columns={'enrg_exp': 'cut_enrg'})

#Add the gender-specific cut-off point to each row 
data = pd.merge(data, cut_enrg, how="left", on=["gender"])

In [16]:
#Assign unknown cognitive impairment status to all individuals
data['frailty_low_energy_exp'] = -3

#Low energy expenditure
data.loc[data.enrg_exp<=data.cut_enrg, 'frailty_low_energy_exp'] = 1

#Not low energy expenditure
data.loc[data.enrg_exp>data.cut_enrg, 'frailty_low_energy_exp'] = 0

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'frailty_low_energy_exp'] = -6

print("Summary statistics of data.frailty_low_energy_exp:")
print(data.frailty_low_energy_exp.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.frailty_low_energy_exp:
-6     8116
-3      203
 0    39335
 1    12722
Name: frailty_low_energy_exp, dtype: int64
See data structure above code


### 2.9. Frailty: Slowness

Time to walk 8 ft, converted to time to walk 15 ft. Cutoff criteria according to sex and height remain the same

__Output variable:__

    frailty_slowness  #frailty indicator related to slowness
 
__Data structure of frailty_slowness:__

     1  Meets frailty requirement related to slowness
     0  Does not meet requirement
    -3  Unknown
    -6  Deceased

In [17]:
#Calculate the cut-off point

#Isolate slowness-related data of first wave
walk = data.loc[data['wave']==8, ['gender','height']]


#Define quantile cut-offs per height
quant_height = (walk.groupby(['gender']).quantile([.25, .5, .75]))
quant_height = quant_height.reset_index(level=0)
quant_height = quant_height.reset_index(level=0)

#Rename column with cut-off point
quant_height = quant_height.rename(columns={'index': 'height_quantile'})

#Break dataset up into different quantile cut-offs
quant_height_25 = pd.DataFrame(quant_height[quant_height.height_quantile==0.25])
quant_height_50 = pd.DataFrame(quant_height[quant_height.height_quantile==0.5])
quant_height_75 = pd.DataFrame(quant_height[quant_height.height_quantile==0.75])

#Remove unnecessary columns
quant_height_25 = quant_height_25[["gender", "height"]]
quant_height_50 = quant_height_50[["gender", "height"]]
quant_height_75 = quant_height_75[["gender", "height"]]

#Rename height cut-off value per quantile
quant_height_25 = quant_height_25.rename(columns={'height': 'height_q25'})
quant_height_50 = quant_height_50.rename(columns={'height': 'height_q50'})
quant_height_75 = quant_height_75.rename(columns={'height': 'height_q75'})

#Add the gender-specific height cut-off point to each row 
data = pd.merge(data, quant_height_25, how="left", on=["gender"])
data = pd.merge(data, quant_height_50, how="left", on=["gender"])
data = pd.merge(data, quant_height_75, how="left", on=["gender"])


In [18]:
#Assign each individual to a height and gender stratified category
data['height_cat'] = np.nan

#height greater than .75 height_cat <- 4
data.loc[(data.height >= data.height_q75), 'height_cat'] = 4

#height between .5 and .75 height_cat <- 3
data.loc[(data.height < data.height_q75), 'height_cat'] = 3

#height between .25 and .5 height_cat <- 2
data.loc[(data.height < data.height_q50), 'height_cat'] = 2

#height smaller than .25 height_cat <- 1
data.loc[(data.height < data.height_q25), 'height_cat'] = 1




cut_wlk = data.loc[data['wave']==8, ['gender','height_cat', 'timwlk']]

#Calculate cut-off point: 20th percentile
cut_wlk = (cut_wlk.groupby(['gender', 'height_cat']).quantile([.2]))
cut_wlk = cut_wlk.reset_index(level=0)
cut_wlk = cut_wlk.reset_index(level=0)



data.height_cat.value_counts().sort_index()
cut_wlk

#Rename column with cut-off point
cut_wlk = cut_wlk.rename(columns={'timwlk': 'cut_timwlk'})

#Add the gender-and-height-specific cut-off point to each row 
data = pd.merge(data, cut_wlk, how="left", on=["gender", "height_cat"])


In [19]:
#Assign unknown slowness status to all individuals
data['frailty_slowness'] = -3

#Slowness
data.loc[data.timwlk<data.cut_timwlk, 'frailty_slowness'] = 1

#No slowness
data.loc[data.timwlk>=data.cut_timwlk, 'frailty_slowness'] = 0

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'frailty_slowness'] = -6

print("Summary statistics of data.frailty_slowness:")
print(data.frailty_slowness.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.frailty_slowness:
-6     8116
-3    25983
 0    21596
 1     4681
Name: frailty_slowness, dtype: int64
See data structure above code


### 2.10. Frailty: Weakness

Standard BMI categories determined by the WHO (WHO, 1995)

Underweight < 18.5

Normal: 18.5-24.9

Overweight: 25-29.9

Obese: > 30

__Output variable:__

    frailty_weakness  #frailty indicator related to weakness
 
__Data structure of frailty_weakness:__

     1  Meets frailty requirement related to weakness
     0  Does not meet requirement
    -3  Unknown
    -6  Deceased

In [20]:
#Assign each individual to a height and gender stratified category
data['bmi_cat'] = np.nan

#Obese
data.loc[data.bmi>=30, 'bmi_cat'] = "Obese"

#Overweight
data.loc[data.bmi<30, 'bmi_cat'] = "Overweight"

#Normal
data.loc[data.bmi<25, 'bmi_cat'] = "Normal"

#Underweight
data.loc[data.bmi<18.5, 'bmi_cat'] = "Underweight"

In [21]:
cut_grip = data.loc[data['wave']==8, ['gender','bmi_cat', 'grp']]

#Calculate cut-off point: 20th percentile
cut_grip = cut_grip.groupby(['gender', 'bmi_cat']).quantile([.2])
cut_grip = cut_grip.reset_index(level=0)
cut_grip = cut_grip.reset_index(level=0)

#data.height_cat.value_counts().sort_index()
cut_grip

#Rename column with cut-off point
cut_grip = cut_grip.rename(columns={'grp': 'cut_grp'})

#Add the gender-and-height-specific cut-off point to each row 
data = pd.merge(data, cut_grip, how="left", on=["gender", "bmi_cat"])

In [22]:
#Assign unknown weakness status to all individuals
data['frailty_weakness'] = -3

#Slowness
data.loc[data.grp<data.cut_grp, 'frailty_weakness'] = 1

#No slowness
data.loc[data.grp>=data.cut_grp, 'frailty_weakness'] = 0

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'frailty_weakness'] = -6

print("Summary statistics of data.frailty_weakness:")
print(data.frailty_weakness.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.frailty_weakness:
-6     8116
-3     2459
 0    38610
 1    11191
Name: frailty_weakness, dtype: int64
See data structure above code


### 2.11. Frailty

__Output variable:__

    frailty  #aggregate frailty
 
__Data structure of frailty:__

     1  Meets frailty requirement
     0  Does not meet requirement
    -3  Unknown
    -6  Deceased

In [23]:
#Only entries with pmwgtr > 0 has a frailty measurement ###Does not work for the most recent wave

#Count the number of frailty symptoms identified.
data['frailty_count'] = sum([(data.frailty_weight_loss==1), (data.frailty_exhaustion==1), (data.frailty_low_energy_exp==1), (data.frailty_slowness==1), (data.frailty_weakness==1)])

###Unknown frailty count???
data['frailty_unkn'] = sum([(data.frailty_weight_loss==-3), (data.frailty_exhaustion==-3), (data.frailty_low_energy_exp==-3), (data.frailty_slowness==-3), (data.frailty_weakness==-3)])

#Create frailty indicator but to not assign everyone a -3
data['frailty'] = np.nan

#Assign people who took part in the physical measures section of the survey a -3
data.loc[data.pmwgtr>0, 'frailty'] = -3

#Participated in the physical measures section of the survey and has 3 or more symptoms of frailty
data.loc[(data.pmwgtr>0)&(data.frailty_count>=3), 'frailty'] = 1

#Participated in the physical measures section of the survey and has less than 3 symptoms of frailty
data.loc[(data.pmwgtr>0)&(data.frailty_count==2)&(data.frailty_unkn==0), 'frailty'] = 0
data.loc[(data.pmwgtr>0)&(data.frailty_count==1)&(data.frailty_unkn<=1), 'frailty'] = 0
data.loc[(data.pmwgtr>0)&(data.frailty_count==0)&(data.frailty_unkn<=2), 'frailty'] = 0

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'frailty'] = -6

print("Summary statistics of data.frailty:")
print(data.frailty.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.frailty:
-6.0     8116
-3.0     7086
 0.0    41777
 1.0     3397
Name: frailty, dtype: int64
See data structure above code


# 3. Global Impression Qualification

This section determines whether or not a respondent qualifies for a specific GI segment.

### 3.1. GI segment V: Limited reserve and serious exacerbation

Life-threatening chronic condition
AND Self Reported Health reported as “poor”/”very poor”


__Output variable:__

    GI_V  #GI segment V: Limited reserve and serious exacerbation
 
__Data structure of GI_V:__

     1  Meets requirement to be segmented into GI_V
     0  Does not meet requirement
    -3  Unknown

In [24]:
data['GI_V'] = -3 #np.nan

data.loc[(data.cond_lt == -3)|(data.sr_health == -3), 'GI_V'] = -3
data.loc[(data.cond_lt == 0), 'GI_V'] = 0
data.loc[(data.sr_health == 2)|(data.sr_health == 1), 'GI_V'] = 0
data.loc[(data.cond_lt == 1)&(data.sr_health == 3), 'GI_V'] = 1

#Remove individuals who did not participate in the physical measures section of the survey
data.loc[(data.pmwgtr<=0)|(pd.isnull(data.pmwgtr)), 'GI_V'] = np.nan

print("Summary statistics of data.GI_V:")
print(data.GI_V.value_counts().sort_index())
print("See data structure above code")


Summary statistics of data.GI_V:
-3.0       68
 0.0    43277
 1.0     8915
Name: GI_V, dtype: int64
See data structure above code


### 3.2. GI segment IV: Long course of decline

Frail OR Cognitively impaired

__Output variable:__

    GI_IV  #GI segment IV: Long course of decline
 
__Data structure of GI_IV:__

     1  Meets requirement to be segmented into GI_IV
     0  Does not meet requirement
    -3  Unknown

In [25]:
data['GI_IV'] = np.nan

data.loc[(data.frailty == -3)|(data.cogn_impaired == -3), 'GI_IV'] = -3
data.loc[(data.frailty == 0)|(data.cogn_impaired == 0), 'GI_IV'] = 0
data.loc[(data.frailty == 1)|(data.cogn_impaired == 1), 'GI_IV'] = 1

#Remove individuals who did not participate in the physical measures section of the survey
data.loc[(data.pmwgtr<=0)|(pd.isnull(data.pmwgtr)), 'GI_IV'] = np.nan

print("Summary statistics of data.GI_IV:")
print(data.GI_IV.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.GI_IV:
-3.0       10
 0.0    44041
 1.0     8209
Name: GI_IV, dtype: int64
See data structure above code


### 3.3. GI segment III: Chonic stable

Life-threatening chronic condition
AND Self Reported Health reported as “very good”/”good” or ”fair”

OR

Non-life-threatening chronic condition
AND Self Reported Health reported as “fair” or ”poor”/”very poor”

OR 

Classified as depressed
AND Self Reported Health reported as “fair” or ”poor”/”very poor”


__Output variable:__

    GI_III  #GI segment III: Chonic stable
 
__Data structure of GI_III:__

     1  Meets requirement to be segmented into GI_III
     0  Does not meet requirement
    -3  Unknown

In [26]:
data['GI_III'] = np.nan

data.loc[(data.cond_nlt == -3)|(data.cond_lt == -3)|(data.depr == -3)|(data.sr_health == -3), 'GI_III'] = -3

data.loc[(data.cond_lt == 0), 'GI_III'] = 0
data.loc[(data.cond_lt == 1)&(data.sr_health == 3), 'GI_III'] = 0

data.loc[(data.cond_nlt == 0), 'GI_III'] = 0
data.loc[(data.cond_nlt == 1)&(data.sr_health == 1), 'GI_III'] = 0

data.loc[(data.depr == 0), 'GI_III'] = 0
data.loc[(data.depr == 1)&(data.sr_health == 1), 'GI_III'] = 0

data.loc[(data.cond_lt == 1)&((data.sr_health == 1)|(data.sr_health == 2)), 'GI_III'] = 1

data.loc[(data.cond_nlt == 1)&((data.sr_health == 2)|(data.sr_health == 3)), 'GI_III'] = 1

data.loc[(data.depr == 1)&((data.sr_health == 2)|(data.sr_health == 3)), 'GI_III'] = 1


#Remove individuals who did not participate in the physical measures section of the survey
data.loc[(data.pmwgtr<=0)|(pd.isnull(data.pmwgtr)), 'GI_III'] = np.nan

print("Summary statistics of data.GI_III:")
print(data.GI_III.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.GI_III:
-3.0        4
 0.0    16797
 1.0    35459
Name: GI_III, dtype: int64
See data structure above code


### 3.4. GI segment II: Chonic asymptomatic

__Output variable:__

    GI_II  #GI segment II: Chonic asymptomatic
 
__Data structure of GI_II:__

     1  Meets requirement to be segmented into GI_II
     0  Does not meet requirement
    -3  Unknown

In [27]:
data['GI_II'] = np.nan

data.loc[(data.cond_nlt == -3)|(data.depr == -3)|(data.sr_health == -3), 'GI_II'] = -3

data.loc[(data.cond_nlt == 0), 'GI_II'] = 0

data.loc[(data.cond_nlt == 1)&((data.sr_health == 2)|(data.sr_health == 3)), 'GI_II'] = 0

data.loc[(data.depr == 0), 'GI_II'] = 0

data.loc[(data.depr == 1)&((data.sr_health == 2)|(data.sr_health == 3)), 'GI_II'] = 0

data.loc[(data.cond_nlt == 1)&(data.sr_health == 1), 'GI_II'] = 1

data.loc[(data.depr == 1)&(data.sr_health == 1), 'GI_II'] = 1

#Remove individuals who did not participate in the physical measures section of the survey
data.loc[(data.pmwgtr<=0)|(pd.isnull(data.pmwgtr)), 'GI_II'] = np.nan

print("Summary statistics of data.GI_II:")
print(data.GI_II.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.GI_II:
-3.0        8
 0.0    37732
 1.0    14520
Name: GI_II, dtype: int64
See data structure above code


### 3.5. GI segment I: Healthy

__Output variable:__

    GI_I  #GI segment I: Healthy
 
__Data structure of GI_II:__

     1  Meets requirement to be segmented into GI_I
     0  Does not meet requirement
    -3  Unknown

In [28]:
data['GI_I'] = np.nan

data.loc[(data.cond_lt == -3)|(data.cond_nlt == -3)|(data.depr == -3), 'GI_I'] = -3

data.loc[(data.cond_nlt == 1)|(data.depr == 1)|(data.cond_lt == 1), 'GI_I'] = 0

data.loc[(data.cond_nlt == 0)&(data.depr == 0)&(data.cond_lt == 0), 'GI_I'] = 1

#Remove individuals who did not participate in the physical measures section of the survey
data.loc[(data.pmwgtr<=0)|(pd.isnull(data.pmwgtr)), 'GI_I'] = np.nan

print("Summary statistics of data.GI_I:")
print(data.GI_I.value_counts().sort_index())
print("See data structure above code")

Summary statistics of data.GI_I:
-3.0       21
 0.0    45633
 1.0     6606
Name: GI_I, dtype: int64
See data structure above code


# 4. Assign Global Impression

Respondents are assigned to the most severe GI segment that they qualify for.

__Output variable:__

    GI  #GI segment assignment
 
__Data structure of GI:__

     5  GI_V
     4  GI_IV
     3  GI_III
     2  GI_II
     1  GI_I
    -3  Unknown
    -6  Deceased

In [29]:
data['GI'] = -3 #np.nan

data.loc[(data.GI_I == 1), 'GI'] = 1
data.loc[(data.GI_II == 1), 'GI'] = 2
data.loc[(data.GI_III == 1), 'GI'] = 3
data.loc[(data.GI_IV == 1), 'GI'] = 4
data.loc[(data.GI_V == 1), 'GI'] = 5

data.loc[(data.pmwgtr<=0)|(pd.isnull(data.pmwgtr)), 'GI'] = np.nan

#Identify deceased individuals
data.loc[(data['alive'] == 5), 'GI'] = -6


print("Summary statistics of data.GI:")
print(data.GI.value_counts().sort_index())
print("See data structure above code")

print(pd.isnull(data.GI).sum())

data.GI.value_counts().sort_index().sum()
data

Summary statistics of data.GI:
-6.0     8116
-3.0       37
 1.0     5947
 2.0     8978
 3.0    22794
 4.0     5589
 5.0     8915
Name: GI, dtype: int64
See data structure above code
0


Unnamed: 0,hhidpn,gender,racohbyr,shlt,effort,going,cesd,vgactx,mdactx,ltactx,...,frailty_weakness,frailty_count,frailty_unkn,frailty,GI_V,GI_IV,GI_III,GI_II,GI_I,GI
0,3010,1.male,3.hrs,3.good,0.no,0.no,0.0,1.every day,4.l-3 per mon,3.1 per week,...,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0,3.0
1,3020,2.female,3.hrs,3.good,0.no,0.no,0.0,2.>1 per week,1.every day,2.>1 per week,...,0,1,0,0.0,0.0,0.0,1.0,0.0,0.0,3.0
2,10004010,1.male,3.hrs,4.fair,0.no,0.no,1.0,4.l-3 per mon,2.>1 per week,3.1 per week,...,0,0,0,0.0,1.0,0.0,1.0,0.0,0.0,5.0
3,10004040,2.female,4.warbabies,2.very good,0.no,0.no,0.0,2.>1 per week,2.>1 per week,2.>1 per week,...,0,0,1,0.0,0.0,0.0,0.0,1.0,0.0,2.0
4,10013010,1.male,3.hrs,3.good,0.no,0.no,2.0,5.never,5.never,5.never,...,0,1,0,0.0,0.0,0.0,1.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60371,920760020,1.male,3.hrs,3.good,1.yes,0.no,3.0,2.>1 per week,2.>1 per week,3.1 per week,...,0,1,0,0.0,0.0,0.0,1.0,0.0,0.0,3.0
60372,923497020,2.female,6.mid babyboomers,3.good,0.no,0.no,1.0,5.never,4.l-3 per mon,3.1 per week,...,0,1,1,0.0,0.0,0.0,1.0,0.0,0.0,3.0
60373,923525010,2.female,6.mid babyboomers,4.fair,1.yes,1.yes,4.0,5.never,3.1 per week,1.every day,...,0,1,1,0.0,1.0,0.0,1.0,0.0,0.0,5.0
60374,923525020,1.male,6.mid babyboomers,2.very good,1.yes,0.no,1.0,1.every day,2.>1 per week,3.1 per week,...,0,1,1,0.0,0.0,1.0,0.0,0.0,1.0,4.0


# 5. Calculate weighted Global Impressions segments

In [30]:
#Sum the weights per wave, per GI segment
data_w = data.groupby(['wave','GI'])[['pmwgtr']].sum()
data_w = data_w.reset_index()

#rename wave to specify that it is the final wave individual participated in
data_w.rename(columns = {'pmwgtr':'GI_weight'}, inplace = True)

#Remove deceased state from GI segments since their weight is always 0
data_w = data_w[data_w.GI!=-6]

# 6. Global Impressions Segmentation Statistics

In [31]:
print("GI segments of respondents in wave 8")
data[data.wave==8].GI.value_counts().sort_index()

GI segments of respondents in wave 8


-3.0       9
 1.0     816
 2.0    1387
 3.0    3114
 4.0     613
 5.0    1228
Name: GI, dtype: int64

In [32]:
print("GI segments of respondents in wave 9")
data[data.wave==9].GI.value_counts().sort_index()

GI segments of respondents in wave 9


-6.0    1284
-3.0       2
 1.0     613
 2.0    1130
 3.0    2897
 4.0     579
 5.0    1200
Name: GI, dtype: int64

In [33]:
print("GI segments of respondents in wave 10")
data[data.wave==10].GI.value_counts().sort_index()

GI segments of respondents in wave 10


-6.0    1600
-3.0       6
 1.0    1148
 2.0    1563
 3.0    3636
 4.0     746
 5.0    1366
Name: GI, dtype: int64

In [34]:
print("GI segments of respondents in wave 11")
data[data.wave==11].GI.value_counts().sort_index()

GI segments of respondents in wave 11


-6.0    1199
-3.0       7
 1.0     859
 2.0    1382
 3.0    3321
 4.0    1041
 5.0    1332
Name: GI, dtype: int64

In [35]:
print("GI segments of respondents in wave 12")
data[data.wave==12].GI.value_counts().sort_index()

GI segments of respondents in wave 12


-6.0    1341
-3.0       6
 1.0     752
 2.0    1206
 3.0    3221
 4.0     948
 5.0    1341
Name: GI, dtype: int64

In [36]:
print("GI segments of respondents in wave 13")
data[data.wave==13].GI.value_counts().sort_index()

GI segments of respondents in wave 13


-6.0    1477
-3.0       4
 1.0     982
 2.0    1222
 3.0    3511
 4.0     850
 5.0    1300
Name: GI, dtype: int64

In [37]:
print("GI segments of respondents in wave 14")
data[data.wave==14].GI.value_counts().sort_index()

GI segments of respondents in wave 14


-6.0    1215
-3.0       3
 1.0     777
 2.0    1088
 3.0    3094
 4.0     812
 5.0    1148
Name: GI, dtype: int64

# 7. Export to csv

#### Write "data_GI.csv"

Write dataset to be used in 03_Complicating_Factors.ipynb

In [38]:
filepath_data_GI = os.path.join(global_path, "data_GI.csv")

data.to_csv(filepath_data_GI, index=False)