In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
os.chdir('/Users/chrissoria/Documents/Research/CADAS_1066/')

In [3]:
df = pd.read_stata('data/1066_Baseline_data.dta', convert_categoricals=False)

df.columns = df.columns.str.lower()

variables = ["mental", "activ", "memory", "put", "kept", "frdname", "famname", "convers", 
            "wordfind", "wordwrg", "past", "lastsee", "lastday", "orient", "lostout", 
            "lostin", "chores", "hobby", "money", "change", "reason", "feed", "dress", "toilet"]

CADAS = []

df.head()

Unnamed: 0,centreid,houseid,particid,houseid2,countryid,region,rural,date,interid,age,...,nparks,parkinsonism,walkdiff,vuln_inc,vuln_live1,chilocal,relweekly,frweekly,popvar,surveyok
0,1.0,1001.0,1.0,101001.0,1.0,1.0,0.0,2003-09-05 00:00:00,1.0,72.0,...,4.0,1.0,1.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0
1,1.0,1001.0,2.0,101001.0,1.0,1.0,0.0,2003-10-07 00:00:00,1.0,77.0,...,4.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1002.0,1.0,101002.0,1.0,1.0,0.0,2003-09-01 00:00:00,1.0,65.0,...,3.0,0.0,0.0,3.0,3.0,0.0,0.0,1.0,1.0,1.0
3,1.0,1003.0,1.0,101003.0,1.0,1.0,0.0,2003-09-02 00:00:00,1.0,87.0,...,3.0,0.0,0.0,3.0,3.0,0.0,,1.0,1.0,1.0
4,1.0,1003.0,2.0,101003.0,1.0,1.0,0.0,2003-09-02 00:00:00,1.0,85.0,...,2.0,0.0,0.0,2.0,3.0,0.0,,0.0,1.0,1.0


First, we recode this set of variable's missing values to 9 /
From line 6-9 and 62-67

In [4]:
recode_9 = ["mental", "activ", "memory", "put", "kept", "frdname", "famname", "convers", 
            "wordfind", "wordwrg", "past", "lastsee", "lastday", "orient", "lostout", 
            "lostin", "chores", "hobby", "money", "change", "reason", "feed", "dress", "toilet"]

for var in recode_9:
    df[var] = df[var].fillna(9)

Then we count how many missing values there are in the bunch for each individual

In [5]:
miss1_variables = ["mental", "activ", "memory", "put", "kept", "frdname", "famname", "convers", 
            "wordfind", "wordwrg", "past", "lastsee", "lastday", "orient", "lostout", 
            "lostin", "chores", "hobby", "money", "change", "reason"]

df['miss1'] = df[variables].apply(lambda x: (x == 9).sum(), axis=1)

print(len(miss1_variables))
print(df['miss1'].value_counts())

21
0     6755
24      34
1       22
3       10
2        4
4        2
6        1
5        1
13       1
9        1
7        1
21       1
Name: miss1, dtype: int64


In [6]:
miss3_variables = ["feed", "dress", "toilet"]

df['miss3'] = df[variables].apply(lambda x: (x == 9).sum(), axis=1)

print(df['miss3'].value_counts())

0     6755
24      34
1       22
3       10
2        4
4        2
6        1
5        1
13       1
9        1
7        1
21       1
Name: miss3, dtype: int64


Then we multiple Miss3 times 3 and add it to Miss 1 \
Why?

In [7]:
df['misstot'] = (df['miss3']*3) + df['miss1']

print(df['misstot'].value_counts())

0     6755
96      34
4       22
12      10
8        4
16       2
24       1
20       1
52       1
36       1
28       1
84       1
Name: misstot, dtype: int64


Next, in this set of variables we half the score of 1 to .5 and 2 to 1.

In [8]:
recode_half = ["put", "kept", "frdname", "famname", "convers", "wordfind", "wordwrg", "past", 
            "lastsee", "lastday", "orient", "lostout", "lostin", "chores", "change", "money"]

for var in recode_half:
    df[var] = df[var].apply(lambda x: 0.5 if x == 1 else (1 if x == 2 else x))

Now, we will save an original version of these variables so that we can manipulate a new version and not lose the old version \
Then we will recode all values of dress to 0 if dressdis equals 1

In [9]:
columns_to_update = ["dress", "chores", "feed", "toilet"]
disability_flags = ["dressdis", "choredis", "feeddis", "toildis"]

for col, dis_flag in zip(columns_to_update, disability_flags):
    df[f"{col}_original"] = df[col]
    df[col] = df.apply(lambda row: 0 if row[dis_flag] == 1 else row[col], axis=1)

In [10]:
s = (
    df['activ'] + df['mental'] + df['memory'] + df['put'] + df['kept'] + df['frdname'] + 
    df['famname'] + df['convers'] + df['wordfind'] + df['wordwrg'] + df['past'] + 
    df['lastsee'] + df['lastday'] + df['orient'] + df['lostout'] + df['lostin'] + 
    df['chores'] + df['hobby'] + df['money'] + df['change'] + df['reason'] + 
    df['feed'] + df['dress'] + df['toilet']
)



df['relscore'] = (30 / (30 - df['misstot'])) * s - ((df['miss1'] + df['miss3']) * 9)

Now we will add all of these components together to create a 'relative score' \
Why 30? Because 30 is the maximum amount of possible missing for the Misstot variable \
Essentially this is creating an inverse weighting based on the amount of missing data. If very little data is missing (misstot is small), the adjustment is close to 1 and does not adjust. \
Then we multiple this inverse weighting to the sum of variable in the s \
Finally, we apply a penalty based on how many missing responses there are multiplied by nine \

$$
\text{relscore} = \left( \frac{30}{30 - \text{misstot}} \right) \times \text{S} - (\text{miss1} + \text{miss3}) \times 9
$$

where S is the main score of interest \
$$
\text{S} = \sum_{i=1}^{|S|} \mathbf{v}_i
$$

miss1 is the sum of missing values from variables 1:18 in S

$$
\text{miss1} = \sum_{i=1}^{|Q|} I(\mathbf{v}_i = 9)
$$

mis3 is the sum of missing values from variables 19:21 in S

$$
\text{miss3} = \sum_{i=1}^{|T|} I(\mathbf{v}_i = 9)
$$

and, misstot is 3 times miss3 plus miss1 \
$$
\text{misstot} = 3 \times \text{miss3} + \text{miss1}
$$

and everything all together:

$$
\text{relscore} = \left( \frac{30}{30 - \left[3 \times \sum_{i=1}^{|T|} I(\mathbf{v}_i = 9) + \sum_{i=1}^{|Q|} I(\mathbf{v}_i = 9)\right]} \right) \times \sum_{i=1}^{|S|} \mathbf{v}_i - \left[\sum_{i=1}^{|Q|} I(\mathbf{v}_i = 9) + \sum_{i=1}^{|T|} I(\mathbf{v}_i = 9)\right] \times 9
$$

Essentially what this is doing is inverse weighting the S vector (which is the sum of various cognitive measures) to reduce the weight of cases that have a lot of missingness in the s vector, and then applying an additional penalty on the S vector based on how many missing. \

relscore = A score of cognitive abilities adjusting for missing responses.

Next, we'll recode this set of variables and double the value

In [11]:
recode_double = ["put", "kept", "frdname", "famname", "convers", "wordfind", "wordwrg", "past", 
             "lastsee", "lastday", "orient", "lostout", "lostin", "chores", "change", "money"]

for var in recode_double:
    df[var] = df[var].apply(lambda x: 1 if x == 0.5 else (2 if x == 1 else x))

below we convert all the 9's back to missing values (a step that is uneccesarry if we were to just create new versions for the algo)

In [12]:
recode_9 = ["mental", "activ", "memory", "put", "kept", "frdname", "famname", "convers", 
            "wordfind", "wordwrg", "past", "lastsee", "lastday", "orient", "lostout", 
            "lostin", "chores", "hobby", "money", "change", "reason", "feed", "dress", "toilet"]

for var in recode_9:
    df[var] = df[var].apply(lambda x: np.nan if x == 9 else x)

THIS CORRECTS A GLITCH WHEREBY IF THE WHOLE CSI'D' INFORMANT INTERVIEW WAS MISSING A RELSCORE OF 0 WAS RETURNED

In [13]:
df['relscore'] = df.apply(lambda row: np.nan if row['misstot'] >= 29 else row['relscore'], axis=1)

below is the total srq score

In [14]:
df['srqtot'] = 0

for i in range(1, 21):
    df['srqtot'] += df[f'srq{i}']

In [15]:
df['srqcase'] = df['srqtot'].apply(lambda x: 0 if 0 <= x <= 7 else (1 if x > 7 else np.nan))

below we are counting both the 9 values and missing values to create a sum of all missing for the srq variables

In [16]:
srqlist = ["srq1", "srq2", "srq3", "srq4", "srq5", "srq6", "srq7", "srq8", 
            "srq9", "srq10", "srq11", "srq12", "srq13", "srq14", "srq15", 
            "srq16", "srq17", "srq18", "srq19", "srq20"]

df['srqmiss'] = (df[srqlist].apply(lambda x: (x == 9).sum(), axis=1)) + (df[srqlist].apply(lambda x: (x == np.nan).sum(), axis=1))

below we recode values where srqmiss are greater than 10 in srqtot and srqcase to missing (adjusting the total score).

In [17]:
df['srqtot'] = df.apply(lambda row: np.nan if row['srqmiss'] >= 11 else row['srqtot'], axis = 1)

df['srqcase'] = df.apply(lambda row: np.nan if row['srqmiss'] >= 11 else row['srqcase'], axis = 1)

below we calculate the ZARIT CAREGIVER BURDEN \
First we count the missings in the zb columns \
Then we add up the 1's and inverse weight the total score for the zb columns \
Then we replace the 9's with na again.

In [18]:
zb_columns = [f'zb{i}' for i in range(1, 23)]
df['zbmiss'] = df[zb_columns].apply(lambda x: (x == 9).sum(), axis=1) #count the missing

df['zbtot'] = (22 / (22 - df['zbmiss'])) * (df[zb_columns].sum(axis=1) - (df['zbmiss'] * 9))

df[zb_columns] = df[zb_columns].replace(9, np.nan)

CAREGIVER INCOME

In [19]:
cben_list = ['cben1','cben2','cben3','cben4']

for var in cben_list:
    df[var] = df[var].apply(lambda x: 0 if np.isnan(x) else (0 if x == 999999 else x))

below we are creating variables that we will later fill with 

In [20]:
variables_to_create = [
    'c_family1', 'c_family2', 'c_family3', 'c_family4',
    'c_gov1', 'c_gov2', 'c_gov3', 'c_gov4',
    'c_occup1', 'c_occup2', 'c_occup3', 'c_occup4',
    'c_disab1', 'c_disab2', 'c_disab3', 'c_disab4',
    'c_rent1', 'c_rent2', 'c_rent3', 'c_rent4',
    'c_work1', 'c_work2', 'c_work3', 'c_work4',
    'c_care1', 'c_care2', 'c_care3', 'c_care4',
    'c_oth1', 'c_oth2', 'c_oth3', 'c_oth4'
]

for var in variables_to_create:
    df[var] = 0

bellow, we assign values to the variables created above depending on the values of cbntype \
we assign c_var# the value of c_ben# depending on the cbntype

In [21]:
# For c_familyX
for i in range(1, 5):
    df.loc[df[f'cbntype{i}'] == 4, f'c_family{i}'] = df[f'cben{i}']

# For c_govX
for i in range(1, 5):
    df.loc[df[f'cbntype{i}'] == 1, f'c_gov{i}'] = df[f'cben{i}']

# For c_occupX
for i in range(1, 5):
    df.loc[df[f'cbntype{i}'] == 2, f'c_occup{i}'] = df[f'cben{i}']

# For c_disabX
for i in range(1, 5):
    df.loc[df[f'cbntype{i}'] == 3, f'c_disab{i}'] = df[f'cben{i}']

# For c_rentX
for i in range(1, 5):
    df.loc[df[f'cbntype{i}'] == 5, f'c_rent{i}'] = df[f'cben{i}']

# For c_workX
for i in range(1, 5):
    df.loc[df[f'cbntype{i}'] == 6, f'c_work{i}'] = df[f'cben{i}']

# For c_careX
for i in range(1, 5):
    df.loc[df[f'cbntype{i}'] == 7, f'c_care{i}'] = df[f'cben{i}']

# For c_othX
for i in range(1, 5):
    df.loc[df[f'cbntype{i}'] == 8, f'c_oth{i}'] = df[f'cben{i}']

In [22]:
for i in range(1,5):
    df[f'cbntype{i}'] = df[f'cbntype{i}'].apply(lambda x: np.nan if x == 0 else x)

In [23]:
for i in range(1,5):
    df[f'cben{i}'] = df[f'cben{i}'].apply(lambda x: np.nan if x == 0 else x)

below we sum up the values from the variables created above \
later I will find out what these sums represent

In [24]:
df['c_family'] = df['c_family1'] + df['c_family2'] + df['c_family3'] + df['c_family4']
df['c_gov'] = df['c_gov1'] + df['c_gov2'] + df['c_gov3'] + df['c_gov4']
df['c_occup'] = df['c_occup1'] + df['c_occup2'] + df['c_occup3'] + df['c_occup4']
df['c_disab'] = df['c_disab1'] + df['c_disab2'] + df['c_disab3'] + df['c_disab4']
df['c_rent'] = df['c_rent1'] + df['c_rent2'] + df['c_rent3'] + df['c_rent4']
df['c_work'] = df['c_work1'] + df['c_work2'] + df['c_work3'] + df['c_work4']
df['c_care'] = df['c_care1'] + df['c_care2'] + df['c_care3'] + df['c_care4']
df['c_oth'] = df['c_oth1'] + df['c_oth2'] + df['c_oth3'] + df['c_oth4']

In [25]:
cols_to_replace = ['c_family', 'c_gov', 'c_occup', 'c_disab', 'c_rent', 'c_work', 'c_care', 'c_oth']
df[cols_to_replace] = df[cols_to_replace].fillna(0)

# Compute the sum for cnof_tot, a sum of all varaibles except family
df['cnof_tot'] = df['c_gov'] + df['c_occup'] + df['c_disab'] + df['c_rent'] + df['c_work'] + df['c_care'] + df['c_oth']

# Compute the sum for c_tot, a sum of all variables in the list above
df['c_tot'] = df['cnof_tot'] + df['c_family']

below we compute the tw_dep score

In [26]:
cols_to_count = ['am1', 'pm1', 'eve1', 'nite1', 'am2', 'pm2', 'eve2', 'nite2']
df['twdepmiss'] = df[cols_to_count].isnull().sum(axis=1) + (df[cols_to_count] == 9).sum(axis=1)

df['tw_dep'] = 1.5 * (df['am1'] + df['pm1'] + df['eve1'] + df['nite1'] + df['am2'] + df['pm2'] + df['eve2'] + df['nite2'])

below we compute the tadl score

In [27]:
cols_to_recode_9_to_0 = ['cashrs2', 'cashrs3', 'cashrs4', 'cashrs5', 'cashrs7', 'cashrs8']
df[cols_to_recode_9_to_0] = df[cols_to_recode_9_to_0].replace({np.nan: 0, 9: 0})

cols_to_recode_99_to_0 = ['cashrs1', 'cashrs6']
df[cols_to_recode_99_to_0] = df[cols_to_recode_99_to_0].replace({np.nan: 0, 99: 0})

df['tadl'] = df['cashrs2'] + df['cashrs3'] + df['cashrs4'] + df['cashrs5'] + df['cashrs7'] + df['cashrs8']

below we recode variables to system missing (recoding everything back to na)

In [28]:
recode_dict = {
    'caremar': {0: np.nan, 5: np.nan, 6: np.nan, 7: np.nan, 8: np.nan, 9: np.nan},
    'carerage': {1: np.nan, 2: np.nan, 3: np.nan, 4: np.nan, 5: np.nan, 6: np.nan, 7: np.nan, 8: np.nan, 9: np.nan, 10: np.nan, 99: np.nan},
    'carerrel': {9: np.nan},
    'careeduc': {6: np.nan, 9: np.nan},
    'cjob': {9: np.nan},
    'cjobcat': {0: np.nan, 10: np.nan, 11: np.nan, 12: np.nan, 13: np.nan, 14: np.nan, 15: np.nan},
    'carehelp': {9: np.nan},
    'carelive': {2: np.nan, 3: np.nan, 4: np.nan, 5: np.nan, 6: np.nan, 7: np.nan, 8: np.nan, 9: np.nan},
    'cutback': {9: np.nan, 3: 0},
    'cutwhen': {999: np.nan},
    'am1': {9: np.nan},
    'am2': {9: np.nan},
    'pm1': {9: np.nan},
    'pm2': {9: np.nan},
    'eve1': {9: np.nan},
    'eve2': {9: np.nan},
    'nite1': {9: np.nan},
    'nite2': {9: np.nan},
    'helpweek': {13: np.nan, 14: np.nan, 15: np.nan, 16: np.nan, 17: np.nan, 18: np.nan, 19: np.nan, 20: np.nan},
    'helpjob': {0: np.nan, 9: np.nan},
    'daypaid': {0: np.nan, 9: np.nan},
    'ntpaid': {0: np.nan, 9: np.nan},
    'cashrs2': {4: np.nan, 5: np.nan, 6: np.nan, 7: np.nan, 8: np.nan, 9: np.nan},
    'cashrs3': {4: np.nan, 5: np.nan, 6: np.nan, 7: np.nan, 8: np.nan, 9: np.nan},
    'cashrs4': {4: np.nan, 5: np.nan, 6: np.nan, 7: np.nan, 8: np.nan, 9: np.nan},
    'cashrs5': {4: np.nan, 5: np.nan, 6: np.nan, 7: np.nan, }
}

below we compute the zbcare score and carerev scores \
THIS CORRECTS A GLITCH WHEREBY PARTICIPANTS WERE CODED AS NEEDING NO CARE, BUT CARE SECTION WAS NOT SKIPPED AND CARE NEEDS WERE EVIDENT

In [29]:
# summing values within a range (1 thru Highest) for specific columns
cols_zbcare = ['zb' + str(i) for i in range(1, 23)]
df['zbcare'] = df[cols_zbcare].apply(lambda x: (x >= 1).sum(), axis=1)

cols_othcare = ['cashrs' + str(i) for i in range(1, 8)] + ['cutback', 'carehelp']
df['othcare'] = df[cols_othcare].apply(lambda x: (x >= 1).sum(), axis=1)

# Compute the sum for carerev
df['carerev'] = df['zbcare'] + df['othcare']

# Recoding specific values in carerev column
df['carerev'] = df['carerev'].apply(lambda x: 1 if x >= 1 else x)

# Conditionally recoding values in CARENEED column
df.loc[df['carerev'] == 1, 'careneed'] = df.loc[df['carerev'] == 1, 'careneed'].replace(3, 2)

THIS THEN RECODES CARE SECTION VARIABLES TO SYSMIS WHEN NO NEEDS FOR CARE WERE IDENTIFIED \
need to locate the careneed column

In [30]:
cols_to_recode = ['carewho1', 'carewho2', 'cutback', 'cutwhen', 'cuthour', 'carehelp', 'helphour', 'helpweek', 
                  'helpjob', 'daypaid', 'ntpaid', 'cashrs1', 'cashrs2', 'cashrs3', 'cashrs4', 'cashrs5', 
                  'cashrs6', 'cashrs7', 'cashrs8'] + ['zb' + str(i) for i in range(1, 23)]

# Conditionally recoding values
df.loc[df['careneed'] == 3, cols_to_recode] = np.nan

NPI-Q DISTRESS AND SEVERITY SCORES (this will be in the algo) \
First, we recode all missing values in npisev and npidis to 0 (so that na doesn't prevent the calculations from running)

What is the NPI? \
A measure of behavioural and psychological symptoms

In [38]:
#pulling out one so I can see what it's doing
#this is recoding npi severity and npi distress to 0 if the main npi question is 0
#in other words, when someone answers that the symptom is not present, we treat the following questions about it as 0  
df['npi1sev'] = df.apply(lambda row: 0 if row['npi1'] == 0 and pd.isna(row['npi1sev']) else row['npi1sev'], axis=1)
df['npi1d'] = df.apply(lambda row: 0 if row['npi1'] == 0 and pd.isna(row['npi1d']) else row['npi1d'], axis=1)

#recoding all the values of npi_sev
for i in range(1, 13):
    npi_col = f'npi{i}'
    sev_col = f'npi{i}sev'
    
    df.loc[(df[npi_col] == 0) & df[sev_col].isna(), sev_col] = 0 #here we are setting any missing scores for severity to 0

#recoding all rest of the values the dis columnm
for i in range(2, 13):
    npi_col = f'npi{i}'
    dis_col = f'npi{i}dis'
    
    df.loc[(df[npi_col] == 0) & df[dis_col].isna(), dis_col] = 0

then below we are recoding all 9's to missing

In [None]:
# List of columns to recode
cols_to_recode = ['npi1', 'npi1sev', 'npi1d', 'npi2', 'npi2sev', 'npi2dis', 
                  'npi3', 'npi3sev', 'npi3dis', 'npi4', 'npi4sev', 'npi4dis', 
                  'npi5', 'npi5sev', 'npi5dis', 'npi6', 'npi6sev', 'npi6dis', 
                  'npi7', 'npi7sev', 'npi7dis', 'npi8', 'npi8sev', 'npi8dis', 
                  'npi9', 'npi9sev', 'npi9dis', 'npi10', 'npi10sev', 'npi10dis', 
                  'npi11', 'npi11sev', 'npi11dis', 'npi12', 'npi12sev', 'npi12dis']

# Recode values
df[cols_to_recode] = df[cols_to_recode].replace(9, np.nan)

next, we checking the severity (SEV) of each NPI variable. If the severity is greater than or equal to 1, it's recoding the corresponding NPI variable to 1 if its value is 2 or higher. \
This appears to be cleaning the variable in case any responses are not in the 0,1 binary that they should be in

In [33]:
for i in range(1, 13):  # Looping from 1 to 12
    npi = f'npi{i}'
    sev = f'npi{i}sev'
    
    mask = df[sev] >= 1
    df.loc[mask, npi] = np.where(df.loc[mask, npi] >= 2, 1, df.loc[mask, npi])


#then we recode anything that remian higher than a 2 as na
for var in npi_vars:
    df[var] = df[var].apply(lambda x: np.nan if x > 1 else x)

then below we finally create the npi severity and npi distress scores based on 

In [34]:
# Compute the sum of severity scores
df['npisev'] = df['npi1sev'] + df['npi2sev'] + df['npi3sev'] + df['npi4sev'] + df['npi5sev'] + df['npi6sev'] + df['npi7sev'] + df['npi8sev'] + df['npi9sev'] + df['npi10sev'] + df['npi11sev'] + df['npi12sev']

# Compute the sum of distress scores
df['npidis'] = df['npi1d'] + df['npi2dis'] + df['npi3dis'] + df['npi4dis'] + df['npi5dis'] + df['npi6dis'] + df['npi7dis'] + df['npi8dis'] + df['npi9dis'] + df['npi10dis'] + df['npi11dis'] + df['npi12dis']

In [36]:
df['timeons'] = df['timeons'].apply(lambda x: np.nan if x == 999 else (np.nan if x == 0 else x))

below we recode all 9's back to 9 (might be redundant, but leaving just to be safe)

In [37]:
all_vars = [
    'typeons', 'ons1', 'ons2', 'ons3', 'ons4', 'ons5', 'ons6', 'ons7', 'ons8', 'ons9', 'ons10', 'ons11', 'ons12',
    'ons13', 'ons14', 'ons15', 'ons16', 'ons17', 'ons18', 'ons19', 'ons20', 'ons21', 'ons221', 'fluct', 'fluctcog',
    'fluctoft', 'graddec', 'stepwise', 'steppre1', 'steppre3', 'steppre2', 'steppre4', 'steprec1', 'steprec3', 'steprec2',
    'steprec4', 'clouding', 'confnite', 'confday', 'nocturn', 'bchange', 'bsuspic', 'birrit', 'baccuse', 'bupset', 'bfirst',
    'bvis', 'baud', 'bdelude', 'depress', 'depdur', 'cry', 'crydur', 'wishdie', 'interest', 'anhed', 'sleep', 'eat',
    'bereave', 'berwhen', 'depimp', 'toldbp', 'treatbp', 'cvevent', 'cvtype1', 'cvtype2', 'cvtype3', 'cvtype4', 'affincon',
    'angina', 'intclaud', 'midiag', 'park', 'tremor', 'initiate', 'slow', 'microg', 'heavyalc', 'alctreat', 'alcprob',
    'hypothy', 'hyperthy', 'hi', 'hill', 'loc', 'behchang', 'fitsever', 'longfits', 'earlychg', 'npi1', 'npi1sev', 'npi1d',
    'npi2', 'npi2sev', 'npi2dis', 'npi3', 'npi3sev', 'npi3dis', 'npi4', 'npi4sev', 'npi4dis', 'npi5', 'npi5sev', 'npi5dis',
    'npi6', 'npi6sev', 'npi6dis', 'npi7', 'npi7sev', 'npi7dis', 'npi8', 'npi8sev', 'npi8dis', 'npi9', 'npi9sev', 'npi9dis',
    'npi10', 'npi10sev', 'npi10dis', 'npi11', 'npi11sev', 'npi11dis', 'npi12', 'npi12sev', 'npi12dis', 'hasconf'
]

for var in all_vars:
    df[var] = df[var].apply(lambda x: np.nan if x == 9 else x)

In [None]:
df.loc[df['ons221'] == 0, 'ons221'] = np.nan
df.loc[df['depdur'] == 0, 'depdur'] = np.nan
df.loc[df['cvtype1'] == 0, 'cvtype1'] = np.nan
df.loc[df['cvtype2'] == 0, 'cvtype2'] = np.nan
df.loc[df['cvtype3'] == 0, 'cvtype3'] = np.nan
df.loc[df['cvtype4'] == 0, 'cvtype4'] = np.nan
df.loc[df['loc'] == 0, 'loc'] = np.nan
df.loc[df['berwhen'] == 0, 'berwhen'] = np.nan

df.loc[df['fluctoft'] >= 5, 'fluctoft'] = np.nan

for col in ['cvtype1', 'cvtype2', 'cvtype3', 'cvtype4', 'loc']:
    df.loc[df[col] >= 3, col] = np.nan

for col in ['affincon', 'earlychg']:
    df.loc[df[col] >= 2, col] = np.nan

for col in ['step1', 'step3', 'step2', 'step4', 'fallsno']:
    df.loc[df[col] == 99, col] = np.nan

for col in ['cvdate1', 'cvdate2', 'cvdate3', 'cvdate4', 'alcpast', 'alcnow']:
    df.loc[df[col] == 999, col] = np.nan