In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Move to the project root so the relative 'data/...' paths used below resolve.
# The location can be overridden per machine through the CADAS_1066_DIR
# environment variable; when it is unset the original hard-coded path is used.
os.chdir(os.environ.get('CADAS_1066_DIR', '/Users/chrissoria/Documents/Research/CADAS_1066/'))

In [3]:
# Load the baseline Stata file, keeping the stored numeric codes instead of
# converting labelled values to pandas categoricals.
df = pd.read_stata('data/1066_Baseline_data.dta', convert_categoricals=False)

# Normalise every column name to lower case.
df.columns = [column_name.lower() for column_name in df.columns]

# Keep a CSV copy of the lower-cased data (the row index is written as well).
df.to_csv('data/1066_Baseline_data.csv')

# Variable match list: 10/66 algorithm variables, position-aligned with
# CADAS_match below (entry i of one list corresponds to entry i of the other).
variable_list = [
    'pencil', 'watch', 'chair', 'shoes', 'knuckle', 'elbow', 'should',
    'bridge', 'hammer', 'pray', 'chemist', 'repeat', 'town', 'chief',
    'street', 'store', 'address', 'longmem', 'month', 'day', 'year',
    'season', 'nod', 'point', 'circle', 'pentag', 'animals', 'wordimm',
    'worddel', 'paper', 'story', 'learn1', 'learn2', 'learn3', 'recall',
]

# NOTE(review): 'c_62,' and 'c_33,' carry a trailing comma inside the string,
# and the run 'c_3', 'c_5', 'c_61' appears twice in a row — both look like
# possible typos / misalignment; confirm against the CADAS codebook.
CADAS_match = [
    'c_24', 'c_25', 'c_48', 'c_49', 'c_50', 'c_51', 'c_52', 'c_53', 'c_54',
    'c_55', 'c_56', 'c_26', 'c_8',
    'c_70 (CU/DR)/c_71(PR)', 'c_58', 'c_59', 'c_60', 'c_3', 'c_5', 'c_61',
    'c_3', 'c_5', 'c_61', 'c_62,', 'c_72_1', 'c_32', 'c_40',
    'sum of c_11-13', 'sum of c_21-23', 'sum of c_27-29',
    'sum of c_66', 'c_33,', 'c_34', 'c_35', 'c_63',
]

# Write the two lists side by side as a two-column lookup table.
data = pd.DataFrame({'Variable': variable_list, 'CADAS_Match': CADAS_match})
data.to_csv('../CADAS/10_66_algo_var_match.csv', index=False)

df.head()

Unnamed: 0,centreid,houseid,particid,houseid2,countryid,region,rural,date,interid,age,...,nparks,parkinsonism,walkdiff,vuln_inc,vuln_live1,chilocal,relweekly,frweekly,popvar,surveyok
0,1.0,1001.0,1.0,101001.0,1.0,1.0,0.0,2003-09-05 00:00:00,1.0,72.0,...,4.0,1.0,1.0,3.0,1.0,1.0,1.0,0.0,1.0,1.0
1,1.0,1001.0,2.0,101001.0,1.0,1.0,0.0,2003-10-07 00:00:00,1.0,77.0,...,4.0,1.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1002.0,1.0,101002.0,1.0,1.0,0.0,2003-09-01 00:00:00,1.0,65.0,...,3.0,0.0,0.0,3.0,3.0,0.0,0.0,1.0,1.0,1.0
3,1.0,1003.0,1.0,101003.0,1.0,1.0,0.0,2003-09-02 00:00:00,1.0,87.0,...,3.0,0.0,0.0,3.0,3.0,0.0,,1.0,1.0,1.0
4,1.0,1003.0,2.0,101003.0,1.0,1.0,0.0,2003-09-02 00:00:00,1.0,85.0,...,2.0,0.0,0.0,2.0,3.0,0.0,,0.0,1.0,1.0


Below, we sort the data by house ID and participant ID.

In [4]:
# Order rows by house id first, then by participant id within each house.
sort_keys = ['houseid', 'particid']
df = df.sort_values(by=sort_keys)

Next, we want to pull out the name recall scores. What does this tell me?

According to the table below, 6650 people were able to at least repeat the name of the interviewer once.

In [5]:
# nametot is 1 when either name item has a positive score, otherwise 0
# (a missing value never compares greater than 0, so it counts as 0).
# 'name' is c_0 in CADAS — presumably the item where a name is repeated (TODO confirm).
name_repeated = df['name'] > 0
# 'nrecall' is c_65 in CADAS — presumably the name recall from the cognitive (TODO confirm).
name_recalled = df['nrecall'] > 0

df['nametot'] = (name_repeated | name_recalled).astype('int64')

print(df['nametot'].value_counts())

1    6650
0     183
Name: nametot, dtype: int64


Next, it looks like we're calculating a score based on different tests in the cognitive

In [6]:
# Count, per row, how many of these items are scored exactly 1.
# Comparing against 1 (instead of summing the raw values) means that any other
# code still present in these columns at this point, or a missing value, adds
# nothing to the count.
count_items = ['pencil', 'watch', 'chair', 'shoes', 'knuckle', 'elbow', 'should', 'bridge', 'hammer',
               'pray', 'chemist', 'repeat', 'town', 'chief', 'street', 'store', 'address', 'longmem',
               'month', 'day', 'year', 'season', 'nod', 'point', 'circle', 'pentag']

df['count'] = df[count_items].eq(1).sum(axis=1)

Next, we recode all missing values to 0 so that the algorithm doesn't break (though a better solution may be possible).

In [7]:
# Replace missing values (NaN) with 0 in every column listed below.
columns_to_replace_sysmis = [
    'animals', 'wordimm', 'worddel', 'paper', 'story',
    'learn1', 'learn2', 'learn3', 'recall',
    'pencil', 'watch', 'chair', 'shoes', 'knuckle', 'elbow', 'should',
    'bridge', 'hammer', 'pray', 'chemist', 'repeat',
    'town', 'chief', 'street', 'store', 'address', 'longmem',
    'month', 'day', 'year', 'season',
    'nod', 'point', 'circle', 'pentag',
    'nametot', 'nrecall',
]

for col in columns_to_replace_sysmis:
    df[col] = df[col].fillna(0)

Recoding 9s and 99s to 0. I'll have to look at the codebook, but I think these may be "no sabe" (don't know) codes.
However, neither value appears in paper, story, worddel, or wordimm, and there is no 99 in any of the columns. This code is fairly superfluous for this data, but maybe not for ours.

In [8]:
# Columns in which the code 99 is turned into 0.
columns_to_replace_99 = ['animals', 'wordimm', 'worddel', 'paper', 'story']

# Columns in which the code 9 is turned into 0.
columns_to_replace_9 = ['wordimm', 'worddel', 'paper', 'story']

# All the 99 replacements run first, then the 9 replacements.
for sentinel, target_columns in ((99, columns_to_replace_99), (9, columns_to_replace_9)):
    for col in target_columns:
        df[col] = df[col].replace(sentinel, 0)

In [9]:
columns_to_recode = ['learn1', 'learn2', 'learn3', 'recall']

for col in columns_to_recode:
    # The code 11 collapses to a score of 1.
    df[col] = df[col].replace(11, 1)
    # For every score s from 2 to 9, the codes s*10 and s*10 + 1 collapse to s
    # (20/21 -> 2, 30/31 -> 3, ..., 90/91 -> 9).
    for score in range(2, 10):
        df[col] = df[col].replace({score * 10: score, score * 10 + 1: score})
    # 99 becomes a missing value (NaN is the pandas stand-in for sysmis).
    df[col] = df[col].replace(99, np.nan)

Let's recode any value greater than 1 and less than 10 to a missing value. In other words, this removes anything that isn't "correct" or "incorrect" (such as "no pudo", errors, etc.). Really, the "less than or equal to 9" condition is superfluous; it could simply be any number of 2 or more for this set of questions. However, we will leave it, since that's what the original code has.

In [10]:
columns_to_recode = [
    'name',
    'pencil', 'watch', 'chair', 'shoes', 'knuckle', 'elbow', 'should',
    'bridge', 'hammer', 'pray', 'chemist', 'repeat',
    'town', 'chief', 'street', 'store', 'address', 'longmem',
    'month', 'day', 'year', 'season',
    'nod', 'point', 'circle', 'pentag',
]


def _blank_codes_2_to_9(value):
    """Return NaN for a value between 2 and 9 inclusive; otherwise return the value unchanged."""
    if 2 <= value <= 9:
        return np.nan
    return value


for col in columns_to_recode:
    df[col] = df[col].apply(_blank_codes_2_to_9)

Next, let's assume that any value greater than the limit in the second list (matched by position to the variable in the first list) is an error, and treat it as missing.

In [11]:
# Each variable is paired by position with the largest value accepted for it;
# anything strictly above that limit is turned into NaN.
# NOTE(review): 'immed' has to exist in the loaded data for this cell to run,
# and it is reassigned further down as learn1 + learn2 + learn3, which replaces
# the values filtered here — confirm that this ordering is intended.
greater_than_var = ['animals', 'wordimm', 'worddel', 'paper', 'story', 'recall', 'immed', 'nrecall']
greater_than_number = [45, 3, 3, 3, 6, 10, 29, 1]

for position, col in enumerate(greater_than_var):
    ceiling = greater_than_number[position]
    df[col] = df[col].apply(lambda score, ceiling=ceiling: np.nan if score > ceiling else score)

Now, we will divide the scores by the possible perfect score in order to get a number that's 1 or less. Question for Will: why is the animals question divided by 23 when it's possible to give an answer above that? We just recoded it to allow values up to 45 (and there are some 40s in there).

In [12]:
# Source column, its divisor and the name of the resulting column are matched
# by position across the three lists.
divide_var = ['animals', 'wordimm', 'worddel', 'paper', 'story']
divisor = [23, 3, 3, 3, 6]
new_column = ['animtot', 'wordtot1', 'wordtot2', 'papertot', 'storytot']

for new, col, num in zip(new_column, divide_var, divisor):
    df[new] = df[col].div(num)

Below, we will calculate the global cognitive score 

In [13]:
# Global cognitive score: the seven components are added left to right
# (a missing component makes the row's total missing) and the total is
# scaled by 1.03125.
cogscore_components = ['nametot', 'count', 'animtot', 'wordtot1',
                       'wordtot2', 'papertot', 'storytot']

component_total = df[cogscore_components[0]]
for component in cogscore_components[1:]:
    component_total = component_total + df[component]

df['cogscore'] = 1.03125 * component_total

# Report the observed range as a quick sanity check.
min_value = df['cogscore'].min()
max_value = df['cogscore'].max()

print(f"Range of cogscore: {min_value} to {max_value}")

Range of cogscore: 0.0 to 33.76222826086956


Next, an immediate recall score

In [14]:
# Immediate recall: the three learning trials added together
# (missing in any trial makes the total missing).
df['immed'] = df['learn1'].add(df['learn2']).add(df['learn3'])

language expression

In [15]:
# Language expression: bridge + hammer + pray + chemist, added left to right.
langexpr_total = df['bridge']
for item in ('hammer', 'pray', 'chemist'):
    langexpr_total = langexpr_total + df[item]
df['langexpr'] = langexpr_total

language comprehension

In [16]:
# Language comprehension: nod + point.
df['langcomp'] = df['nod'].add(df['point'])

orientation in time

In [17]:
# Orientation in time: month + day + year + season, added left to right.
orientti_total = df['month']
for item in ('day', 'year', 'season'):
    orientti_total = orientti_total + df[item]
df['orientti'] = orientti_total

orientation in space

In [18]:
# Orientation in space: town + street + store + address.
df['orientsp'] = df['town'].add(df['street']).add(df['store']).add(df['address'])

object name identification

In [19]:
# Object naming: the seven naming items added left to right.
objname_items = ['pencil', 'watch', 'chair', 'shoes', 'knuckle', 'elbow', 'should']
objname_total = df[objname_items[0]]
for item in objname_items[1:]:
    objname_total = objname_total + df[item]
df['objname'] = objname_total

memory (combined delayed and immediate)

In [20]:
# Memory (delayed and immediate combined): worddel + wordimm + nrecall + story.
df['mem'] = df['worddel'].add(df['wordimm']).add(df['nrecall']).add(df['story'])

a language score (combining language expression and comprehension)

In [21]:
# Language score: expression plus comprehension.
df['language'] = df['langexpr'].add(df['langcomp'])

finally, an overall orientation score

In [22]:
# Overall orientation: time + space + the 'chief' item.
df['orientat'] = df['orientti'].add(df['orientsp']).add(df['chief'])

Next, we perform the imputation (based on a linear regression) \
pred_recall=0.344×immed−0.339 \
 \
Coefficient of immed (0.344): This value represents the weight or importance of the immed variable in predicting recall. Specifically, for every one-unit increase in immed, the predicted recall score increases by approximately 0.344 units, all else being constant. \
 \
Constant Term (-0.339): This is the y-intercept, i.e. the baseline value of predicted recall when immed is zero. This means that if someone has a short-term memory score (immed) of 0, their predicted recall would be -0.339 (the code below floors predictions at 0 and caps them at 10) \
 \
The equation assumes that there's a linear relationship between short-term memory (immed) and the variable we are trying to impute (likely some measure of recall). This relationship is derived from observed data where both variables are known. \
Basically, we are using the predictive power of immediate recall questions to fill in the blanks

In [23]:
# Linear prediction of recall from immediate recall, bounded to [0, 10].
raw_prediction = (.344 * df['immed']) - .339
df['pred_recall'] = raw_prediction.clip(lower=0, upper=10)

# Report the observed range as a quick sanity check.
min_value = df['pred_recall'].min()
max_value = df['pred_recall'].max()

print(f"Range of pred_recall: {min_value} to {max_value}")

Range of pred_recall: 0.0 to 9.636999999999999


In [24]:
# Frequency of each recall value, with missing values listed as their own row.
recall_counts = df['recall'].value_counts(dropna=False)
print(recall_counts)

5.000     1270
4.000     1203
3.000      924
6.000      918
7.000      580
2.000      574
0.000      553
1.000      333
8.000      303
9.000      106
10.000      54
3.445        2
4.477        2
5.509        2
4.821        2
3.789        1
2.413        1
4.133        1
5.853        1
7.573        1
7.917        1
5.165        1
Name: recall, dtype: int64


In [25]:
# Preserve the recall values as they stand before imputation.
df['recall_original'] = df['recall']

# Flag missing recall values with the placeholder 999.
df['recall'] = df['recall'].fillna(999)

print(df['recall'].value_counts(dropna=False))

5.000     1270
4.000     1203
3.000      924
6.000      918
7.000      580
2.000      574
0.000      553
1.000      333
8.000      303
9.000      106
10.000      54
3.445        2
4.477        2
5.509        2
4.821        2
3.789        1
2.413        1
4.133        1
5.853        1
7.573        1
7.917        1
5.165        1
Name: recall, dtype: int64


Below, we take the missing values, which we recoded to 999, and slot in the predicted value for recall based on the regression above. After this step, every row whose recall was flagged as missing carries its predicted recall instead.

In [26]:
# Rows carrying the 999 placeholder take the prediction from their own row.
# A row-aligned `where` is used because returning df['pred_recall'] from inside
# a per-element apply hands back the entire column for each flagged row rather
# than a single value.
needs_imputation = df['recall'].eq(999)
df['recall'] = df['recall'].where(~needs_imputation, df['pred_recall'])

# Any value still above 11 is treated as missing.
df['recall'] = df['recall'].mask(df['recall'] > 11)

# Report the observed range as a quick sanity check.
min_value = df['recall'].min()
max_value = df['recall'].max()

print(f"Range of recall: {min_value} to {max_value}")

Range of recall: 0.0 to 10.0
