In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [22]:
# Read the full dataset from disk. low_memory=False makes pandas parse the file
# in one pass instead of in chunks, so each column's dtype is inferred from all
# of its values rather than chunk by chunk.
dat = pd.read_csv('data/data.csv', low_memory=False)

## Subset 31 questions & process data

In [23]:
# Zero-based column positions in `dat`, grouped by domain.
mental_ix = [*range(25, 30), 83, 105]
home_ix = [*range(88, 91), *range(98, 102), 103]
social_ix = [13, 15, 18, *range(19, 25), 87, 102, 104]
health_ix = [*range(75, 78), 84]

# De-duplicated union of every domain's positions, in ascending order.
all_indices = sorted({*mental_ix, *home_ix, *social_ix, *health_ix})

# Select those columns by position; .copy() detaches the subset from `dat`.
dat1 = dat.iloc[:, all_indices].copy()

In [24]:
# Number of rows in dat1 that are missing a value in at least one column.
num_rows_with_na = (
    dat1
    .isna()
    .any(axis="columns")
    .sum()
)
num_rows_with_na

np.int64(10900)

In [25]:
# Keep complete cases only: a row is removed as soon as any column is missing.
dat_cleaned = dat1.dropna(how="any", axis="index")
dat_cleaned.shape

(9203, 31)

In [26]:
dat_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9203 entries, 1 to 11426
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Q14     9203 non-null   float64
 1   Q16     9203 non-null   float64
 2   Q19     9203 non-null   float64
 3   Q20     9203 non-null   float64
 4   Q21     9203 non-null   float64
 5   Q22     9203 non-null   float64
 6   Q23     9203 non-null   float64
 7   Q24     9203 non-null   float64
 8   Q25     9203 non-null   float64
 9   Q26     9203 non-null   float64
 10  Q27     9203 non-null   float64
 11  Q28     9203 non-null   float64
 12  Q29     9203 non-null   float64
 13  Q30     9203 non-null   float64
 14  Q76     9203 non-null   float64
 15  Q77     9203 non-null   float64
 16  Q78     9203 non-null   float64
 17  Q84     9203 non-null   float64
 18  Q85     9203 non-null   float64
 19  Q88     9203 non-null   float64
 20  Q89     9203 non-null   float64
 21  Q90     9203 non-null   float64
 22  Q91 

In [27]:
# Convert every column to an integer dtype. Rows containing missing values were
# already removed by the dropna step, so NaN cannot make the cast fail.
dat_cleaned = dat_cleaned.astype(int)

### Recode necessary columns

In [28]:
def recode_binary(df, cols):
    """Return a copy of ``df`` with ``cols`` collapsed to 0/1 indicators.

    A value equal to 1 becomes 1; every other value (including a missing
    value) becomes 0. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to recode.
    cols : iterable of str
        Labels of the columns to recode.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which each listed column is an integer 0/1 column.

    Raises
    ------
    KeyError
        If a label in ``cols`` is not a column of ``df``.
    """
    df = df.copy()
    for col in cols:
        # Vectorised comparison instead of a per-element Python lambda; it
        # also yields an integer column when the frame has no rows.
        df[col] = (df[col] == 1).astype(int)
    return df

In [29]:
def reverse_recode(df, cols, bounds=None):
    """Return a copy of ``df`` with the scale direction of ``cols`` flipped.

    Every value ``v`` in a listed column becomes ``(low + high) - v``, so the
    lowest scale point swaps with the highest. The input frame is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to reverse.
    cols : iterable of str
        Labels of the columns to reverse.
    bounds : dict, optional
        Maps a column label to its ``(low, high)`` scale end points. Columns
        not present in the mapping (all of them when ``bounds`` is None) fall
        back to the minimum and maximum observed in the data. Supply the end
        points whenever an extreme category may be absent from the data,
        because the observed range would then shift the reversed values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns reversed.

    Raises
    ------
    KeyError
        If a label in ``cols`` is not a column of ``df``.
    """
    bounds = bounds or {}
    df = df.copy()
    for col in cols:
        if col in bounds:
            low, high = bounds[col]
        else:
            # Original behaviour: take the end points from the data itself.
            low, high = df[col].min(), df[col].max()
        df[col] = (low + high) - df[col]
    return df

In [30]:
# Yes/No items: raw coding is 1 = Yes, 2 = No; target coding is 1 = Yes, 0 = No.
binary_cols = (
    ['Q26', 'Q27', 'Q28', 'Q106']            # mental-domain items
    + ['Q100', 'Q101', 'Q102']               # home-domain items
    + ['Q19', 'Q24', 'Q25', 'Q88', 'Q105']   # social-domain items
)

# Ordinal items whose scale direction has to be reversed.
ordinal_cols = (
    ['Q99', 'Q104']                          # home-domain items
    + ['Q76', 'Q77', 'Q78', 'Q85']           # health-domain items
)

In [31]:
# Step 1: collapse the Yes/No items to 0/1 indicators.
dat_final1 = recode_binary(df=dat_cleaned, cols=binary_cols)

# Step 2: flip the direction of the ordinal items on the step-1 result.
dat_final = reverse_recode(df=dat_final1, cols=ordinal_cols)

In [32]:
# Special-case recodes. Both columns are mapped from dat_cleaned (the values
# before any recoding) so the cell is idempotent: mapping dat_final['Q30'] onto
# itself would, on a second run, turn the already recoded 0 into NaN and 1 into 0.
# Q30 is not touched by the binary/ordinal recodes, so the first-run result is
# the same as before.

# Q30: 1 -> 0, 3 -> 1, 2 -> 2. Codes absent from the mapping become NaN.
dat_final['Q30'] = dat_cleaned['Q30'].astype(int).map({1: 0, 3: 1, 2: 2})

# Q85: non-monotonic ("U") recode that replaces the plain reversal applied to
# Q85 through ordinal_cols. Codes absent from the mapping become NaN.
dat_final['Q85'] = dat_cleaned['Q85'].astype(int).map({
    1: 3,
    2: 2,
    3: 1,
    4: 0,
    5: 0,
    6: 0,
    7: 1
})

#### Checking that the recodes were applied correctly

In [33]:
# Binary recode check: Q27 counts before (dat_cleaned) and after (dat_final)

print(
    dat_cleaned['Q27'].value_counts(),
    dat_final['Q27'].value_counts(),
    sep='\n',
)

Q27
2    7143
1    2060
Name: count, dtype: int64
Q27
0    7143
1    2060
Name: count, dtype: int64


In [34]:
# Ordinal reverse recode check: Q104 counts before (dat_cleaned) and after (dat_final)

print(
    dat_cleaned['Q104'].value_counts(),
    dat_final['Q104'].value_counts(),
    sep='\n',
)

Q104
5    4520
4    3299
3     792
2     362
1     230
Name: count, dtype: int64
Q104
1    4520
2    3299
3     792
4     362
5     230
Name: count, dtype: int64


In [35]:
# Special case 1 check: Q30 counts before (dat_cleaned) and after (dat_final)

print(
    dat_cleaned['Q30'].value_counts(),
    dat_final['Q30'].value_counts(),
    sep='\n',
)

Q30
1    8290
3     717
2     196
Name: count, dtype: int64
Q30
0    8290
1     717
2     196
Name: count, dtype: int64


In [36]:
# Special case 2 (U recode) check: Q85 counts before (dat_cleaned) and after (dat_final)

print(
    dat_cleaned['Q85'].value_counts(),
    dat_final['Q85'].value_counts(),
    sep='\n',
)

Q85
4    2550
3    2347
5    1533
2    1383
1     876
6     381
7     133
Name: count, dtype: int64
Q85
0    4464
1    2480
2    1383
3     876
Name: count, dtype: int64


## Subset the data per domain

In [39]:
# Look-up from an original column index to its position inside the subset,
# whose columns follow the order of all_indices.
index_map = dict(zip(all_indices, range(len(all_indices))))

# Translate each domain's original indices into subset-relative positions.
mental_ix_local = [index_map[orig] for orig in mental_ix]
home_ix_local = [index_map[orig] for orig in home_ix]
social_ix_local = [index_map[orig] for orig in social_ix]
health_ix_local = [index_map[orig] for orig in health_ix]

# Each context is the mental-domain items followed by one other domain's items.
home_context_ix = [*mental_ix_local, *home_ix_local]
social_context_ix = [*mental_ix_local, *social_ix_local]
health_context_ix = [*mental_ix_local, *health_ix_local]

In [40]:
# Build one frame per context from dat_final, i.e. the recoded data. Selecting
# from dat_cleaned here would silently discard every binary, ordinal and
# special-case recode applied above. dat_final keeps dat_cleaned's column
# order, so the positional indices select the same columns.
home_context = dat_final.iloc[:, home_context_ix].copy()
social_context = dat_final.iloc[:, social_context_ix].copy()
health_context = dat_final.iloc[:, health_context_ix].copy()

In [42]:
# List the columns that ended up in each context frame.
print(
    home_context.columns.tolist(),
    social_context.columns.tolist(),
    health_context.columns.tolist(),
    sep='\n',
)


['Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q84', 'Q106', 'Q89', 'Q90', 'Q91', 'Q99', 'Q100', 'Q101', 'Q102', 'Q104']
['Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q84', 'Q106', 'Q14', 'Q16', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q88', 'Q103', 'Q105']
['Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q84', 'Q106', 'Q76', 'Q77', 'Q78', 'Q85']
