In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

In [39]:
dat = pd.read_csv('data/data.csv', low_memory=False)

In [48]:
# Define all index ranges and individual indices
mental_ix  = list(range(25, 30)) + [83, 105]
home_ix    = list(range(88, 91)) + list(range(98, 102)) + [103]
social_ix  = [13, 15, 18] + list(range(19, 25)) + [87, 102, 104]
health_ix  = list(range(75, 78)) + [84]

# Combine and sort unique indices
all_indices = sorted(set(mental_ix + home_ix + social_ix + health_ix))

# Subset columns by index
dat1 = dat.iloc[:, all_indices].copy()

In [67]:
num_rows_with_na = dat1.isna().any(axis=1).sum()
num_rows_with_na

np.int64(10900)

In [None]:
dat_cleaned = dat1.dropna(axis=0, how='any')
dat_cleaned.shape

In [81]:
# Create mapping of all_indices to relative positions
index_map = {orig_idx: i for i, orig_idx in enumerate(all_indices)}

# Remap each index list to match dat1's column layout
mental_ix_local = [index_map[i] for i in mental_ix]
home_ix_local   = [index_map[i] for i in home_ix]
social_ix_local = [index_map[i] for i in social_ix]
health_ix_local = [index_map[i] for i in health_ix]

# Now safely subset from dat_cleaned
mental = dat_cleaned.iloc[:, mental_ix_local].copy()
home   = dat_cleaned.iloc[:, home_ix_local].copy()
social = dat_cleaned.iloc[:, social_ix_local].copy()
health = dat_cleaned.iloc[:, health_ix_local].copy()


In [82]:
print(mental.columns.tolist())
print(home.columns.tolist())
print(social.columns.tolist())
print(health.columns.tolist())


['Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q84', 'Q106']
['Q89', 'Q90', 'Q91', 'Q99', 'Q100', 'Q101', 'Q102', 'Q104']
['Q14', 'Q16', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q88', 'Q103', 'Q105']
['Q76', 'Q77', 'Q78', 'Q85']


In [83]:
def recode_binary(df, cols, reverse=False):
    df = df.copy()
    recode_map = {1.0: 0, 2.0: 1} if reverse else {1.0: 1, 2.0: 0}
    for col in cols:
        df[col] = df[col].replace(recode_map).astype(int)
    return df

## Mental Health Data Set

Questions: 26-30, 84, 106

In [60]:
mental_cols = ['Q26', 'Q27', 'Q28', 'Q29', 'Q30', 'Q84', 'Q106']
mental = dat[mental_cols].copy()

In [61]:
mental.head()

Unnamed: 0,Q26,Q27,Q28,Q29,Q30,Q84,Q106
0,1.0,2.0,2.0,1.0,1.0,1.0,2.0
1,2.0,2.0,2.0,1.0,1.0,3.0,1.0
2,1.0,2.0,2.0,1.0,1.0,2.0,2.0
3,1.0,2.0,2.0,1.0,1.0,3.0,1.0
4,1.0,2.0,1.0,1.0,1.0,3.0,2.0


In [62]:
mental_binary = ['Q26', 'Q27', 'Q28', 'Q106']
mental = recode_binary(mental, mental_binary)


IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer