In [5]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# Input: export that still uses NHANES variable codes as column names.
in_csv  = "/content/drive/MyDrive/MSML610 Project/CSV/INQ_L.csv"
# Output: same data with readable headers, in a folder created on demand.
out_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
out_csv = os.path.join(out_dir, "INQ_L_meaningful.csv")
os.makedirs(out_dir, exist_ok=True)

# NHANES variable code -> readable column name.
CODE_TO_NAME = {
    "SEQN":      "respondent_id",
    "INDFMMPI":  "family_monthly_poverty_index",
    "INDFMMPC":  "family_monthly_poverty_category",
    "INQ300":    "savings_more_than_20000",
    "IND310":    "total_family_savings_cash_assets"
}

# Columns that are not keys of CODE_TO_NAME keep their original header.
df = pd.read_csv(in_csv).rename(columns=CODE_TO_NAME)

df.to_csv(out_csv, index=False)
print(f"Saved renamed file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved renamed file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/INQ_L_meaningful.csv


In [7]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# The renamed input and the labeled output live in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, file_name)
    for file_name in ("INQ_L_meaningful.csv", "INQ_L_meaningful_labeled.csv")
)

df = pd.read_csv(in_csv)


def map_poverty_index(val):
    """Convert one family monthly poverty index cell for the labeled export.

    Args:
        val: Raw cell value (a number, a numeric string, or a missing value).

    Returns:
        ``pd.NA`` when ``val`` is missing or cannot be converted to ``float``;
        the string ``">=5.00"`` when the value equals 5; otherwise the value
        as a ``float``.
    """
    if pd.isna(val):
        return pd.NA
    try:
        v = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number". A bare ``except``
        # would also swallow KeyboardInterrupt / SystemExit.
        return pd.NA
    if v == 5:
        return ">=5.00"
    return v  # any other value: keep the numeric ratio as-is

# Poverty index: converted cell by cell with map_poverty_index.
col = "family_monthly_poverty_index"
if col in df.columns:
    df[col] = df[col].apply(map_poverty_index)

# ---------- Code -> label tables ----------
poverty_cat_map = {
    1: "Monthly poverty index ≤ 1.30",
    2: "1.30 < monthly poverty index ≤ 1.85",
    3: "Monthly poverty index > 1.85",
    7: "Refused",
    9: "Don't know",
    ".": pd.NA
}

# INQ300: Family has savings more than $20,000
inq300_map = {
    1: "Yes",
    2: "No",
    7: "Refused",
    9: "Don't know",
    ".": pd.NA
}

# IND310: Total savings/cash assets for the family
ind310_map = {
    1:  "Less than $3000",
    2:  "$3001–$5000",
    3:  "$5001–$10000",
    4:  "$10001–$15000",
    5:  "$15001–$20000",
    77: "Refused",
    99: "Don't know",
    ".": pd.NA
}

# Each column gets two passes: the table above, then a second table keyed by
# float codes (intended for columns that were read as float).
for col, _labels, _float_labels in (
    (
        "family_monthly_poverty_category",
        poverty_cat_map,
        {7.0: "Refused", 9.0: "Don't know"},
    ),
    (
        "savings_more_than_20000",
        inq300_map,
        {1.0: "Yes", 2.0: "No", 7.0: "Refused", 9.0: "Don't know"},
    ),
    (
        "total_family_savings_cash_assets",
        ind310_map,
        {
            1.0:  "Less than $3000",
            2.0:  "$3001–$5000",
            3.0:  "$5001–$10000",
            4.0:  "$10001–$15000",
            5.0:  "$15001–$20000",
            77.0: "Refused",
            99.0: "Don't know",
        },
    ),
):
    if col not in df.columns:
        continue
    df[col] = df[col].replace(_labels)
    df[col] = df[col].replace(_float_labels)

# ---------- Save ----------
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/INQ_L_meaningful_labeled.csv


In [8]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# ---------- Paths ----------
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV"
out_dir  = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv   = os.path.join(base_dir, "ALQ_L.csv")
renamed_csv = os.path.join(out_dir, "ALQ_L_meaningful.csv")
os.makedirs(out_dir, exist_ok=True)

# ---------- Map: NHANES code -> meaningful column name ----------
CODE_TO_NAME = {
    "SEQN":    "respondent_id",
    "ALQ111":  "ever_had_alcohol",
    "ALQ121":  "freq_drink_past_12mo",
    "ALQ130":  "avg_drinks_per_drinking_day_12mo",
    "ALQ142":  "freq_4or5plus_drinks_day_12mo",
    "ALQ270":  "freq_4or5plus_drinks_2hr_12mo",
    "ALQ280":  "freq_8plus_drinks_day_12mo",
    "ALQ151":  "ever_4or5plus_almost_every_day",
    "ALQ170":  "freq_4or5plus_drinks_30d"
}

# Columns that are not keys of CODE_TO_NAME keep their original header.
df = pd.read_csv(in_csv).rename(columns=CODE_TO_NAME)
df.to_csv(renamed_csv, index=False)
print(f"Saved renamed file to: {renamed_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved renamed file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/ALQ_L_meaningful.csv


In [9]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# ---------- Paths ----------
# The renamed input and the labeled output live in the same folder.
base_dir    = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, labeled_csv = (
    os.path.join(base_dir, file_name)
    for file_name in ("ALQ_L_meaningful.csv", "ALQ_L_meaningful_labeled.csv")
)

df = pd.read_csv(in_csv)

# Helper to treat numeric/str safely
def _to_num(x):
    try:
        return float(x)
    except:
        return x

# ---------- ALQ111: Ever had a drink of any kind of alcohol ----------
alq111_map = {1: "Yes", 2: "No", 7: "Refused", 9: "Don't know"}

# ---------- ALQ121: Past 12 mos how often drink alc bev ----------
alq121_map = {
    0:  "Never in the last year",
    1:  "Every day",
    2:  "Nearly every day",
    3:  "3 to 4 times a week",
    4:  "2 times a week",
    5:  "Once a week",
    6:  "2 to 3 times a month",
    7:  "Once a month",
    8:  "7 to 11 times in the last year",
    9:  "3 to 6 times in the last year",
    10: "1 to 2 times in the last year",
    77: "Refused",
    99: "Don't know"
}

# Every cell goes through _to_num before the code -> label lookup.
for col, _labels in (
    ("ever_had_alcohol", alq111_map),
    ("freq_drink_past_12mo", alq121_map),
):
    if col in df.columns:
        df[col] = df[col].apply(_to_num).replace(_labels)

# ---------- ALQ130: Avg # alcoholic drinks/day/past 12 mos ----------
# 1–14 = that many drinks; 15 = 15+; 777/999 special.
def map_alq130(val):
    """Label one ALQ130 cell.

    Returns ``pd.NA`` for missing, non-numeric, or unrecognized values; an
    ``int`` for the whole numbers 1 through 14; and a text label for the
    codes 15, 777 and 999.
    """
    if pd.isna(val):
        return pd.NA
    num = _to_num(val)
    if not isinstance(num, (int, float)):
        return pd.NA
    if num in range(1, 15):
        return int(num)  # keep actual count
    special_codes = {
        15: "15 or more drinks",
        777: "Refused",
        999: "Don't know",
    }
    return special_codes.get(num, pd.NA)

# ALQ130 uses its own per-cell mapper.
col = "avg_drinks_per_drinking_day_12mo"
if col in df:
    df[col] = df[col].apply(map_alq130)

# ---------- ALQ142: # days have 4/5 drinks/past 12 mos ----------
alq142_map = {
    0:  "Never in the last year",
    1:  "Every day",
    2:  "Nearly every day",
    3:  "3 to 4 times a week",
    4:  "2 times a week",
    5:  "Once a week",
    6:  "2 to 3 times a month",
    7:  "Once a month",
    8:  "7 to 11 times in the last year",
    9:  "3 to 6 times in the last year",
    10: "1 to 2 times in the last year",
    77: "Refused",
    99: "Don't know"
}

# ---------- ALQ270: # times 4/5 drinks in 2hrs/past 12 mos ----------
# Same code list as ALQ142; an independent copy keeps the tables separate.
alq270_map = dict(alq142_map)

# ---------- ALQ280: # times 8+ drinks in 1 day/past 12 mos ----------
# Same code list as ALQ142; an independent copy keeps the tables separate.
alq280_map = dict(alq142_map)

# ---------- ALQ151: Ever have 4/5+ drinks almost every day ----------
alq151_map = {1: "Yes", 2: "No", 7: "Refused", 9: "Don't know"}

# Every cell goes through _to_num before the code -> label lookup.
for col, _labels in (
    ("freq_4or5plus_drinks_day_12mo", alq142_map),
    ("freq_4or5plus_drinks_2hr_12mo", alq270_map),
    ("freq_8plus_drinks_day_12mo", alq280_map),
    ("ever_4or5plus_almost_every_day", alq151_map),
):
    if col in df.columns:
        df[col] = df[col].apply(_to_num).replace(_labels)

# ---------- ALQ170: # times 4/5 drinks on occasion/past mo ----------
# 0–20 = exact; 30 = >20; 777/999 special.
def map_alq170(val):
    """Label one ALQ170 cell.

    Returns ``pd.NA`` for missing, non-numeric, or unrecognized values; an
    ``int`` (fractional part truncated) for values from 0 through 20; and a
    text label for the codes 30, 777 and 999.
    """
    if pd.isna(val):
        return pd.NA
    num = _to_num(val)
    if not isinstance(num, (int, float)):
        return pd.NA
    if 0 <= num <= 20:
        return int(num)
    special_codes = {
        30: "More than 20 times",
        777: "Refused",
        999: "Don't know",
    }
    return special_codes.get(num, pd.NA)

# ALQ170 uses its own per-cell mapper.
col = "freq_4or5plus_drinks_30d"
if col in df:
    df[col] = df[col].apply(map_alq170)

# ---------- Save ----------
df.to_csv(labeled_csv, index=False)
print(f"Saved labeled file to: {labeled_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/ALQ_L_meaningful_labeled.csv


In [12]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# ---------- Paths ----------
raw_dir   = "/content/drive/MyDrive/MSML610 Project/CSV"
out_dir   = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv    = os.path.join(raw_dir, "PAQ_L.csv")
out_csv   = os.path.join(out_dir, "PAQ_L_meaningful.csv")
os.makedirs(out_dir, exist_ok=True)

# ---------- Map: NHANES code -> meaningful column name ----------
CODE_TO_NAME = {
    "SEQN":     "respondent_id",
    "PAD790Q":  "freq_moderate_ltpa",              # Frequency of moderate leisure-time PA
    "PAD790U":  "moderate_ltpa_unit",              # Unit: day/week/month/year
    "PAD800":   "minutes_moderate_ltpa",           # Minutes of moderate LTPA
    "PAD810Q":  "freq_vigorous_ltpa",              # Frequency of vigorous leisure-time PA
    "PAD810U":  "vigorous_ltpa_unit",              # Unit: day/week/month/year
    "PAD820":   "minutes_vigorous_ltpa",           # Minutes of vigorous LTPA
    "PAD680":   "minutes_sedentary_activity"       # Minutes sedentary activity
}

# ---------- Load, rename, save ----------
# Columns that are not keys of CODE_TO_NAME keep their original header.
df = pd.read_csv(in_csv).rename(columns=CODE_TO_NAME)

df.to_csv(out_csv, index=False)
print(f"Saved renamed file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved renamed file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/PAQ_L_meaningful.csv


In [16]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# ---------- Paths ----------
# The renamed input and the labeled output live in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, file_name)
    for file_name in ("PAQ_L_meaningful.csv", "PAQ_L_meaningful_labeled.csv")
)

df = pd.read_csv(in_csv)

# ---------- Helpers ----------

def parse_int_or_none(x):
    """Parse a cell as an integer, returning ``None`` when that is not possible.

    Args:
        x: Raw cell value (number, string, or missing value).

    Returns:
        The ``int`` value for integer text (``"7"``, ``"+7"``) and for floats
        with no fractional part (``"3.0"``, ``3.0``); ``None`` for missing
        values, non-numeric text, and non-integer floats.
    """
    if pd.isna(x):
        return None
    s = str(x).strip()
    # Let int() decide what is valid. The earlier lstrip("+-").isdigit()
    # pre-check accepted text such as "+-5" or "²" and then crashed in int().
    try:
        return int(s)
    except ValueError:
        pass
    # only accept floats that are actually integers (e.g. 3.0)
    # NOTE(review): near-zero values such as 5.4e-79 are rejected here because
    # they are not integer-valued — confirm that is the intended handling.
    try:
        f = float(s)
    except ValueError:
        return None
    return int(f) if f.is_integer() else None

def map_freq_code(val, max_valid):
    """Label one frequency cell.

    Values from 0 through ``max_valid`` are returned as ``int``; 7777 and 9999
    become "Refused" and "Don't know"; everything else (including values that
    ``parse_int_or_none`` rejects) becomes ``pd.NA``.
    """
    code = parse_int_or_none(val)
    if code is None:
        return pd.NA
    if 0 <= code <= max_valid:
        return code
    # The range check runs first, so these codes only apply outside the range.
    return {7777: "Refused", 9999: "Don't know"}.get(code, pd.NA)

def map_minutes_code(val, max_valid):
    """Label one minutes cell.

    Values from 0 through ``max_valid`` are returned as ``int``; 7777 and 9999
    become "Refused" and "Don't know"; everything else (including values that
    ``parse_int_or_none`` rejects) becomes ``pd.NA``.
    """
    code = parse_int_or_none(val)
    if code is None:
        return pd.NA
    if 0 <= code <= max_valid:
        return code
    # The range check runs first, so these codes only apply outside the range.
    return {7777: "Refused", 9999: "Don't know"}.get(code, pd.NA)

def map_b_unit_any(val):
    """Map a unit cell such as ``b'W'`` to "Day", "Week", "Month" or "Year".

    The cell is converted to text, stripped and upper-cased; the first of the
    letters D, W, M, Y (checked in that order) found anywhere in the text
    selects the label. Missing, empty, or unrecognized cells give ``pd.NA``.

    NOTE(review): the check is a substring test, so longer text containing
    one of these letters also matches (e.g. "Monday" -> "Day").
    """
    if pd.isna(val):
        return pd.NA

    text = str(val).strip().upper()
    if not text:
        return pd.NA

    unit_labels = (("D", "Day"), ("W", "Week"), ("M", "Month"), ("Y", "Year"))
    for letter, label in unit_labels:
        if letter in text:
            return label

    # no recognizable unit letter -> treat as missing
    return pd.NA

# ---------- Apply mappings ----------

# One row per column: (column, per-cell mapper, extra positional arguments).
#   PAD790Q - Frequency of moderate LTPA (0–180, 7777, 9999)
#   PAD790U - Moderate LTPA unit (b'W', b'D', etc.)
#   PAD800  - Minutes moderate LTPA (1–720, 7777, 9999)
#   PAD810Q - Frequency of vigorous LTPA (0–200, 7777, 9999)
#   PAD810U - Vigorous LTPA unit
#   PAD820  - Minutes vigorous LTPA (1–900, 7777, 9999)
#   PAD680  - Minutes sedentary activity (0–1380, 7777, 9999)
for _col, _mapper, _extra in (
    ("freq_moderate_ltpa",         map_freq_code,    (180,)),
    ("moderate_ltpa_unit",         map_b_unit_any,   ()),
    ("minutes_moderate_ltpa",      map_minutes_code, (720,)),
    ("freq_vigorous_ltpa",         map_freq_code,    (200,)),
    ("vigorous_ltpa_unit",         map_b_unit_any,   ()),
    ("minutes_vigorous_ltpa",      map_minutes_code, (900,)),
    ("minutes_sedentary_activity", map_minutes_code, (1380,)),
):
    if _col in df.columns:
        # args= forwards the upper bound as the mapper's second argument.
        df[_col] = df[_col].apply(_mapper, args=_extra)

# ---------- Save ----------
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/PAQ_L_meaningful_labeled.csv


In [17]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# ---------- Paths ----------
# The renamed input and the labeled output live in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, file_name)
    for file_name in ("PAQ_L_meaningful.csv", "PAQ_L_meaningful_labeled.csv")
)

df = pd.read_csv(in_csv)

# ---------- Helpers ----------

def parse_int_or_none(x):
    """Parse a cell as an integer, returning ``None`` when that is not possible.

    Args:
        x: Raw cell value (number, string, or missing value).

    Returns:
        The ``int`` value for integer text (``"7"``, ``"+7"``) and for floats
        with no fractional part (``"3.0"``, ``3.0``); ``None`` for missing
        values, non-numeric text, and non-integer floats.
    """
    if pd.isna(x):
        return None
    s = str(x).strip()
    # Let int() decide what is valid. The earlier lstrip("+-").isdigit()
    # pre-check accepted text such as "+-5" or "²" and then crashed in int().
    try:
        return int(s)
    except ValueError:
        pass
    # only accept floats that are actually integers (e.g. 3.0)
    # NOTE(review): near-zero values such as 5.4e-79 are rejected here because
    # they are not integer-valued — confirm that is the intended handling.
    try:
        f = float(s)
    except ValueError:
        return None
    return int(f) if f.is_integer() else None

def map_freq_code(val, max_valid):
    """Label one frequency cell.

    Values from 0 through ``max_valid`` are returned as ``int``; 7777 and 9999
    become "Refused" and "Don't know"; everything else (including values that
    ``parse_int_or_none`` rejects) becomes ``pd.NA``.
    """
    code = parse_int_or_none(val)
    if code is None:
        return pd.NA
    if 0 <= code <= max_valid:
        return code
    # The range check runs first, so these codes only apply outside the range.
    return {7777: "Refused", 9999: "Don't know"}.get(code, pd.NA)

def map_minutes_code(val, max_valid):
    """Label one minutes cell.

    Values from 0 through ``max_valid`` are returned as ``int``; 7777 and 9999
    become "Refused" and "Don't know"; everything else (including values that
    ``parse_int_or_none`` rejects) becomes ``pd.NA``.
    """
    code = parse_int_or_none(val)
    if code is None:
        return pd.NA
    if 0 <= code <= max_valid:
        return code
    # The range check runs first, so these codes only apply outside the range.
    return {7777: "Refused", 9999: "Don't know"}.get(code, pd.NA)

def map_b_unit_any(val):
    """Map a unit cell such as ``b'W'`` to "Day", "Week", "Month" or "Year".

    The cell is converted to text, stripped and upper-cased; the first of the
    letters D, W, M, Y (checked in that order) found anywhere in the text
    selects the label. Missing, empty, or unrecognized cells give ``pd.NA``.

    NOTE(review): the check is a substring test, so longer text containing
    one of these letters also matches (e.g. "Monday" -> "Day").
    """
    if pd.isna(val):
        return pd.NA

    text = str(val).strip().upper()
    if not text:
        return pd.NA

    unit_labels = (("D", "Day"), ("W", "Week"), ("M", "Month"), ("Y", "Year"))
    for letter, label in unit_labels:
        if letter in text:
            return label

    # no recognizable unit letter -> treat as missing
    return pd.NA

# ---------- Apply mappings ----------

# One row per column: (column, per-cell mapper, extra positional arguments).
#   PAD790Q - Frequency of moderate LTPA (0–180, 7777, 9999)
#   PAD790U - Moderate LTPA unit (b'W', b'D', etc.)
#   PAD800  - Minutes moderate LTPA (1–720, 7777, 9999)
#   PAD810Q - Frequency of vigorous LTPA (0–200, 7777, 9999)
#   PAD810U - Vigorous LTPA unit
#   PAD820  - Minutes vigorous LTPA (1–900, 7777, 9999)
#   PAD680  - Minutes sedentary activity (0–1380, 7777, 9999)
for _col, _mapper, _extra in (
    ("freq_moderate_ltpa",         map_freq_code,    (180,)),
    ("moderate_ltpa_unit",         map_b_unit_any,   ()),
    ("minutes_moderate_ltpa",      map_minutes_code, (720,)),
    ("freq_vigorous_ltpa",         map_freq_code,    (200,)),
    ("vigorous_ltpa_unit",         map_b_unit_any,   ()),
    ("minutes_vigorous_ltpa",      map_minutes_code, (900,)),
    ("minutes_sedentary_activity", map_minutes_code, (1380,)),
):
    if _col in df.columns:
        # args= forwards the upper bound as the mapper's second argument.
        df[_col] = df[_col].apply(_mapper, args=_extra)

# ---------- Save ----------
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/PAQ_L_meaningful_labeled.csv


In [18]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# ---------- Paths ----------
# The renamed input and the labeled output live in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, file_name)
    for file_name in ("PAQ_L_meaningful.csv", "PAQ_L_meaningful_labeled.csv")
)

df = pd.read_csv(in_csv)

# ---------- Helpers ----------

def parse_int_or_none(x):
    """Parse a cell as an integer, returning ``None`` when that is not possible.

    Args:
        x: Raw cell value (number, string, or missing value).

    Returns:
        The ``int`` value for integer text (``"7"``, ``"+7"``) and for floats
        with no fractional part (``"3.0"``, ``3.0``); ``None`` for missing
        values, non-numeric text, and non-integer floats.
    """
    if pd.isna(x):
        return None
    s = str(x).strip()
    # Let int() decide what is valid. The earlier lstrip("+-").isdigit()
    # pre-check accepted text such as "+-5" or "²" and then crashed in int().
    try:
        return int(s)
    except ValueError:
        pass
    # NOTE(review): near-zero values such as 5.4e-79 are rejected here because
    # they are not integer-valued — confirm that is the intended handling.
    try:
        f = float(s)
    except ValueError:
        return None
    return int(f) if f.is_integer() else None

def map_freq_code(val, max_valid):
    """Label one frequency cell.

    Cells that ``parse_int_or_none`` rejects become ``pd.NA``. Values from 0
    through ``max_valid`` are returned as ``int``; 7777 and 9999 become
    "Refused" and "Don't know"; any other integer becomes "Missing".
    """
    code = parse_int_or_none(val)
    if code is None:
        return pd.NA
    if 0 <= code <= max_valid:
        return code
    # The range check runs first, so these codes only apply outside the range.
    return {7777: "Refused", 9999: "Don't know"}.get(code, "Missing")

def map_minutes_code(val, max_valid):
    """Label one minutes cell.

    Cells that ``parse_int_or_none`` rejects become ``pd.NA``. Values from 0
    through ``max_valid`` are returned as ``int``; 7777 and 9999 become
    "Refused" and "Don't know"; any other integer becomes "Missing".
    """
    code = parse_int_or_none(val)
    if code is None:
        return pd.NA
    if 0 <= code <= max_valid:
        return code
    # The range check runs first, so these codes only apply outside the range.
    return {7777: "Refused", 9999: "Don't know"}.get(code, "Missing")

def map_b_unit_any(val):
    """Map a unit cell such as ``b'W'`` to "Day", "Week", "Month" or "Year".

    The cell is converted to text, stripped and upper-cased; the first of the
    letters D, W, M, Y (checked in that order) found anywhere in the text
    selects the label. Missing, empty, or unrecognized cells give "Missing".

    NOTE(review): the check is a substring test, so longer text containing
    one of these letters also matches (e.g. "Monday" -> "Day").
    """
    if pd.isna(val):
        return "Missing"

    text = str(val).strip().upper()
    if not text:
        return "Missing"

    unit_labels = (("D", "Day"), ("W", "Week"), ("M", "Month"), ("Y", "Year"))
    for letter, label in unit_labels:
        if letter in text:
            return label

    return "Missing"

# ---------- Apply mappings ----------

# One row per column: (column, per-cell mapper, extra positional arguments).
#   PAD790Q - Frequency of moderate LTPA (0–180, 7777, 9999)
#   PAD790U - Moderate LTPA unit
#   PAD800  - Minutes moderate LTPA (1–720, 7777, 9999)
#   PAD810Q - Frequency of vigorous LTPA (0–200, 7777, 9999)
#   PAD810U - Vigorous LTPA unit
#   PAD820  - Minutes vigorous LTPA (1–900, 7777, 9999)
#   PAD680  - Minutes sedentary activity (0–1380, 7777, 9999)
for _col, _mapper, _extra in (
    ("freq_moderate_ltpa",         map_freq_code,    (180,)),
    ("moderate_ltpa_unit",         map_b_unit_any,   ()),
    ("minutes_moderate_ltpa",      map_minutes_code, (720,)),
    ("freq_vigorous_ltpa",         map_freq_code,    (200,)),
    ("vigorous_ltpa_unit",         map_b_unit_any,   ()),
    ("minutes_vigorous_ltpa",      map_minutes_code, (900,)),
    ("minutes_sedentary_activity", map_minutes_code, (1380,)),
):
    if _col in df.columns:
        # args= forwards the upper bound as the mapper's second argument.
        df[_col] = df[_col].apply(_mapper, args=_extra)

# ---------- Save ----------
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/PAQ_L_meaningful_labeled.csv


In [19]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# ---------- Paths ----------
# The renamed input and the labeled output live in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, file_name)
    for file_name in ("PAQ_L_meaningful.csv", "PAQ_L_meaningful_labeled.csv")
)

# Read as strings so we can clean weird values like b'W', 5.40E-79, etc.
df = pd.read_csv(in_csv, dtype=str)


# ---------- Helpers ----------

def map_freq(val, max_valid):
    """
    For PAD790Q / PAD810Q:
      0..max_valid -> keep numeric int
      7777         -> 'Refused'
      9999         -> 'Don't know'
      blank/./weird-> 'Missing'

    Args:
        val: Raw cell value (usually text, or None / NaN for empty cells).
        max_valid: Largest value that is kept as a number.

    Returns:
        An ``int`` or one of the strings 'Refused', "Don't know", 'Missing'.
        The fractional part of a numeric cell is truncated, so "5.40E-79"
        becomes 0.
    """
    if val is None:
        return "Missing"
    s = str(val).strip()
    if s == "" or s in {".", "NA", "NaN", "nan"}:
        return "Missing"

    try:
        v = int(float(s))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or NaN. OverflowError: "inf" / "1e400",
        # which float() accepts but int() cannot convert.
        return "Missing"

    if 0 <= v <= max_valid:
        return v
    if v == 7777:
        return "Refused"
    if v == 9999:
        return "Don't know"
    return "Missing"


def map_minutes(val, max_valid):
    """
    For PAD800 / PAD820 / PAD680:
      0..max_valid -> keep numeric int
      7777         -> 'Refused'
      9999         -> 'Don't know'
      blank/./weird-> 'Missing'

    Args:
        val: Raw cell value (usually text, or None / NaN for empty cells).
        max_valid: Largest value that is kept as a number.

    Returns:
        An ``int`` or one of the strings 'Refused', "Don't know", 'Missing'.
        The fractional part of a numeric cell is truncated, so "5.40E-79"
        becomes 0.
    """
    if val is None:
        return "Missing"
    s = str(val).strip()
    if s == "" or s in {".", "NA", "NaN", "nan"}:
        return "Missing"

    try:
        v = int(float(s))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or NaN. OverflowError: "inf" / "1e400",
        # which float() accepts but int() cannot convert.
        return "Missing"

    if 0 <= v <= max_valid:
        return v
    if v == 7777:
        return "Refused"
    if v == 9999:
        return "Don't know"
    return "Missing"


def map_unit(val):
    """
    Label one PAD790U / PAD810U cell, e.g. b'W', b"W", b'D', b'M', b'Y',
    b"", b, ''.

    Steps:
      1. None, empty text and the placeholders "< blank >", ".", "na", "nan"
         (case-insensitive) -> 'Missing'
      2. drop one leading b/B, then surrounding quotes and spaces
      3. exactly D/W/M/Y (case-insensitive) -> Day/Week/Month/Year
      4. anything else -> 'Missing'
    """
    if val is None:
        return "Missing"

    text = str(val).strip()
    if text == "" or text.lower() in {"< blank >", ".", "na", "nan"}:
        return "Missing"

    # text is non-empty here, so indexing the first character is safe.
    if text[0].lower() == "b":
        text = text[1:].strip()

    text = text.strip("'\"").strip()

    unit_labels = {"D": "Day", "W": "Week", "M": "Month", "Y": "Year"}
    # An empty remainder is not a key, so it falls through to 'Missing'.
    return unit_labels.get(text.upper(), "Missing")


# ---------- Apply mappings to correct columns ----------

# One row per column: (column, per-cell mapper, extra positional arguments).
#   freq_moderate_ltpa         (PAD790Q: 0–180, 7777, 9999)
#   moderate_ltpa_unit         (PAD790U)
#   minutes_moderate_ltpa      (PAD800: 1–720, 7777, 9999)
#   freq_vigorous_ltpa         (PAD810Q: 0–200, 7777, 9999)
#   vigorous_ltpa_unit         (PAD810U)
#   minutes_vigorous_ltpa      (PAD820: 1–900, 7777, 9999)
#   minutes_sedentary_activity (PAD680: 0–1380, 7777, 9999)
for _col, _mapper, _extra in (
    ("freq_moderate_ltpa",         map_freq,    (180,)),
    ("moderate_ltpa_unit",         map_unit,    ()),
    ("minutes_moderate_ltpa",      map_minutes, (720,)),
    ("freq_vigorous_ltpa",         map_freq,    (200,)),
    ("vigorous_ltpa_unit",         map_unit,    ()),
    ("minutes_vigorous_ltpa",      map_minutes, (900,)),
    ("minutes_sedentary_activity", map_minutes, (1380,)),
):
    if _col in df.columns:
        # args= forwards the upper bound as the mapper's second argument.
        df[_col] = df[_col].apply(_mapper, args=_extra)

# Safety: if any NA slipped through in unit columns, mark as 'Missing'
for col in ("moderate_ltpa_unit", "vigorous_ltpa_unit"):
    if col not in df.columns:
        continue
    df[col] = df[col].fillna("Missing")

# ---------- Save ----------
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/PAQ_L_meaningful_labeled.csv


In [20]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# ---------- Paths ----------
raw_dir = "/content/drive/MyDrive/MSML610 Project/CSV"
out_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv  = os.path.join(raw_dir, "SMQ_L.csv")
out_csv = os.path.join(out_dir, "SMQ_L_meaningful.csv")
os.makedirs(out_dir, exist_ok=True)

# ---------- Map: NHANES code -> meaningful column name ----------
CODE_TO_NAME = {
    "SEQN":      "respondent_id",
    "SMQ020":    "smoked_100_cigs_life",
    "SMQ040":    "current_smoking_status",
    "SMD641":    "days_smoked_past_30d",
    "SMD650":    "avg_cigs_per_day_past_30d",
    "SMD100MN":  "cigarette_menthol_indicator",
    "SMQ621":    "cigs_smoked_entire_life",
    "SMD630":    "age_first_whole_cigarette",
    "SMAQUEX2":  "smoking_questionnaire_mode_flag"
}

# ---------- Load, rename, save ----------
# Columns that are not keys of CODE_TO_NAME keep their original header.
df = pd.read_csv(in_csv).rename(columns=CODE_TO_NAME)

df.to_csv(out_csv, index=False)
print(f"Saved renamed file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved renamed file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/SMQ_L_meaningful.csv


In [21]:
import os
import pandas as pd
from google.colab import drive

# Make the Google Drive project folder reachable from this runtime.
drive.mount('/content/drive')

# ---------- Paths ----------
# The renamed input and the labeled output live in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, file_name)
    for file_name in ("SMQ_L_meaningful.csv", "SMQ_L_meaningful_labeled.csv")
)

# Read as strings so we can handle dots / blanks / weird values safely
df = pd.read_csv(in_csv, dtype=str)

# ---------- Generic helpers ----------

def is_missing(val: str) -> bool:
    """Return True when ``val`` stands for "no value".

    ``None`` counts as missing, as do cells whose stripped text is empty,
    a single ".", or "na" / "nan" in any letter case.
    """
    if val is None:
        return True
    text = str(val).strip()
    return text in {"", "."} or text.lower() in {"na", "nan"}

def to_int(val):
    """Convert a cell to ``int``, or return ``None`` when that is not possible.

    Args:
        val: Raw cell value (usually text, or None for an empty cell).

    Returns:
        The integer value with any fractional part truncated ("1.0" -> 1,
        "5.40E-79" -> 0); ``None`` for None, empty text, ".", non-numeric
        text, NaN and infinities.
    """
    if val is None:
        return None
    s = str(val).strip()
    if s == "" or s == ".":
        return None
    try:
        # Handles plain ints, "1.0", etc.
        return int(float(s))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or NaN. OverflowError: "inf" / "1e400",
        # which float() accepts but int() cannot convert.
        return None

# ---------- SMQ020: Smoked at least 100 cigarettes in life ----------

def map_smq020(val):
    """Label one SMQ020 cell; missing or unrecognized codes give "missing"."""
    if is_missing(val):
        return "missing"
    labels = {1: "Yes", 2: "No", 7: "Refused", 9: "Don't know"}
    return labels.get(to_int(val), "missing")

if "smoked_100_cigs_life" in df:
    df["smoked_100_cigs_life"] = df["smoked_100_cigs_life"].apply(map_smq020)

# ---------- SMQ040: Do you now smoke cigarettes? ----------

def map_smq040(val):
    """Label one SMQ040 cell; missing or unrecognized codes give "missing"."""
    if is_missing(val):
        return "missing"
    labels = {
        1: "Every day",
        2: "Some days",
        3: "Not at all",
        7: "Refused",
        9: "Don't know",
    }
    return labels.get(to_int(val), "missing")

if "current_smoking_status" in df:
    df["current_smoking_status"] = df["current_smoking_status"].apply(map_smq040)

# ---------- SMD641: # days smoked cigs during past 30 days (0–30, 77, 99) ----------

def map_smd641(val):
    """Label one SMD641 cell.

    Values from 0 through 30 are returned as ``int``; 77 and 99 become
    "Refused" and "Don't know"; everything else becomes "missing".
    """
    if is_missing(val):
        return "missing"
    days = to_int(val)
    if days is None:
        return "missing"
    if 0 <= days <= 30:
        return days
    return {77: "Refused", 99: "Don't know"}.get(days, "missing")

if "days_smoked_past_30d" in df:
    df["days_smoked_past_30d"] = df["days_smoked_past_30d"].apply(map_smd641)

# ---------- SMD650: Avg # cigarettes/day during past 30 days ----------

def map_smd650(val):
    """Label one SMD650 cell.

    Values from 2 through 90 are returned as ``int``; the codes 1, 95, 777
    and 999 get text labels; everything else becomes "missing".
    """
    if is_missing(val):
        return "missing"
    cigs = to_int(val)
    if cigs is None:
        return "missing"
    if 2 <= cigs <= 90:
        return cigs
    coded_labels = {
        1: "1 cigarette or less",
        95: "95 cigarettes or more",
        777: "Refused",
        999: "Don't know",
    }
    return coded_labels.get(cigs, "missing")

if "avg_cigs_per_day_past_30d" in df:
    df["avg_cigs_per_day_past_30d"] = df["avg_cigs_per_day_past_30d"].apply(map_smd650)

# ---------- SMD100MN: Cigarette Menthol indicator ----------

def map_smd100mn(val):
    """Label one SMD100MN cell; missing or unrecognized codes give "missing"."""
    if is_missing(val):
        return "missing"
    labels = {
        0: "Non-menthol",
        1: "Menthol",
        7: "Refused",
        9: "Don't know",
    }
    return labels.get(to_int(val), "missing")

if "cigarette_menthol_indicator" in df:
    df["cigarette_menthol_indicator"] = df["cigarette_menthol_indicator"].apply(map_smd100mn)

# ---------- SMQ621: Cigarettes smoked in entire life ----------

def map_smq621(val):
    """Label one SMQ621 cell; missing or unrecognized codes give "missing"."""
    if is_missing(val):
        return "missing"
    labels = {
        1: "Never smoked, not even a puff",
        2: "1+ puffs but never a whole cigarette",
        3: "1 cigarette",
        4: "2 to 5 cigarettes",
        5: "6 to 15 cigarettes",
        6: "16 to 25 cigarettes",
        7: "26 to 99 cigarettes",
        8: "100 or more cigarettes",
        77: "Refused",
        99: "Don't know",
    }
    return labels.get(to_int(val), "missing")

if "cigs_smoked_entire_life" in df:
    df["cigs_smoked_entire_life"] = df["cigs_smoked_entire_life"].apply(map_smq621)

# ---------- SMD630: Age first smoked whole cigarette ----------

def map_smd630(val):
    """Label one SMD630 cell.

    The code 6 becomes "6 years or less"; values from 9 through 16 are
    returned as ``int``; 77 and 99 become "Refused" and "Don't know";
    everything else becomes "missing".

    NOTE(review): ages 7, 8 and 17+ fall through to "missing" — confirm
    against the codebook that these values cannot occur.
    """
    if is_missing(val):
        return "missing"
    age = to_int(val)
    if age is None:
        return "missing"
    # Codebook: 6 = "6 years or less", 9–16 = reported age
    if 9 <= age <= 16:
        return age
    coded_labels = {
        6: "6 years or less",
        77: "Refused",
        99: "Don't know",
    }
    return coded_labels.get(age, "missing")

if "age_first_whole_cigarette" in df:
    df["age_first_whole_cigarette"] = df["age_first_whole_cigarette"].apply(map_smd630)

# ---------- SMAQUEX2: Questionnaire Mode Flag ----------

def map_smaquex2(val):
    """Label an SMAQUEX2 (questionnaire mode flag) code.

    1 and 2 become their interview-mode text; missing input or any other
    code gives "missing".
    """
    if is_missing(val):
        return "missing"
    mode = to_int(val)
    if mode == 1:
        return "Home Interview (18+ Yrs)"
    if mode == 2:
        return "ACASI (12–17 Yrs)"
    return "missing"

# Apply only when the column is present in this file.
if "smoking_questionnaire_mode_flag" in df:
    df["smoking_questionnaire_mode_flag"] = (
        df["smoking_questionnaire_mode_flag"].apply(map_smaquex2)
    )

# ---------- Ensure all remaining blanks / '.' -> 'missing' ----------
# Catch-all pass: any cell still blank / '.' / NA-like in any column
# is written out as the literal "missing".
for col in df.columns:
    df[col] = df[col].apply(
        lambda cell: cell if not is_missing(cell) else "missing"
    )

# ---------- Save ----------
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/SMQ_L_meaningful_labeled.csv


In [22]:
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# ---------- Paths ----------
raw_dir = "/content/drive/MyDrive/MSML610 Project/CSV"
out_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv = os.path.join(raw_dir, "DIQ_L.csv")
out_csv = os.path.join(out_dir, "DIQ_L_meaningful.csv")
os.makedirs(out_dir, exist_ok=True)  # make sure the output folder exists

# ---------- Map: NHANES code -> meaningful column name ----------
CODE_TO_NAME = dict([
    ("SEQN",    "respondent_id"),
    ("DIQ010",  "doctor_told_diabetes"),
    ("DID040",  "age_first_told_diabetes"),
    ("DIQ159",  "check_item_diq159"),
    ("DIQ160",  "ever_told_prediabetes"),
    ("DIQ180",  "blood_test_past_3yrs"),
    ("DIQ050",  "taking_insulin_now"),
    ("DID060",  "duration_taking_insulin"),
    ("DIQ060U", "duration_insulin_unit"),
    ("DIQ065",  "check_item_diq065"),
    ("DIQ070",  "taking_diabetes_pills"),
])

# ---------- Load, rename, save ----------
# Columns not listed in CODE_TO_NAME keep their original header.
df = pd.read_csv(in_csv).rename(columns=CODE_TO_NAME)

df.to_csv(out_csv, index=False)
print(f"Saved renamed file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved renamed file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/DIQ_L_meaningful.csv


In [23]:
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# ---------- Paths ----------
# Input and output files sit side by side in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, filename)
    for filename in ("DIQ_L_meaningful.csv", "DIQ_L_meaningful_labeled.csv")
)

# Load every column as text so '.', blanks, etc. can be cleaned safely.
df = pd.read_csv(in_csv, dtype=str)

# ---------- Helpers ----------

def is_missing(val):
    """Return True when ``val`` represents a missing cell.

    Missing means: ``None``, or a value whose stripped string form is
    empty, ".", or (case-insensitively) "na" / "nan".
    """
    if val is None:
        return True
    text = str(val).strip()
    if text in ("", "."):
        return True
    return text.lower() in ("na", "nan")

def to_int(val):
    """Parse a cell into an int, or return None when that is not possible.

    The value is converted via ``str`` -> ``float`` -> ``int``, so text such
    as "1.0" yields 1 (any fractional part is truncated toward zero).

    Returns None for ``None``, blank text, ".", non-numeric text, NaN and
    +/-infinity. Never raises for these inputs.
    """
    if val is None:
        return None
    s = str(val).strip()
    if s in ("", "."):
        return None
    try:
        return int(float(s))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or NaN.
        # OverflowError: "inf" / "-inf" parse as floats but cannot become
        # ints; without this, one such cell would abort the whole .apply().
        return None

# ---------- DIQ010: Doctor told you have diabetes ----------

def map_diq010(val):
    """Label a DIQ010 (doctor told you have diabetes) code.

    1 -> "Yes", 2 -> "No", 3 -> "Borderline", 7 -> "Refused",
    9 -> "Don't know"; missing input or any other code gives "missing".
    """
    labels = {
        1: "Yes",
        2: "No",
        3: "Borderline",
        7: "Refused",
        9: "Don't know",
    }
    # to_int is only consulted once the value is known not to be missing.
    return "missing" if is_missing(val) else labels.get(to_int(val), "missing")

# Apply only when the column is present in this file.
if "doctor_told_diabetes" in df:
    df["doctor_told_diabetes"] = df["doctor_told_diabetes"].apply(map_diq010)

# ---------- DID040: Age when first told you had diabetes ----------

def map_did040(val):
    """Label a DID040 (age when first told you had diabetes) value.

    Returns the integer age for 1-79, "80 years or older" for 80,
    "Less than 1 year" for 666, "Refused" for 777, "Don't know" for 999,
    and "missing" for missing, unparseable or any other input.
    """
    if is_missing(val):
        return "missing"
    age = to_int(val)
    if age is None:
        return "missing"
    # Special codes all lie outside the 1-79 reported-age range.
    special = {
        80: "80 years or older",
        666: "Less than 1 year",
        777: "Refused",
        999: "Don't know",
    }
    if age in special:
        return special[age]
    return age if 1 <= age <= 79 else "missing"

# Apply only when the column is present in this file.
if "age_first_told_diabetes" in df:
    df["age_first_told_diabetes"] = (
        df["age_first_told_diabetes"].apply(map_did040)
    )

# ---------- DIQ159: CHECK ITEM ----------
# Keep any non-missing value as-is, mark blanks/dots as 'missing'.

def map_check_item(val):
    """Pass a check-item value through as stripped text, or "missing".

    No code-to-label translation happens here: non-missing values are
    returned as ``str(val).strip()``.
    """
    return "missing" if is_missing(val) else str(val).strip()

# Apply only when the column is present in this file.
if "check_item_diq159" in df:
    df["check_item_diq159"] = df["check_item_diq159"].apply(map_check_item)

# ---------- DIQ160: Ever told you have prediabetes ----------

def map_diq160(val):
    """Label a DIQ160 (ever told you have prediabetes) code.

    1 -> "Yes", 2 -> "No", 7 -> "Refused", 9 -> "Don't know";
    missing input or any other code gives "missing".
    """
    if is_missing(val):
        return "missing"
    code = to_int(val)
    if code == 1:
        return "Yes"
    if code == 2:
        return "No"
    if code == 7:
        return "Refused"
    if code == 9:
        return "Don't know"
    return "missing"

# Apply only when the column is present in this file.
if "ever_told_prediabetes" in df:
    df["ever_told_prediabetes"] = df["ever_told_prediabetes"].apply(map_diq160)

# ---------- DIQ180: Had blood tested past three years ----------

def map_diq180(val):
    """Label a DIQ180 (had blood tested in the past three years) code.

    1 -> "Yes", 2 -> "No", 7 -> "Refused", 9 -> "Don't know";
    missing input or any other code gives "missing".
    """
    labels = {1: "Yes", 2: "No", 7: "Refused", 9: "Don't know"}
    if is_missing(val):
        return "missing"
    code = to_int(val)
    return labels[code] if code in labels else "missing"

# Apply only when the column is present in this file.
if "blood_test_past_3yrs" in df:
    df["blood_test_past_3yrs"] = df["blood_test_past_3yrs"].apply(map_diq180)

# ---------- DIQ050: Taking insulin now ----------

def map_diq050(val):
    """Label a DIQ050 (taking insulin now) code.

    1 -> "Yes", 2 -> "No", 7 -> "Refused", 9 -> "Don't know";
    missing input or any other code gives "missing".
    """
    labels = {
        1: "Yes",
        2: "No",
        7: "Refused",
        9: "Don't know",
    }
    # to_int is only consulted once the value is known not to be missing.
    return "missing" if is_missing(val) else labels.get(to_int(val), "missing")

# Apply only when the column is present in this file.
if "taking_insulin_now" in df:
    df["taking_insulin_now"] = df["taking_insulin_now"].apply(map_diq050)

# ---------- DID060: How long taking insulin ----------

def map_did060(val):
    """Label a DID060 (how long taking insulin) value.

    Returns the integer for 1-60, "Less than 1 month" for 666,
    "Refused" for 777, "Don't know" for 999, and "missing" for missing,
    unparseable or any other input.
    """
    if is_missing(val):
        return "missing"
    duration = to_int(val)
    if duration is None:
        return "missing"
    # Special codes all lie outside the 1-60 reported range.
    special = {
        666: "Less than 1 month",
        777: "Refused",
        999: "Don't know",
    }
    if duration in special:
        return special[duration]
    return duration if 1 <= duration <= 60 else "missing"

# Apply only when the column is present in this file.
if "duration_taking_insulin" in df:
    df["duration_taking_insulin"] = (
        df["duration_taking_insulin"].apply(map_did060)
    )

# ---------- DIQ060U: Unit of measure (month/year) ----------

def map_diq060u(val):
    """Label a DIQ060U (unit of the insulin duration) code.

    1 -> "Months", 2 -> "Years"; missing input or any other code
    gives "missing".
    """
    if is_missing(val):
        return "missing"
    unit = to_int(val)
    if unit == 1:
        return "Months"
    if unit == 2:
        return "Years"
    return "missing"

# Apply only when the column is present in this file.
if "duration_insulin_unit" in df:
    df["duration_insulin_unit"] = df["duration_insulin_unit"].apply(map_diq060u)

# ---------- DIQ065: CHECK ITEM ----------

# The DIQ065 check item is passed through as text (or "missing").
if "check_item_diq065" in df:
    df["check_item_diq065"] = df["check_item_diq065"].apply(map_check_item)

# ---------- DIQ070: Take diabetic pills to lower blood sugar ----------

def map_diq070(val):
    """Label a DIQ070 (take diabetic pills to lower blood sugar) code.

    1 -> "Yes", 2 -> "No", 7 -> "Refused", 9 -> "Don't know";
    missing input or any other code gives "missing".
    """
    if is_missing(val):
        return "missing"
    code = to_int(val)
    if code == 1:
        return "Yes"
    if code == 2:
        return "No"
    if code == 7:
        return "Refused"
    if code == 9:
        return "Don't know"
    return "missing"

# Apply only when the column is present in this file.
if "taking_diabetes_pills" in df:
    df["taking_diabetes_pills"] = df["taking_diabetes_pills"].apply(map_diq070)

# ---------- Final sweep: any leftover blanks / '.' -> 'missing' ----------

# Catch-all pass: any cell still blank / '.' / NA-like in any column
# is written out as the literal "missing".
for col in df.columns:
    df[col] = df[col].apply(
        lambda cell: cell if not is_missing(cell) else "missing"
    )

# ---------- Save ----------
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/DIQ_L_meaningful_labeled.csv


In [24]:
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# ---------- Paths ----------
raw_dir = "/content/drive/MyDrive/MSML610 Project/CSV"
out_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
os.makedirs(out_dir, exist_ok=True)

in_csv  = os.path.join(raw_dir, "MCQ_L.csv")
out_csv = os.path.join(out_dir, "MCQ_L_meaningful.csv")

# ---------- Map: NHANES code -> meaningful column name ----------
# Keys are written as they appear in the codebook (some with a lower-case
# suffix); the rename step below matches them case-insensitively.
CODE_TO_NAME = {
    "SEQN":     "respondent_id",

    "MCQ010":   "ever_told_asthma",
    "MCQ035":   "still_have_asthma",
    "MCQ040":   "asthma_attack_past_year",
    "MCQ050":   "asthma_er_visit_past_year",

    "AGQ030":   "hay_fever_episode_past_year",

    "MCQ053":   "anemia_treatment_past_3mo",

    "MCQ145":   "check_item_mcq145",
    "MCQ149":   "menstruation_started",
    "MCQ157":   "check_item_mcq157",

    "MCQ160a":  "ever_told_arthritis",
    "MCQ195":   "arthritis_type",

    "MCQ160b":  "ever_told_congestive_heart_failure",
    "MCQ160c":  "ever_told_coronary_heart_disease",
    "MCQ160d":  "ever_told_angina",
    "MCQ160e":  "ever_told_heart_attack",
    "MCQ160f":  "ever_told_stroke",

    "MCQ160m":  "ever_told_thyroid_problem",
    "MCQ170m":  "still_have_thyroid_problem",

    "MCQ160p":  "ever_told_copd_emphysema_chronic_bronchitis",

    "MCQ160l":  "ever_told_liver_condition",
    "MCQ170l":  "still_have_liver_condition",
    "MCQ500":   "ever_told_liver_condition_mcq500",

    "MCQ510a":  "liver_condition_fatty_liver",
    "MCQ510b":  "liver_condition_liver_fibrosis",
    "MCQ510c":  "liver_condition_liver_cirrhosis",
    "MCQ510d":  "liver_condition_viral_hepatitis",
    "MCQ510e":  "liver_condition_autoimmune_hepatitis",
    "MCQ510f":  "liver_condition_other",

    "MCQ515":   "check_item_mcq515",

    "MCQ550":   "ever_told_gallstones",
    "MCQ560":   "ever_had_gallbladder_surgery",

    "MCQ220":   "ever_told_cancer_malignancy",
    "MCQ230a":  "first_cancer_type",
    "MCQ230b":  "second_cancer_type",
    "MCQ230c":  "third_cancer_type",
    "MCQ230d":  "more_than_three_cancer_types",

    "OSQ230":   "metal_objects_in_body"
}

# ---------- Load, rename, save ----------
df = pd.read_csv(in_csv)

# DataFrame.rename silently skips keys that do not match a header exactly,
# so a header spelled "MCQ160A" would never be renamed by the "MCQ160a" key
# and the later labeling cell would silently skip that column. Match on the
# upper-cased, stripped header instead; exact matches behave as before.
_name_by_code = {code.upper(): name for code, name in CODE_TO_NAME.items()}
_rename_map = {}
for _header in df.columns:
    _key = str(_header).strip().upper()
    if _key in _name_by_code:
        _rename_map[_header] = _name_by_code[_key]

# Surface codes from the map that were not found in the CSV at all.
_unmatched = sorted(set(_name_by_code) - {str(h).strip().upper() for h in df.columns})
if _unmatched:
    print(f"Warning: codes not found in {in_csv}: {_unmatched}")

df = df.rename(columns=_rename_map)

df.to_csv(out_csv, index=False)
print(f"Saved renamed file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved renamed file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/MCQ_L_meaningful.csv


In [25]:
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# ---------- Paths ----------
# Input and output files sit side by side in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, filename)
    for filename in ("MCQ_L_meaningful.csv", "MCQ_L_meaningful_labeled.csv")
)

# Load every column as text so dots and blanks can be handled safely.
df = pd.read_csv(in_csv, dtype=str)

# ---------- Helpers ----------

def is_missing(val):
    """Return True when ``val`` represents a missing cell.

    Missing means: ``None``, or a value whose stripped string form is
    empty, ".", or (case-insensitively) "na" / "nan".
    """
    # None is folded into the empty-string case.
    text = "" if val is None else str(val).strip()
    return text in ("", ".") or text.lower() in ("na", "nan")

def to_int(val):
    """Parse a cell into an int, or return None when that is not possible.

    The value is converted via ``str`` -> ``float`` -> ``int``, so text such
    as "1.0" yields 1 (any fractional part is truncated toward zero).

    Returns None for ``None``, blank text, ".", non-numeric text, NaN and
    +/-infinity. Never raises for these inputs.
    """
    if val is None:
        return None
    s = str(val).strip()
    if s in ("", "."):
        return None
    try:
        return int(float(s))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or NaN.
        # OverflowError: "inf" / "-inf" parse as floats but cannot become
        # ints; without this, one such cell would abort the whole .apply().
        return None

def ynrd(val):
    """Label a generic yes/no code.

    1 -> "Yes", 2 -> "No", 7 -> "Refused", 9 -> "Don't know";
    missing input or any other code gives "missing".
    """
    if is_missing(val):
        return "missing"
    code = to_int(val)
    if code == 1:
        return "Yes"
    if code == 2:
        return "No"
    if code == 7:
        return "Refused"
    if code == 9:
        return "Don't know"
    return "missing"

def check_item(val):
    """Return a non-missing value as stripped text; otherwise "missing"."""
    return "missing" if is_missing(val) else str(val).strip()

# ---------- Asthma & hay fever ----------

# All of these questions share the Yes/No/Refused/Don't know coding.
for col in (
    "ever_told_asthma",
    "still_have_asthma",
    "asthma_attack_past_year",
    "asthma_er_visit_past_year",
    "hay_fever_episode_past_year",
    "anemia_treatment_past_3mo",
):
    if col in df.columns:
        df[col] = df[col].apply(ynrd)

# ---------- Check items MCQ145, MCQ157 ----------

# Check items are passed through as text (or "missing").
for col in ("check_item_mcq145", "check_item_mcq157"):
    if col in df.columns:
        df[col] = df[col].apply(check_item)

# ---------- MCQ149: Menstrual periods started yet? ----------

def map_mcq149(val):
    """Label an MCQ149 (menstrual periods started yet?) code.

    1 -> "Yes", 2 -> "No", 7 -> "Refused", 9 -> "Don't know";
    missing input or any other code gives "missing".
    """
    # MCQ149 uses exactly the coding that ynrd implements, so defer to it.
    return ynrd(val)

# Apply only when the column is present in this file.
if "menstruation_started" in df:
    df["menstruation_started"] = df["menstruation_started"].apply(map_mcq149)

# ---------- Arthritis & CVD group MCQ160x / MCQ195 ----------

# "Ever told you had arthritis" uses the generic yes/no coding.
if "ever_told_arthritis" in df:
    df["ever_told_arthritis"] = df["ever_told_arthritis"].apply(ynrd)

def map_mcq195(val):
    """Label an MCQ195 (arthritis type) code.

    Codes 1-4 become the arthritis type, 7 -> "Refused",
    9 -> "Don't know"; missing input or any other code gives "missing".
    """
    labels = {
        1: "Osteoarthritis or degenerative arthritis",
        2: "Rheumatoid arthritis",
        3: "Psoriatic arthritis",
        4: "Other",
        7: "Refused",
        9: "Don't know",
    }
    # to_int is only consulted once the value is known not to be missing.
    return "missing" if is_missing(val) else labels.get(to_int(val), "missing")

# Apply only when the column is present in this file.
if "arthritis_type" in df:
    df["arthritis_type"] = df["arthritis_type"].apply(map_mcq195)

# Every column below uses the generic yes/no coding, so one pass with ynrd
# covers both the "ever told" group and the "still have" follow-ups.
for col in (
    "ever_told_congestive_heart_failure",
    "ever_told_coronary_heart_disease",
    "ever_told_angina",
    "ever_told_heart_attack",
    "ever_told_stroke",
    "ever_told_thyroid_problem",
    "ever_told_copd_emphysema_chronic_bronchitis",
    "ever_told_liver_condition",
    "ever_told_liver_condition_mcq500",
    "ever_told_gallstones",
    # ---------- Still-have conditions MCQ170m / MCQ170l ----------
    "still_have_thyroid_problem",
    "still_have_liver_condition",
):
    if col in df.columns:
        df[col] = df[col].apply(ynrd)

# ---------- Liver condition types MCQ510a–f ----------

def map_liver_510(val, code, label):
    """Label one MCQ510a-f liver-condition column.

    Args:
        val: raw cell value.
        code: the numeric code that marks this column's condition.
        label: text returned when the parsed value equals ``code``.

    Returns:
        ``label`` on a match, "Refused" for 77, "Don't know" for 99, and
        "missing" for missing input or any other value.
    """
    if is_missing(val):
        return "missing"
    parsed = to_int(val)
    if parsed == code:
        return label
    # Only MCQ510a has explicit 77/99; others effectively presence/absence.
    if parsed == 77:
        return "Refused"
    if parsed == 99:
        return "Don't know"
    return "missing"

# (column, code, label) for each of the six MCQ510 columns, processed in order.
for _liver_col, _liver_code, _liver_label in (
    ("liver_condition_fatty_liver", 1, "Fatty liver"),
    ("liver_condition_liver_fibrosis", 2, "Liver fibrosis"),
    ("liver_condition_liver_cirrhosis", 3, "Liver cirrhosis"),
    ("liver_condition_viral_hepatitis", 4, "Viral hepatitis"),
    ("liver_condition_autoimmune_hepatitis", 5, "Autoimmune hepatitis"),
    ("liver_condition_other", 6, "Other liver disease"),
):
    if _liver_col in df.columns:
        # Default arguments pin this iteration's code and label in the lambda.
        df[_liver_col] = df[_liver_col].apply(
            lambda v, c=_liver_code, l=_liver_label: map_liver_510(v, c, l)
        )

# ---------- MCQ515: CHECK ITEM ----------

# The MCQ515 check item is passed through as text (or "missing").
if "check_item_mcq515" in df:
    df["check_item_mcq515"] = df["check_item_mcq515"].apply(check_item)

# ---------- Gallstones & surgery / Cancer ever ----------

# Both questions use the generic yes/no coding.
for col in ("ever_had_gallbladder_surgery", "ever_told_cancer_malignancy"):
    if col in df.columns:
        df[col] = df[col].apply(ynrd)

# ---------- Cancer type mappings (MCQ230a/b/c) ----------

# Cancer-site labels for the consecutive codes 10..39, in code order.
_CANCER_SITES_FROM_10 = [
    "Bladder",                      # 10
    "Blood",                        # 11
    "Bone",                         # 12
    "Brain",                        # 13
    "Breast",                       # 14
    "Cervix (cervical)",            # 15
    "Colon",                        # 16
    "Esophagus (esophageal)",       # 17
    "Gallbladder",                  # 18
    "Kidney",                       # 19
    "Larynx / windpipe",            # 20
    "Leukemia",                     # 21
    "Liver",                        # 22
    "Lung",                         # 23
    "Lymphoma / Hodgkin's disease", # 24
    "Melanoma",                     # 25
    "Mouth / tongue / lip",         # 26
    "Nervous system",               # 27
    "Ovary (ovarian)",              # 28
    "Pancreas (pancreatic)",        # 29
    "Prostate",                     # 30
    "Rectum (rectal)",              # 31
    "Skin (non-melanoma)",          # 32
    "Skin (unknown type)",          # 33
    "Soft tissue (muscle or fat)",  # 34
    "Stomach",                      # 35
    "Testis (testicular)",          # 36
    "Thyroid",                      # 37
    "Uterus (uterine)",             # 38
    "Other",                        # 39
]

# Code -> label lookup: the site codes first, then the special response codes.
CANCER_TYPE_MAP = dict(enumerate(_CANCER_SITES_FROM_10, start=10))
CANCER_TYPE_MAP.update({
    66: "More than 3 kinds",
    77: "Refused",
    99: "Don't know",
})

def map_cancer_type(val):
    """Label an MCQ230a/b/c cancer-type code via CANCER_TYPE_MAP.

    Missing input or a code absent from the map gives "missing".
    """
    # to_int is only consulted once the value is known not to be missing.
    return "missing" if is_missing(val) else CANCER_TYPE_MAP.get(to_int(val), "missing")

# The first, second and third cancer-type columns share one code list.
for col in ("first_cancer_type", "second_cancer_type", "third_cancer_type"):
    if col in df:
        df[col] = df[col].apply(map_cancer_type)

# ---------- MCQ230d: More than 3 kinds of cancer ----------

def map_mcq230d(val):
    """Label an MCQ230d (more than 3 kinds of cancer) code.

    66 -> "More than 3 kinds", 77 -> "Refused", 99 -> "Don't know";
    missing input or any other code gives "missing".
    """
    if is_missing(val):
        return "missing"
    code = to_int(val)
    if code == 66:
        return "More than 3 kinds"
    if code == 77:
        return "Refused"
    if code == 99:
        return "Don't know"
    return "missing"

# Apply only when the column is present in this file.
if "more_than_three_cancer_types" in df:
    df["more_than_three_cancer_types"] = (
        df["more_than_three_cancer_types"].apply(map_mcq230d)
    )

# ---------- OSQ230: Any metal objects inside your body? ----------

# OSQ230 uses the generic yes/no coding.
if "metal_objects_in_body" in df:
    df["metal_objects_in_body"] = df["metal_objects_in_body"].apply(ynrd)

# ---------- Final sweep: ensure all blanks/dots -> 'missing' ----------

# Any cell still blank / '.' / NA-like in any column becomes "missing".
for col in df.columns:
    df[col] = df[col].apply(
        lambda cell: cell if not is_missing(cell) else "missing"
    )

# ---------- Save ----------
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/MCQ_L_meaningful_labeled.csv


In [26]:
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# ---------- Paths ----------
raw_dir = "/content/drive/MyDrive/MSML610 Project/CSV"
out_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv = os.path.join(raw_dir, "BPQ_L.csv")
out_csv = os.path.join(out_dir, "BPQ_L_meaningful.csv")
os.makedirs(out_dir, exist_ok=True)  # make sure the output folder exists

# ---------- Map: NHANES code -> meaningful column name ----------
CODE_TO_NAME = dict([
    ("SEQN",    "respondent_id"),
    ("BPQ020",  "ever_told_high_blood_pressure"),
    ("BPQ030",  "told_high_bp_2plus_times"),
    ("BPQ150",  "taking_bp_meds"),
    ("BPQ080",  "ever_told_high_cholesterol"),
    ("BPQ101D", "taking_cholesterol_meds"),
])

# ---------- Load, rename, save ----------
# Columns not listed in CODE_TO_NAME keep their original header.
df = pd.read_csv(in_csv).rename(columns=CODE_TO_NAME)

df.to_csv(out_csv, index=False)
print(f"Saved renamed file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved renamed file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/BPQ_L_meaningful.csv


In [27]:
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# ---------- Paths ----------
# Input and output files sit side by side in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, filename)
    for filename in ("BPQ_L_meaningful.csv", "BPQ_L_meaningful_labeled.csv")
)

# ---------- Load data ----------
# Every column is read as text.
df = pd.read_csv(in_csv, dtype=str)

# ---------- Helpers ----------

def is_missing(val):
    """Return True when ``val`` represents a missing cell.

    Missing means: ``None``, or a value whose stripped string form is
    empty, ".", or (case-insensitively) "na" / "nan".
    """
    if val is None:
        return True
    text = str(val).strip()
    if text in ("", "."):
        return True
    return text.lower() in ("na", "nan")

def to_int(val):
    """Parse a cell into an int, or return None when that is not possible.

    The value is converted via ``str`` -> ``float`` -> ``int``, so text such
    as "1.0" yields 1 (any fractional part is truncated toward zero).

    Returns None for ``None``, blank text, ".", non-numeric text, NaN and
    +/-infinity. Never raises for these inputs.
    """
    if val is None:
        return None
    s = str(val).strip()
    if s in ("", "."):
        return None
    try:
        return int(float(s))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or NaN.
        # OverflowError: "inf" / "-inf" parse as floats but cannot become
        # ints; without this, one such cell would abort the whole .apply().
        return None

def ynrd(val):
    """Label a generic yes/no code.

    1 -> "Yes", 2 -> "No", 7 -> "Refused", 9 -> "Don't know";
    missing input or any other code gives "missing".
    """
    if is_missing(val):
        return "missing"
    code = to_int(val)
    if code == 1:
        return "Yes"
    if code == 2:
        return "No"
    if code == 7:
        return "Refused"
    if code == 9:
        return "Don't know"
    return "missing"

# ---------- BPQ020: Ever told you had high blood pressure ----------
# All five BPQ questions use the generic yes/no coding, so they are labeled
# in a single pass, in their original order.
for col in (
    "ever_told_high_blood_pressure",  # BPQ020: Ever told you had high blood pressure
    "told_high_bp_2plus_times",       # BPQ030: Told had high blood pressure - 2+ times
    "taking_bp_meds",                 # BPQ150: Taking high blood pressure medication
    "ever_told_high_cholesterol",     # BPQ080: Doctor told you - high cholesterol level
    "taking_cholesterol_meds",        # BPQ101D: Taking meds to lower blood cholesterol
):
    if col in df.columns:
        df[col] = df[col].apply(ynrd)

# ---------- Final cleanup: replace blanks/dots -> 'missing' ----------
for col in df.columns:
    df[col] = df[col].apply(
        lambda cell: cell if not is_missing(cell) else "missing"
    )

# ---------- Save labeled file ----------
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/BPQ_L_meaningful_labeled.csv


In [28]:
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# ---------- Paths ----------
raw_dir = "/content/drive/MyDrive/MSML610 Project/CSV"
out_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv = os.path.join(raw_dir, "RXQ_RX_L.csv")
out_csv = os.path.join(out_dir, "RXQ_RX_L_meaningful.csv")
os.makedirs(out_dir, exist_ok=True)  # make sure the output folder exists

# ---------- Map: NHANES code -> meaningful column name ----------
CODE_TO_NAME = dict([
    ("SEQN",   "respondent_id"),
    ("RXQ033", "took_prescription_past_month"),
    ("RXQ050", "num_prescription_medicines_taken"),
])

# ---------- Load, rename, save ----------
# Columns not listed in CODE_TO_NAME keep their original header.
df = pd.read_csv(in_csv).rename(columns=CODE_TO_NAME)

df.to_csv(out_csv, index=False)
print(f"Saved renamed file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved renamed file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/RXQ_RX_L_meaningful.csv


In [29]:
import os
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Input and output files sit side by side in the same folder.
base_dir = "/content/drive/MyDrive/MSML610 Project/CSV_meaningful"
in_csv, out_csv = (
    os.path.join(base_dir, filename)
    for filename in ("RXQ_RX_L_meaningful.csv", "RXQ_RX_L_meaningful_labeled.csv")
)

# Every column is read as text.
df = pd.read_csv(in_csv, dtype=str)


def is_missing(val):
    """Return True when ``val`` represents a missing cell.

    Missing means: ``None``, or a value whose stripped string form is
    empty, ".", or (case-insensitively) "na" / "nan".
    """
    # None is folded into the empty-string case.
    text = "" if val is None else str(val).strip()
    return text in ("", ".") or text.lower() in ("na", "nan")

def to_int(val):
    """Parse a cell into an int, or return None when that is not possible.

    The value is converted via ``str`` -> ``float`` -> ``int``, so text such
    as "1.0" yields 1 (any fractional part is truncated toward zero).

    Returns None for ``None``, blank text, ".", non-numeric text, NaN and
    +/-infinity. Never raises for these inputs.
    """
    if val is None:
        return None
    s = str(val).strip()
    if s in ("", "."):
        return None
    try:
        return int(float(s))
    except (ValueError, OverflowError):
        # ValueError: non-numeric text or NaN.
        # OverflowError: "inf" / "-inf" parse as floats but cannot become
        # ints; without this, one such cell would abort the whole .apply().
        return None

def ynrd(val):
    """Label a generic yes/no code.

    1 -> "Yes", 2 -> "No", 7 -> "Refused", 9 -> "Don't know";
    missing input or any other code gives "missing".
    """
    if is_missing(val):
        return "missing"
    code = to_int(val)
    if code == 1:
        return "Yes"
    if code == 2:
        return "No"
    if code == 7:
        return "Refused"
    if code == 9:
        return "Don't know"
    return "missing"


# This column uses the generic yes/no coding.
if "took_prescription_past_month" in df:
    df["took_prescription_past_month"] = (
        df["took_prescription_past_month"].apply(ynrd)
    )


def map_rxq050(val):
    """Label the number-of-prescription-medicines code.

    1-4 become "<n> medicine(s)", 5 -> "5 or more medicines",
    7 -> "Refused", 9 -> "Don't know"; missing input or any other code
    gives "missing".
    """
    labels = {
        1: "1 medicine",
        2: "2 medicines",
        3: "3 medicines",
        4: "4 medicines",
        5: "5 or more medicines",
        7: "Refused",
        9: "Don't know",
    }
    # to_int is only consulted once the value is known not to be missing.
    return "missing" if is_missing(val) else labels.get(to_int(val), "missing")

# Apply only when the column is present in this file.
if "num_prescription_medicines_taken" in df:
    df["num_prescription_medicines_taken"] = (
        df["num_prescription_medicines_taken"].apply(map_rxq050)
    )

# Catch-all pass: any cell still blank / '.' / NA-like in any column
# is written out as the literal "missing".
for col in df.columns:
    df[col] = df[col].apply(
        lambda cell: cell if not is_missing(cell) else "missing"
    )

# Write the labeled table without the index column.
df.to_csv(out_csv, index=False)
print(f"Saved labeled file to: {out_csv}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved labeled file to: /content/drive/MyDrive/MSML610 Project/CSV_meaningful/RXQ_RX_L_meaningful_labeled.csv
