This notebook pulls together the essential columns from the raw datasets and saves them to a new CSV so they are easier to import later.

In [1]:
# Import all our necessary libraries

import pandas as pd
import numpy as np


In [8]:
# Read the raw tables from CSV files in the current working directory, one
# frame per file. Picking out the relevant columns happens in the next cell.
# The name `questionnare` (sic) is kept because the later cells refer to it.
# medications.csv is currently not loaded:
# medications = pd.read_csv("medications.csv")
demographics, diet, examination, labs, questionnare = map(
    pd.read_csv,
    (
        "demographic.csv",
        "diet.csv",
        "examination.csv",
        "labs.csv",
        "questionnaire.csv",
    ),
)

In [9]:
# The choice of columns is guided by the findings in
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8487286/

# Demographics: this mapping is both the list of source columns to keep (its
# keys, in order) and the readable names they are given (its values).
demographics_columns = {
    "SEQN": "id",
    "RIAGENDR": "gender",
    "RIDAGEYR": "age_years",
    "RIDRETH3": "race",
    "DMDEDUC2": "highest_education_level",
    "DMDMARTL": "marital_status",
    "INDHHIN2": "household_income",
}
# Select first, then rename; rename() returns a new frame, so the raw
# `demographics` frame is left untouched.
demographics_new = demographics[list(demographics_columns)].rename(
    columns=demographics_columns
)

# Diet: only DRQSDIET is kept for now. DRQSDT1-12 might be worth pulling in
# later if diet ends up being helpful and more specificity is wanted.
diet_columns = {"SEQN": "id", "DRQSDIET": "diet_is_on"}
diet_new = diet[list(diet_columns)].rename(columns=diet_columns)

# Examination: keep the id plus the weight and height measurements.
# TODO: probably pull more stuff from examinations
examination_columns = {"SEQN": "id", "BMXWT": "weight_kg", "BMXHT": "height_cm"}
examination_new = examination[list(examination_columns)].rename(
    columns=examination_columns
)

# Questionnaire: a single ordered mapping of source column -> readable name.
# Its keys are the columns selected (in this order) and its values are the
# names they receive, so the selection and the rename cannot drift apart.
#
# Entries that are commented out are NOT part of the output:
#   * BPQ050A / BPQ090D / BPQ100D were present in the rename map but were never
#     selected from the raw frame, so rename() silently ignored them.
#   * The DUQ* drug-use columns were already disabled.
# Uncomment an entry to actually add that column to the output.
#
# NOTE(review): PAD660 / PAD675 may be the *recreational* activity minutes in
# the NHANES codebook - TODO confirm that the "time_per_day" names fit.
questionnaire_columns = {
    "SEQN": "id",
    "CDQ001": "chest_pain",
    "BPQ020": "high_blood_pressure_diagnosed",
    "BPQ040A": "high_blood_pressure_prescribed_medicine",
    # "BPQ050A": "high_blood_pressure_taking_medicine",
    "BPQ080": "high_blood_cholesterol",
    # "BPQ090D": "high_blood_cholesterol_prescribed_medicine",
    # "BPQ100D": "high_blood_cholesterol_taking_medicine",
    "MCQ080": "overweight_diagnosed",
    "MCQ160C": "coronary_heart_disease",
    # Target column for the later analysis, hence the LABEL_ prefix.
    "MCQ160F": "LABEL_had_stroke",
    "PAD660": "physical_activity_vigorous_time_per_day",
    "PAD675": "physical_activity_moderate_time_per_day",
    "PAQ610": "physical_activity_vigorous_days_per_week",
    "PAQ625": "physical_activity_moderate_days_per_week",
    "PAQ655": "physical_activity_recreational_vigorous_days_per_week",
    "PAQ670": "physical_activity_recreational_moderate_days_per_week",
    "DIQ010": "diabetes_diagnosed",
    "DIQ160": "diabetes_high_blood_sugar_but_not_diabetes",
    "SMD480": "smoking_people_in_household_days_per_week",
    "SMQ040": "smoking_currently_smoking",
    "SMQ020": "smoking_smoked_at_least_100_cigarettes",
    "SMQ621": "smoking_total_number_cigarettes_smoked",
    "IND235": "income_family_monthly",
    # "DUQ250": "drug_use_cocaine",
    # "DUQ280": "drug_use_cocaine_last_30_days",
    # "DUQ290": "drug_use_heroin",
    # "DUQ320": "drug_use_heroin_last_30_days",
    # "DUQ330": "drug_use_meth",
    # "DUQ360": "drug_use_meth_last_30_days",
    "ALQ101": "alcohol_use_more_than_12_drinks_per_year",
    "ALQ120Q": "alcohol_use_how_often",
}
# Select first, then rename; rename() returns a new frame, so the raw
# `questionnare` frame is left untouched.
questionnare_new = questionnare[list(questionnaire_columns)].rename(
    columns=questionnaire_columns
)

In [12]:
# Join the per-table selections into one frame keyed on "id" and save it.
# merge() defaults to an inner join, so only ids present in all four tables
# survive. validate="one_to_one" makes pandas raise a MergeError if "id" is
# duplicated on either side of a join, instead of silently multiplying rows.
final_dataframe = (
    questionnare_new
    .merge(demographics_new, on="id", validate="one_to_one")
    .merge(examination_new, on="id", validate="one_to_one")
    .merge(diet_new, on="id", validate="one_to_one")
)

# Quick sanity check of row count, columns and null counts before saving.
final_dataframe.info()
# NOTE: to_csv() writes the row index as an unnamed first column by default.
# Left unchanged so existing readers of final_data.csv see the same layout;
# pass index=False here if that extra column is not wanted.
final_dataframe.to_csv("../final_data.csv")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9813 entries, 0 to 9812
Data columns (total 32 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   id                                                     9813 non-null   int64  
 1   chest_pain                                             3708 non-null   float64
 2   high_blood_pressure_diagnosed                          6266 non-null   float64
 3   high_blood_pressure_prescribed_medicine                2118 non-null   float64
 4   high_blood_cholesterol                                 6266 non-null   float64
 5   overweight_diagnosed                                   6266 non-null   float64
 6   coronary_heart_disease                                 5588 non-null   float64
 7   LABEL_had_stroke                                       5588 non-null   float64
 8   physical_activity_vigorous_time_per_day         