In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# Project layout: the kernel's current working directory is taken as the code
# directory, and input/output/tmp are siblings of it under the project root.
code_dir = Path.cwd()
project_dir = code_dir.parent
input_dir, output_dir, tmp_dir = (
    project_dir / folder_name for folder_name in ("input", "output", "tmp")
)

In [3]:
# Columns kept from the UKB phenotype table, grouped by theme.
variables_ukb = [
    # diagnosis and baseline descriptors
    "diagnosis_metabolic_syndrome",
    "base_age",
    "base_sex",
    "base_isced_aggregated",
    # anthropometric measures
    "rf_waist_circumference",
    "rf_hip_circumference",
    "rf_waist_hip_ratio",
    "rf_bmi",
    # blood pressure
    "rf_systolic_bp",
    "rf_diastolic_bp",
    "medication_antihypertensive",
    # lipids
    "blood_cholesterol",
    "blood_hdl",
    "blood_ldl",
    "blood_triglycerides",
    "medication_lipid_lowering",
    # glucose
    "blood_glucose",
    "medication_antidiabetic",
    # imaging
    "imaging_cortical_thickness_mean",
]

# The subject list file contributes only its index (first CSV column).
ukb_subjects_df = pd.read_csv(input_dir / "subject_list_ukb.csv", index_col=0)
subject_list_ukb = list(ukb_subjects_df.index)

# Load the phenotype table, restrict it to the listed subjects and columns,
# then drop rows that are exact duplicates of an earlier row.
# NOTE(review): drop_duplicates compares the selected column values only, not
# the index -- confirm this is intended rather than de-duplicating subject IDs.
y_ukb_df = (
    pd.read_csv(input_dir / "phenotypical_data_ukb.csv", index_col=0)
    .loc[subject_list_ukb, variables_ukb]
    .drop_duplicates()
)

In [5]:
# Explicit HCHS -> harmonised (UKB) column-name mapping.
# The previous version renamed columns by position (`columns = y_ukb_df.columns`)
# while the lipid columns were listed in a different order than in the UKB list,
# so HDL/LDL/triglycerides/cholesterol ended up under the wrong labels.
# Renaming by name makes the result independent of list order.
# NOTE(review): the HCHS lipid columns carry an `_mg_dl` suffix -- confirm the
# units match the corresponding UKB columns before pooling the cohorts.
hchs_to_ukb_names = {
    "diagnosis_metabolic_syndrome": "diagnosis_metabolic_syndrome",
    "age": "base_age",
    "sex": "base_sex",
    "base_education_isced": "base_isced_aggregated",
    "body_waist_circumference": "rf_waist_circumference",
    "body_hip_circumference": "rf_hip_circumference",
    "body_waist_hip_ratio": "rf_waist_hip_ratio",
    "body_bmi": "rf_bmi",
    "cvrisk_systolic_blood_pressure_mmhg": "rf_systolic_bp",
    "cvrisk_diastolic_blood_pressure_mmhg": "rf_diastolic_bp",
    "therapy_antihypertensives": "medication_antihypertensive",
    "blood_cholesterol_mg_dl": "blood_cholesterol",
    "blood_hdl_mg_dl": "blood_hdl",
    "blood_ldl_mg_dl": "blood_ldl",
    "blood_triglycerides_mg_dl": "blood_triglycerides",
    "therapy_lipid_lowering": "medication_lipid_lowering",
    "blood_glucose": "blood_glucose",
    "therapy_antidiabetic": "medication_antidiabetic",
    "mri_cortical_thickness_mean": "imaging_cortical_thickness_mean",
}

# HCHS column names to select (same set as before, now in harmonised order).
variables_hchs = list(hchs_to_ukb_names)

# The subject list file provides the index and the demographic columns.
y_hchs_df = pd.read_csv(input_dir/"subject_list_hchs.csv", index_col=0)
subject_list_hchs = y_hchs_df.index
y_hchs_df2 = pd.read_csv(input_dir/"y_hchs.csv", index_col=0)
y_hchs_df3 = pd.read_csv(input_dir/"phenotypical_data_hchs.csv", index_col=0)

# Left-join the two phenotype tables onto the demographics. Overlapping column
# names from the right-hand tables get a "_2" / "_3" suffix, so the unsuffixed
# name always refers to the first table that provided it.
y_hchs_df = (
    y_hchs_df[["age", "sex", "base_education_isced"]]
    .join(y_hchs_df2, rsuffix="_2")
    .join(y_hchs_df3, rsuffix="_3")
)

# Select the HCHS variables, rename them by name, and put the columns in the
# same order as the UKB frame. A column missing on either side raises KeyError
# instead of being silently mislabelled.
y_hchs_df = y_hchs_df.loc[subject_list_hchs, variables_hchs].rename(columns=hchs_to_ukb_names)
y_hchs_df = y_hchs_df.loc[:, list(y_ukb_df.columns)]


In [6]:
# Pool both cohorts by stacking the UKB rows on top of the HCHS rows;
# the original index labels of each frame are kept.
statistics_df = pd.concat(objs=(y_ukb_df, y_hchs_df), axis=0)

In [7]:
# All variables reported in the descriptive statistics, in table order.
variables = [
    "diagnosis_metabolic_syndrome",
    "base_age",
    "base_sex",
    "base_isced_aggregated",
    "rf_waist_circumference",
    "rf_hip_circumference",
    "rf_waist_hip_ratio",
    "rf_bmi",
    "rf_systolic_bp",
    "rf_diastolic_bp",
    "medication_antihypertensive",
    "blood_cholesterol",
    "blood_hdl",
    "blood_ldl",
    "blood_triglycerides",
    "medication_lipid_lowering",
    "blood_glucose",
    "medication_antidiabetic",
    "imaging_cortical_thickness_mean",
]

# Variables handled as categorical.
variables_categorical = [
    "diagnosis_metabolic_syndrome",
    "base_sex",
    "medication_antihypertensive",
    "medication_lipid_lowering",
    "medication_antidiabetic",
]

# Everything not listed as categorical is treated as continuous; the order of
# `variables` is preserved.
variables_continuous = list(
    filter(lambda name: name not in variables_categorical, variables)
)

In [8]:
# Names of all pooled-frame columns that contain the substring "cognition".
cognition_variables = list(
    filter(lambda column_name: "cognition" in column_name, statistics_df.columns)
)

In [9]:
# Restrict the pooled frame to the reported variables. The explicit copy keeps
# the column assignment below from writing into a slice of the previous frame
# (which can trigger pandas' SettingWithCopyWarning).
statistics_df = statistics_df[variables].copy()

# Integer-coded sex, derived before base_sex is re-typed as object below.
# astype(int) raises if base_sex contains missing or non-numeric values.
statistics_df["base_sex_numeric"] = statistics_df["base_sex"].astype(int)

# Target dtype per variable: categorical -> object, continuous -> float64.
column_dtypes = {}
for col in variables:
    if col in variables_categorical:
        column_dtypes[col] = "object"
    elif col in variables_continuous:
        column_dtypes[col] = "float64"

# Cast all listed columns in one call; base_sex_numeric keeps its int dtype.
statistics_df = statistics_df.astype(column_dtypes)

# Descriptive stats tables

In [112]:
from python_functions.descriptive_statistics import stats_1group, finalize_stats_1group # https://csi-hamburg.github.io/python_functions/descriptive_statistics.html

In [None]:
# Run the project's stats_1group helper on the pooled frame for the reported
# variables, then show the result as the cell output.
stats = stats_1group(statistics_df, variables)
stats

In [None]:
# Pass the raw table through the project's finalize_stats_1group helper and
# show the finalized table as the cell output.
stats_styled = finalize_stats_1group(stats)
stats_styled

In [14]:
# Create the export folder if needed (including missing parents), then write
# the finalized table to CSV inside it.
descriptive_dir = output_dir / "descriptive_statistics"
descriptive_dir.mkdir(parents=True, exist_ok=True)
stats_styled.to_csv(descriptive_dir / "descriptive_statistics.csv")