In [1]:
import pickle
from collections import defaultdict
import os
import subprocess
import yaml
from yaml import CLoader as Loader
import requests
import pandas as pd
import plotly.express as px
from plotly.offline import plot
from plotly.subplots import make_subplots
from sklearn.utils import shuffle
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

(!) Нужно создать .yaml файлы с нужными годами и признаками в папке YAMLs в той же директории, где лежит этот скрипт

In [2]:
# Эта ячейка из .yaml в YAMLs, скачивает сырые .xpt в NhanesRawData и выдает .pkl в PKLs
# В PKLs только нужные признаки/параметры, но не отфильтровываются NA

def yamls2pkls(dir="YAMLs"):
    """Convert every YAML spec found in ``dir`` into a pickled DataFrame.

    Only regular files ending in ``.yaml``/``.yml`` are processed, so
    sub-directories and stray files in the folder are skipped instead of
    being handed to the YAML loader. The pickle name is the first five
    characters of the YAML file name.

    Args:
        dir: Directory that holds the YAML specs.
    """
    for yaml_file in sorted(os.listdir(dir)):
        is_yaml = yaml_file.lower().endswith((".yaml", ".yml"))
        if not is_yaml or not os.path.isfile(os.path.join(dir, yaml_file)):
            continue
        nhanes_from_yaml(yaml_file, yaml_file[0:5], yaml_dir=dir)


def _download_xpt(period, file_name, dest_path, timeout=60):
    """Download one NHANES XPT file to ``dest_path``; raise on an HTTP error status."""
    url = f"https://wwwn.cdc.gov/Nchs/Nhanes/{period}/{file_name}.XPT"
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    # Check the status before writing: otherwise an error page would be
    # cached on disk as if it were the .xpt file and never re-downloaded.
    response.raise_for_status()
    with open(dest_path, 'wb') as f:
        f.write(response.content)


def nhanes_from_yaml(yaml_file, name: str, yaml_dir="YAMLs"):
    """Build one pickled DataFrame from one YAML spec (one period per YAML).

    Each YAML document maps ``period -> file_name -> {"params": ..., "values": ...}``.
    For every listed file the raw XPT is downloaded into ``NhanesRawData``
    (unless already on disk), reduced to the columns named in ``params``,
    renamed, optionally value-mapped through ``values``, and inner-merged
    with the other files of the same period. The frame of the last period
    processed is written to ``PKLs/<name>.pkl``.

    Args:
        yaml_file: File name of the spec inside ``yaml_dir``.
        name: Base name of the output pickle (the NHANES period, e.g. "17-18").
        yaml_dir: Directory containing ``yaml_file``.

    Raises:
        requests.HTTPError: if a download returns an HTTP error status.

    If the YAML does not have the expected mapping structure, a message is
    printed and no pickle is written.
    """
    os.makedirs("NhanesRawData", exist_ok=True)
    os.makedirs("PKLs", exist_ok=True)

    df = pd.DataFrame()
    merged_by_period = {}
    try:
        with open(os.path.join(yaml_dir, yaml_file), "r") as stream:
            # NOTE(review): Loader is yaml.CLoader, which can build arbitrary
            # Python objects from tagged YAML. Only load specs you trust.
            for doc in yaml.load_all(stream, Loader):
                for period, files in doc.items():
                    for file_name, params in files.items():
                        xpt_path = f"NhanesRawData/{file_name}.xpt"

                        # download nhanes file if not on disk
                        if not os.path.isfile(xpt_path):
                            _download_xpt(period, file_name, xpt_path)

                        # read dataframe from file, filter and rename
                        new_data = pd.read_sas(xpt_path, format='xport')
                        new_data = new_data[list(params["params"].keys())]
                        new_data = new_data.rename(params["params"], axis=1)
                        if params.get("values"):
                            new_data = new_data.replace(params["values"])

                        # The first file of a period is taken as-is; later
                        # files are inner-merged on their shared columns.
                        if period in merged_by_period:
                            new_data = merged_by_period[period].merge(new_data, how='inner')
                        merged_by_period[period] = new_data
                        df = new_data

    except AttributeError:
        print("Oops! Wrong format of yaml")
        # Do not write an empty or half-built frame for a malformed spec.
        return

    with open(f"PKLs/{name}.pkl", 'wb') as f:
        pickle.dump(df, f)

# Run the conversion for the default "YAMLs" directory (writes PKLs/<name>.pkl per spec).
yamls2pkls()

In [3]:
# Эта ячейка чистит .pkl 2019-2020 года от записей 2017-2018 года

def nhanes_save(df, name):
    """Pickle ``df`` into ``PKLs/<name>.pkl``, overwriting an existing file."""
    target = f"PKLs/{name}.pkl"
    with open(target, 'wb') as sink:
        sink.write(pickle.dumps(df))

def nhanes_open(period):
    """Load and return the object pickled in ``PKLs/<period>.pkl``.

    Raises:
        Exception: if that pickle file does not exist.
    """
    path = f"PKLs/{period}.pkl"
    if not os.path.isfile(path):
        raise Exception("Sorry, no file")
    # NOTE: pickle.load executes code embedded in the file; only open trusted pickles.
    with open(path, 'rb') as source:
        return pickle.load(source)

# Remove from the 19-20 frame every row that also appears in 17-18.
nhanes17 = nhanes_open("17-18")
nhanes19 = nhanes_open("19-20")

# 19-20 goes first, so its rows occupy the first len(nhanes19) positions.
nhanes = pd.concat([nhanes19, nhanes17])
# Compare on every column except the first one (the id column).
subset = list(nhanes.columns)[1::]
is_shared = nhanes.duplicated(subset=subset, keep=False).to_numpy()

# Keep only rows that come from 19-20. Without this, unmatched 17-18 rows
# stay in the result, and re-running the cell after 19-20.pkl was overwritten
# would add the whole 17-18 cohort to it.
n_rows_19 = len(nhanes19)
nhanes19_clean = nhanes.iloc[:n_rows_19][~is_shared[:n_rows_19]]
nhanes_save(nhanes19_clean, "19-20")
nhanes19_clean

Unnamed: 0,id,sex,age,race,alt,ast,creatinine,bilirubin,uric_acid,hematocrit,...,eosinophils_perc,basophils_num,basophils_perc,segmented_num,segmented_perc,hemoglobin,red_cell_cnt,rdw,mean_hemoglobin_conc,mean_hemoglobin_pg
1,109266.0,Female,29.0,Non-Hispanic Asian,15.0,14.0,55.69,8.55,291.5,36.5,...,0.8,5.397605e-79,0.5,4.5,58.3,12.3,4.35,14.0,33.6,28.1
11,109290.0,Female,68.0,Non-Hispanic Black,19.0,18.0,61.00,15.39,249.8,44.2,...,1.4,1.000000e-01,0.8,4.7,50.1,14.2,5.33,15.2,32.0,26.5
13,109292.0,Male,58.0,Other Hispanic,21.0,18.0,83.98,6.84,452.0,41.1,...,5.1,1.000000e-01,1.3,5.8,60.8,13.8,4.39,13.2,33.5,31.4
19,109303.0,Male,18.0,Non-Hispanic Asian,13.0,17.0,65.42,6.84,297.4,44.6,...,4.2,1.000000e-01,0.6,4.9,47.5,15.3,5.38,13.6,34.4,28.5
20,109305.0,Male,55.0,Non-Hispanic Asian,,,,,,46.0,...,1.9,5.397605e-79,1.3,2.1,57.9,16.0,5.27,12.8,34.7,30.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10397,124807.0,Male,44.0,Non-Hispanic Asian,16.0,16.0,79.56,8.55,368.8,44.8,...,0.6,5.397605e-79,0.3,4.0,67.2,14.5,4.79,13.6,32.5,30.3
10399,124810.0,Female,56.0,Non-Hispanic Black,24.0,21.0,60.11,8.55,315.2,43.6,...,0.7,5.397605e-79,0.5,6.4,74.6,14.4,5.14,14.0,33.1,28.1
10402,124813.0,Female,43.0,Non-Hispanic Black,13.0,14.0,59.23,6.84,273.6,38.5,...,2.0,5.397605e-79,0.9,3.0,58.0,12.6,4.77,14.3,32.8,26.4
10403,124814.0,Male,64.0,Non-Hispanic Black,14.0,16.0,86.63,5.13,321.2,33.7,...,5.6,2.000000e-01,2.2,4.7,57.0,10.4,4.24,19.7,30.7,24.5


In [4]:
# Эта ячейка открывает все .pkl из PKLs и возвращает собранный df (dataframe)

def nhanes_concat(dir="PKLs"):
    """Load every ``.pkl`` in ``dir`` and stack them into one DataFrame.

    Each frame gets a ``period`` column holding the integer parsed from the
    first two characters of its file name. Files are read in sorted name
    order, and entries that are not regular ``.pkl`` files are skipped.
    Only columns present in every frame are kept (``join="inner"``), and
    the result gets a fresh 0..n-1 index.

    Args:
        dir: Directory containing the pickled per-period frames.

    Returns:
        The concatenated DataFrame.

    Raises:
        ValueError: if ``dir`` contains no ``.pkl`` file, or a file name
            does not start with two digits.
    """
    nhanes_list = []
    for file in sorted(os.listdir(dir)):
        path = os.path.join(dir, file)
        if not (file.endswith(".pkl") and os.path.isfile(path)):
            continue
        # NOTE: pickle.load executes code embedded in the file; only load trusted pickles.
        with open(path, 'rb') as f:
            df = pickle.load(f)
        df["period"] = int(file[0:2])
        nhanes_list.append(df)

    if not nhanes_list:
        raise ValueError(f"No .pkl files found in {dir!r}")
    return pd.concat(nhanes_list, join="inner", ignore_index=True)

# Combine all per-period pickles from the default "PKLs" directory and display the result.
nhanes = nhanes_concat()
nhanes

Unnamed: 0,id,sex,age,race,alt,ast,creatinine,bilirubin,uric_acid,hematocrit,...,basophils_num,basophils_perc,segmented_num,segmented_perc,hemoglobin,red_cell_cnt,rdw,mean_hemoglobin_conc,mean_hemoglobin_pg,period
0,73557.0,Male,69.0,Non-Hispanic Black,16.0,16.0,106.96,13.68,196.3,45.4,...,1.000000e-01,1.2,2.0,42.3,15.2,5.09,14.0,33.4,29.9,13
1,73558.0,Male,54.0,Non-Hispanic White,29.0,18.0,69.84,15.39,279.6,36.7,...,1.000000e-01,0.6,7.4,58.4,11.9,3.84,13.4,32.5,31.0,13
2,73559.0,Male,72.0,Non-Hispanic White,16.0,22.0,107.85,10.26,339.0,49.9,...,1.000000e-01,0.9,4.9,68.2,17.2,5.53,13.4,34.3,31.1,13
3,73561.0,Female,73.0,Non-Hispanic White,28.0,36.0,64.53,8.55,249.8,43.8,...,1.000000e-01,1.4,4.5,68.7,14.5,4.72,12.3,33.0,30.6,13
4,73562.0,Male,56.0,Mexican American,16.0,24.0,78.68,8.55,541.3,41.5,...,1.000000e-01,0.9,6.5,69.2,14.2,4.93,13.5,34.2,28.8,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23934,124807.0,Male,44.0,Non-Hispanic Asian,16.0,16.0,79.56,8.55,368.8,44.8,...,5.397605e-79,0.3,4.0,67.2,14.5,4.79,13.6,32.5,30.3,19
23935,124810.0,Female,56.0,Non-Hispanic Black,24.0,21.0,60.11,8.55,315.2,43.6,...,5.397605e-79,0.5,6.4,74.6,14.4,5.14,14.0,33.1,28.1,19
23936,124813.0,Female,43.0,Non-Hispanic Black,13.0,14.0,59.23,6.84,273.6,38.5,...,5.397605e-79,0.9,3.0,58.0,12.6,4.77,14.3,32.8,26.4,19
23937,124814.0,Male,64.0,Non-Hispanic Black,14.0,16.0,86.63,5.13,321.2,33.7,...,2.000000e-01,2.2,4.7,57.0,10.4,4.24,19.7,30.7,24.5,19


In [5]:
# Эта ячейка строит PCA и heatmap для mean и median, сохраняет и открывает в браузере

def get_pca_components(nhanes):
    """Standardize the feature columns and project them onto two principal components.

    Rows with any missing value are dropped first. Feature columns are
    ``columns[4:-1]``: the first four columns and the last one (expected to
    be ``period``) are excluded.

    Both returned frames keep the index of the NA-filtered input. This
    matters because ``period`` is attached by index: with a fresh 0..n-1
    index the labels would not match the filtered rows, which yields wrong
    or NaN periods.

    Args:
        nhanes: DataFrame that contains a ``period`` column.

    Returns:
        (components, X): ``components`` has columns ``0``, ``1`` and
        ``period``; ``X`` holds the standardized features under integer
        column labels.
    """
    nhanes = nhanes.dropna()
    features = list(nhanes.columns)[4:-1]  # feature columns, without the period
    X = StandardScaler().fit_transform(nhanes[features])
    components = PCA(n_components=2).fit_transform(X)

    components = pd.DataFrame(components, index=nhanes.index)
    components["period"] = nhanes["period"]  # same index, so rows line up
    X = pd.DataFrame(X, index=nhanes.index)
    return components, X

def save_and_open_fig(figures, file="out.html", append=False):
    """Write figures into one HTML file under ``Output_figs`` and open it.

    Args:
        figures: Iterable of objects exposing plotly's ``to_html`` method.
        file: Name of the HTML file inside ``Output_figs``.
        append: If True, add to an existing file. By default the file is
            overwritten, so re-running the notebook does not pile up
            duplicate copies of the same figures.

    Raises:
        Exception: if no available opener could be launched.
    """
    os.makedirs("Output_figs", exist_ok=True)
    path = f"Output_figs/{file}"

    with open(path, 'a' if append else 'w', encoding="utf-8") as f:
        for fig in figures:
            f.write(fig.to_html(full_html=False, include_plotlyjs='cdn'))

    try:
        os.startfile(path)  # only exists on Windows
        return
    except AttributeError:
        pass

    # Elsewhere, try the macOS opener first, then the freedesktop one.
    for opener in ("open", "xdg-open"):
        try:
            subprocess.call([opener, path])
            return
        except OSError:
            continue
    raise Exception('Could not open file')
    
def _period_scatter(components, agg, title):
    """Aggregate the two PCA components per period and scatter-plot them."""
    period_df = components.groupby(["period"]).agg(agg)
    period_df.index = period_df.index.map(int).map(str)
    fig = px.scatter(period_df, x=0, y=1, color=period_df.index, title=title)
    return period_df, fig


def _period_heatmap(scaled, agg, title):
    """Aggregate standardized features per period and draw them as a heatmap."""
    per_period = scaled.groupby('period').agg(agg)
    fig = px.imshow(per_period, x=per_period.columns, y=per_period.index, title=title)
    # Label each row "YY-YY", derived from the periods actually present.
    periods = [int(p) for p in per_period.index]
    fig.update_layout(
        yaxis=dict(
            tickmode='array',
            tickvals=periods,
            ticktext=[f"{p:02d}-{(p + 1) % 100:02d}" for p in periods],
        )
    )
    return per_period, fig


components, X = get_pca_components(nhanes)

mean_period_df, fig1 = _period_scatter(components, "mean", "Means of pca by period")
median_period_df, fig2 = _period_scatter(components, "median", "Medians of pca by period")

# Restore the feature names on the standardized matrix.
features = list(nhanes.columns)[4:-1]
X = X.rename(columns=dict(enumerate(features)))
# X has one row per NA-free row of nhanes, in the same order. Attach the
# period by position, so the result does not depend on X's index labels.
X['period'] = nhanes.dropna()['period'].to_numpy()

mean_features, fig3 = _period_heatmap(X, "mean", "Means heatmap")
median_features, fig4 = _period_heatmap(X, "median", "Medians heatmap")

figures = [fig1, fig2, fig3, fig4]
save_and_open_fig(figures, file="nhanes.html")