## Data Inspection
- Goal: understand quantity and sparsity of data across patients

In [33]:
import pandas as pd
import etl

PATH = etl.TRAIN_PATH

In [20]:
from importlib import reload
reload(etl)

<module 'etl' from '/home/epan/Documents/Github/c19-Hospitalization-Likelihood/src/etl.py'>

In [34]:
# Summary _does not_ include 'Parsed' values
summary_df = etl.generate_concept_summary(PATH)
summary_df

Unnamed: 0,concept_id,unique_pid_count,avg_per_pid,concept_name,from_table
0,44818702,1251,144.921663,,
1,3028553,1246,13.002408,,
2,37208405,1244,14.748392,History of alcohol use,observation
3,3035995,1243,8.670153,Alkaline phosphatase [Enzymatic activity/volum...,measurement
4,3000905,1240,9.941129,Leukocytes [#/volume] in Blood by Automated count,measurement
...,...,...,...,...,...
1506,2765743,1,1.000000,,
1507,2002747,1,1.000000,Other partial resection of small intestine,procedure_occurrence
1508,2765672,1,1.000000,,
1509,2003287,1,1.000000,Endoscopic sphincterotomy and papillotomy,procedure_occurrence


In [35]:
# This Concept-Feature map _does_ include 'Parsed' values
cf_map, corr_series = etl.get_highest_corr_concept_feature_id_map_and_corr_series(PATH)


In [36]:
concept_to_correlation_df = pd.DataFrame(corr_series)
concept_to_correlation_df.insert(1, 'abs_pearson_corr', abs(corr_series))
concept_to_correlation_df = concept_to_correlation_df.reset_index().rename(columns={'index':'concept_id', 'status': 'pearson_corr'})
concept_to_correlation_df.sort_values('abs_pearson_corr', ascending=False)
concept_to_correlation_df

Unnamed: 0,concept_id,pearson_corr,abs_pearson_corr
0,380378,-0.009100,0.009100
1,75909,0.043308,0.043308
2,438409,0.029498,0.029498
3,435875,-0.037932,0.037932
4,80502,0.002094,0.002094
...,...,...,...
2244,3029187111,0.000000,0.000000
2245,3004254111,0.000000,0.000000
2246,3005755111,0.000000,0.000000
2247,4075831011,0.000000,0.000000


## Clustering Analysis - Finding Highest-Separation Feature Combinations
- Goal: find set of features that result in best PCA Clustering
- Pipeline: pick set of features (profile) -> run PCA -> manually view data -> run K-Means/GMM to cluster -> score clusters: use this as predictive measurement

In [38]:
feature_df = etl.create_feature_df(cf_map, path=PATH)
feature_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2239,2240,2241,2242,2243,2244,2245,2246,2247,2248
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
