# Importing Packages

In [2]:
import json
import os

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# import biom
# from qiime2.plugins import feature_table
# from qiime2 import Artifact
# from qiime2.plugins.metadata.methods import distance_matrix


## Importing Data

In [3]:
## Obtaining file paths
with open("config/data-params.json") as fh:
    file_paths = json.load(fh)

In [4]:
feature_table = pd.read_csv('~/private/dsc180b-data/feature_table.csv')
metadata = pd.read_csv('~/private/dsc180b-data/11666_metadata.txt', sep='\t', index_col=0)

  metadata = pd.read_csv('~/private/dsc180b-data/11666_metadata.txt', sep='\t', index_col=0)


# Subset of Metadata

In [97]:
diseases = {
    'abdominal_obesity_ncep_v2': 'obesity',
    'diabetes2_v2': 'diabetes',
    'dyslipidemia_v2': 'dyslipidemia',
    'hypertension2_v2': 'hypertension',
    'ckd2': 'ckd',
    'precvd': 'precvd',
    'elevated_bp_selfmeds_v2': 'elevated_bp',
}
           
other_cols = {
    'age_v2': 'age',
    'center': 'center',
    'gender': 'gender',
    'host_body_mass_index': 'BMI'
}

subset_cols = diseases | other_cols

sub_metadata = metadata[subset_cols.keys()].rename(columns=subset_cols).dropna() 
disease_metadata = metadata[diseases.keys()].rename(columns=diseases).dropna() #only diseases

disease_metadata.head()

Unnamed: 0_level_0,obesity,diabetes,dyslipidemia,hypertension,ckd,precvd,elevated_bp
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11666.BLANK1.1A,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable
11666.BLANK1.1A.ITS,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable
11666.BLANK1.1B,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable
11666.BLANK1.1B.ITS,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable
11666.BLANK1.1C,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable


In [98]:
for _, col in enumerate(disease_metadata):
    print(disease_metadata[col].unique())

['not applicable' '1' '0' 'not provided' 1.0 0.0]
['not applicable' '2' '3' '1' 'not provided' 2.0 3.0 1.0]
['not applicable' '0' '1' 'not provided' 0.0 1.0]
['not applicable' '1' '0' 'not provided' 1.0 0.0]
['not applicable' '0' '1' 'not provided' 0.0 1.0]
['not applicable' '0' '1' 'not provided' 0.0 1.0]
['not applicable' '1' '0' 'not provided' 1.0 0.0]


In [99]:
## COLUMN MAPS
disease_map = {
    'not applicable': 0,
    'not provided': 0,
    '0': 0,
    '1': 1,
    '2': 1,
    '3': 1,
    0.0: 0,
    1.0: 1,
    2.0: 1,
    3.0: 1
}

disease_metadata = disease_metadata.applymap(lambda x: disease_map[x]) #map values to binary
disease_metadata['total_diseases'] = disease_metadata.sum(axis=1) #how many diseases each sample has
disease_metadata

Unnamed: 0_level_0,obesity,diabetes,dyslipidemia,hypertension,ckd,precvd,elevated_bp,total_diseases
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11666.BLANK1.1A,0,0,0,0,0,0,0,0
11666.BLANK1.1A.ITS,0,0,0,0,0,0,0,0
11666.BLANK1.1B,0,0,0,0,0,0,0,0
11666.BLANK1.1B.ITS,0,0,0,0,0,0,0,0
11666.BLANK1.1C,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
11666.G1777A,0,1,1,1,0,0,1,4
11666.G1778A,1,1,0,1,0,0,1,4
11666.G1779A,1,1,0,1,0,0,1,4
11666.G1780A,1,1,1,1,0,0,1,5


In [101]:
disease_metadata['total_diseases'].value_counts()

4    726
2    712
3    660
0    602
5    525
1    365
6    131
7      5
Name: total_diseases, dtype: int64

Unnamed: 0_level_0,obesity,diabetes,dyslipidemia,hypertension,ckd,precvd,elevated_bp,total_diseases
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11666.BLANK1.1A,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicablenot applicablenot applicablenot ...
11666.BLANK1.1A.ITS,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicablenot applicablenot applicablenot ...
11666.BLANK1.1B,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicablenot applicablenot applicablenot ...
11666.BLANK1.1B.ITS,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicablenot applicablenot applicablenot ...
11666.BLANK1.1C,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicable,not applicablenot applicablenot applicablenot ...


In [23]:
clean_na(sub_metadata['abdominal_obesity_idf_v2'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  col[col=='not applicable'] = 'TEMP'


sample_name
11666.BLANK1.1A        TEMP
11666.BLANK1.1A.ITS    TEMP
11666.BLANK1.1B        TEMP
11666.BLANK1.1B.ITS    TEMP
11666.BLANK1.1C        TEMP
                       ... 
11666.G3059L            NaN
11666.G3060R            NaN
11666.G3061L            NaN
11666.G3062L            NaN
11666.G816K             NaN
Name: abdominal_obesity_idf_v2, Length: 5423, dtype: object

In [25]:
for col in :
    print(col, metadata[col].dtype)

abdominal_obesity_ncep_v2 object
ckd_v2 object
diabetes2_v2 object
precvd_v2 object
elevated_bp_selfmeds_v2 object
high_total_chol2_v2 object


In [13]:
col_maps = {'abdominal_obesity_idf_v2':
           }

abdominal_obesity_idf_v2 object
diabetes2_v2 object
dyslipidemia_v2 object
hypertension2_v2 object
ckd2 object
precvd object
elevated_bp_selfmeds_v2 object
age_v2 object
center object
gender object
host_body_mass_index object


In [None]:
numeric_cols = ['abdominal_obesity_idf_v2',
                'age_v2',
                'ckd2',
                'diabetes2_v2',
                'host_body_mass_index',
                'dyslipidemia_v2',
                'elevated_bp_selfmeds_v2',
                'hypertension2_v2',
                'precvd']
categorical_cols = ['center',
                    'gender']

In [None]:
missing_values(metadata['abdominal_obesity_ncep_v2'],'numeric')

## Missing Data Analysis

In [None]:
# Function for single representation for missing values
def missing_values(col):
    
    temp = col.apply(lambda x: np.nan if x in ['not applicable','not provided'] else x)
    if type == 'numeric':
        temp = temp.apply(lambda x: x if pd.isnull(x) else np.float64(x))
    return temp

In [None]:
missing_values(metadata['abdominal_obesity_ncep_v2'],'numeric').value_counts()

In [None]:
missing_values(metadata['abdominal_obesity_ncep_v2'],np.int64).isnull().sum() * 100 / len(metadata) 

In [None]:
# Potential Columns -- more to be added 
# ['abdominal_obesity_idf_v2','abdominal_obesity_ncep_v2','anonymized_name', 'age_v2','bmi_v2','center','ckd2',
# 'diabetes2_v2','dm_aware_v2','dyslipidemia_v2','education_c2_v1','elevated_bp_selfmeds_v2','gender','host_age',
# 'host_body_mass_index','hypertension2_v2','placeofbirth_group','precvd','us_born_v2',]

## Defining missing values

In [None]:
sub_metadata.loc[:,numeric_col] = sub_metadata[numeric_col].apply(lambda x: missing_values(x,'numeric'))
sub_metadata.loc[:,categorical_col] = sub_metadata[categorical_col].apply(lambda x: missing_values(x,'categorical'))

In [None]:
sub_metadata.isnull().sum() * 100 / len(sub_metadata) # calculates percent of nan in column


## EDA on Subset of Metadata

In [None]:
sub_metadata

In [None]:
def create_bar_col_binary(df, col_name):
    ax = df[col_name].fillna(-1).value_counts().sort_index(ascending=False).plot(kind='barh')
    ax.set_xlabel('count')
    ax.set_ylabel('outcome')
    ax.set_title(col_name)

In [None]:
create_bar_col_binary(sub_metadata, 'dyslipidemia_v2')