In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt # for EDA plots
import seaborn as sns # for EDA plots
from scipy import stats # for Shapiro-Wilks test, ttest_ind
from sklearn.model_selection import train_test_split # for splitting dataframes into train & test samples

In [28]:
# Load both prepared datasets from disk in one pass; the tuple order below
# fixes which file ends up in which variable.
df_clean, df_impute = map(pd.read_csv, ('data/df_clean.csv', 'data/df_impute.csv'))

1) Is the average of X different between males and females?
2) Is the average of X different between married and unmarried participants?
3) Is the average of X different between pair-wise combinations of races?
4) Is the average of X different between "Has metabolic syndrome" and "Does not have metabolic syndrome"?

Here X is each of the following measurements: BMI, WaistCirc, Albuminuria, UrAlbCr, UricAcid, BloodGlucose, HDL, Triglycerides.

In [29]:
# Display the column labels of the imputed dataset.
df_impute.columns

Index(['Unnamed: 0', 'seqn', 'Age', 'Sex', 'Marital', 'Income', 'Race',
       'WaistCirc', 'BMI', 'Albuminuria', 'UrAlbCr', 'UricAcid',
       'BloodGlucose', 'HDL', 'Triglycerides', 'MetabolicSyndrome'],
      dtype='object')

In [30]:
# Show the distinct values of each categorical column that defines a comparison group.
for column in ('Sex', 'Race', 'Marital', 'MetabolicSyndrome'):
    print (f"\nCategories for column: {column}:\n{df_impute[column].unique()}")


Categories for column: Sex:
['Male' 'Female']

Categories for column: Race:
['White' 'Asian' 'Black' 'MexAmerican' 'Hispanic' 'Other']

Categories for column: Marital:
['Single' 'Married' 'Widowed' 'Divorced' 'Separated']

Categories for column: MetabolicSyndrome:
[0 1]


# Hypothesis Tests for "Has MetabolicSyndrome" vs. "Doesn't Have MetabolicSyndrome"

1. Levene's test for equal variance
2. t-test

In [34]:
# Check the equal-variance assumption with Levene's test, comparing the
# MetabolicSyndrome == 1 rows against the MetabolicSyndrome == 0 rows
# for every measurement listed below.

# Measurements to test
dimensions = ['WaistCirc', 'BMI', 'Albuminuria', 'UrAlbCr', 'UricAcid', 'BloodGlucose', 'HDL', 'Triglycerides']

# Boolean row selectors for the two groups
has_ms_mask = df_impute['MetabolicSyndrome'] == 1
no_ms_mask = df_impute['MetabolicSyndrome'] == 0

# One record (name, statistic, p-value) per measurement
levenes_results = []

for dimension in dimensions:
    stat, p = stats.levene(
        df_impute.loc[has_ms_mask, dimension],
        df_impute.loc[no_ms_mask, dimension],
    )
    levenes_results.append({'dimension_name': dimension, 'stat': stat, 'p': p})

    # p below 0.05 means the equal-variance hypothesis is rejected
    verdict = 'NOT equal' if p < 0.05 else 'equal'
    print (f"Levene test result for {dimension}: p = {p}, Result: Variances are {verdict}")

Levene test result for WaistCirc: p = 0.17439731450012333, Result: Variances are equal
Levene test result for BMI: p = 3.744839622941711e-05, Result: Variances are NOT equal
Levene test result for Albuminuria: p = 4.017997154140853e-13, Result: Variances are NOT equal
Levene test result for UrAlbCr: p = 0.00011067393676236355, Result: Variances are NOT equal
Levene test result for UricAcid: p = 0.0022105965941049094, Result: Variances are NOT equal
Levene test result for BloodGlucose: p = 1.668406034475705e-23, Result: Variances are NOT equal
Levene test result for HDL: p = 1.2286233173573101e-08, Result: Variances are NOT equal
Levene test result for Triglycerides: p = 2.658370873768701e-45, Result: Variances are NOT equal


# Levene's test indicates the following:

Variances are equal for the 2 groups (has_ms, no_ms) for the following dimension, so we can use the standard (Student's) t-test:
1. WaistCirc

Variances are NOT equal for the 2 groups (has_ms, no_ms) for the following dimensions, so we need to use Welch's t-test (pass `equal_var=False` to `stats.ttest_ind`):
1. BMI
2. Albuminuria
3. UrAlbCr
4. UricAcid
5. BloodGlucose
6. HDL
7. Triglycerides

In [38]:
# Compare the group means of every dimension with an independent two-sample
# t-test. The Levene p-value recorded for the dimension picks the variant:
# Student's t-test when variances look equal, Welch's t-test otherwise.

ALPHA = 0.05  # significance level; the Levene cell reports "NOT equal" when p < 0.05

t_results = []

for dict_entry in levenes_results:
    dimension = dict_entry['dimension_name']

    # Equal variance is the exact complement of the Levene cell's "p < 0.05"
    # rule, so a p-value of exactly 0.05 is classified the same way in both
    # places. bool() keeps a plain Python bool in the stored record.
    equal_var_param = bool(dict_entry['p'] >= ALPHA)
    if equal_var_param:
        print (f"equal variances for dimension {dimension}")

    # .loc selects rows and column in one step instead of copying the whole
    # filtered frame first.
    stat, p = stats.ttest_ind(
        df_impute.loc[has_ms_mask, dimension],
        df_impute.loc[no_ms_mask, dimension],
        equal_var=equal_var_param,  # False -> Welch's t-test
    )
    t_results.append({'dimension_name': dimension, 'equal_var_param': equal_var_param, 'stat': stat, 'p': p})


equal variances for dimension WaistCirc


In [50]:
# Display the collected t-test records (one dict per dimension).
t_results

[{'dimension_name': 'WaistCirc',
  'equal_var_param': True,
  'stat': 26.165407054357438,
  'p': 1.5462850516600822e-131},
 {'dimension_name': 'BMI',
  'equal_var_param': False,
  'stat': 20.881735567073623,
  'p': 5.7439237164190525e-84},
 {'dimension_name': 'Albuminuria',
  'equal_var_param': False,
  'stat': 6.523071363860894,
  'p': 1.0323227968063208e-10},
 {'dimension_name': 'UrAlbCr',
  'equal_var_param': False,
  'stat': 3.381374389078004,
  'p': 0.0007485165882493893},
 {'dimension_name': 'UricAcid',
  'equal_var_param': False,
  'stat': 11.853609110639535,
  'p': 6.035533456367002e-31},
 {'dimension_name': 'BloodGlucose',
  'equal_var_param': False,
  'stat': 15.147307212572214,
  'p': 9.117590017521449e-47},
 {'dimension_name': 'HDL',
  'equal_var_param': False,
  'stat': -19.91855435350617,
  'p': 3.2667959101286097e-80},
 {'dimension_name': 'Triglycerides',
  'equal_var_param': False,
  'stat': 18.310620932503344,
  'p': 6.414711048677813e-64}]

In [56]:
# Summarise the t-tests: list which dimensions have significantly different
# group means (p < 0.05) and which do not.

print ("t-test results for dimensions grouped by has metabolic syndrome vs. does not have metabolic syndrome:")

# Split the dimension names by significance. The second list is the strict
# complement of the first, so every dimension lands in exactly one of them.
significant_names = [result['dimension_name'] for result in t_results if result['p'] < 0.05]
not_significant_names = [result['dimension_name'] for result in t_results if not (result['p'] < 0.05)]

# join() places ', ' only between names, so there is no trailing separator to
# trim. The previous slice-based trim removed one character from the
# "not significant" string although the separator is two characters long,
# which would have left a dangling comma.
significantly_different_list = ', '.join(significant_names)
not_significantly_different_list = ', '.join(not_significant_names)

print (f"\nSignificantly Different Means:\n\t{significantly_different_list}")
print (f"\nNOT Significantly Different Means:\n\t{not_significantly_different_list}")


t-test results for dimensions grouped by has metabolic syndrome vs. does not have metabolic syndrome:

Significantly Different Means:
	WaistCirc, BMI, Albuminuria, UrAlbCr, UricAcid, BloodGlucose, HDL, Triglycerides

NOT Significantly Different Means:
	


# Results for Group Means for "Has Metabolic Syndrome" vs. "Does Not Have Metabolic Syndrome" Dimensions:

For all dimensions, the group means are significantly different (p < 0.05) between those who "have metabolic syndrome" and those who "do not have metabolic syndrome".
    
Dimensions are:
* WaistCirc
* BMI
* Albuminuria
* UrAlbCr
* UricAcid
* BloodGlucose
* HDL
* Triglycerides