In [6]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [7]:
# Load the dataset, then discard the two columns that the analysis below does
# not use ('Unnamed: 0' is presumably a saved index column and 'SEQN' a
# respondent identifier — TODO confirm against the data source).
df = pd.read_csv('dataset.csv')
df = df.drop(columns=['Unnamed: 0', 'SEQN'])
# Sanity check: row/column count followed by the remaining column names.
print(df.shape, df.columns, sep='\n')

(560, 13)
Index(['angina', 'DPQ020', 'DPQ090', 'OHQ850', 'OHQ835', 'OHQ620', 'SMQ020',
       'WHD020', 'PAQ650', 'BPQ020', 'RIAGENDR', 'RIDAGEYR', 'DBQ700'],
      dtype='object')


In [8]:
# Metadata for every feature column: feature name -> (kind, values).
#   kind 'B' (binary):      values are the two codes.
#   kind 'C' (categorical): values are all admissible codes.
#   kind 'N' (numerical):   values are [minimum, maximum].
variable_values_dict = {
    # DPQ020 - Feeling down, depressed, or hopeless
    # Categorical (0: Not at all, 1: Several days, 2: More than half the days, 3: Nearly every day)
    'DPQ020': ('C', [0, 1, 2, 3]),
    # DPQ090 - Thought you would be better off dead
    # Categorical (0: Not at all, 1: Several days, 2: More than half the days, 3: Nearly every day)
    'DPQ090': ('C', [0, 1, 2, 3]),
    # OHQ850 - Ever had treatment for gum disease?
    # Binary (Yes = 1/No = 2)
    'OHQ850': ('B', [1, 2]),
    # OHQ835 - Do you think you might have gum disease?
    # Binary (Yes = 1/No = 2)
    'OHQ835': ('B', [1, 2]),
    # OHQ620 - How often last yr had aching in mouth?
    # Categorical (1: Very often, 2: Fairly often, 3: Occasionally, 4: Hardly ever, 5: Never)
    'OHQ620': ('C', [1, 2, 3, 4, 5]),
    # SMQ020 - Smoked at least 100 cigarettes in life
    # Binary (Yes = 1/No = 2)
    'SMQ020': ('B', [1, 2]),
    # WHD020 - Current self-reported weight (pounds)
    # Numerical (75-493)
    'WHD020': ('N', [75, 493]),
    # PAQ650 - Vigorous recreational activities
    # Binary (Yes = 1/No = 2)
    'PAQ650': ('B', [1, 2]),
    # BPQ020 - Ever told you had high blood pressure
    # Binary (Yes = 1/No = 2)
    'BPQ020': ('B', [1, 2]),
    # RIAGENDR - Gender
    # Binary (Male = 1/Female = 2)
    'RIAGENDR': ('B', [1, 2]),
    # RIDAGEYR - Age in years at screening
    # Numerical (0-80)
    'RIDAGEYR': ('N', [0, 80]),
    # DBQ700 - How healthy is the diet
    # NOTE(review): the earlier note repeated the RIDAGEYR label ("Age in years
    # at screening"), which does not fit the Excellent..Poor scale below; the
    # label here follows the NHANES codebook — confirm.
    # Categorical (1: Excellent, 2: Very good, 3: Good, 4: Fair, 5: Poor)
    'DBQ700': ('C', [1, 2, 3, 4, 5]),
}

# Statistical Testing (Stratify Features -> Compare Angina)

In [9]:
# For every feature, split the 'angina' column by the feature's levels and test
# whether angina differs between the resulting groups.
# The first column is skipped: the printed column index shows it is 'angina'
# itself, i.e. the outcome rather than a feature.
for feature in df.columns[1:]:
    # Named `kind` (not `type`) so the builtin is not shadowed in later cells.
    kind, variable_values = variable_values_dict[feature]
    print("Feature: {}".format(feature))
    if kind == 'B':
        # Binary feature: compare angina between the two codes.
        group_A = df.loc[df[feature] == variable_values[0], 'angina']
        group_B = df.loc[df[feature] == variable_values[1], 'angina']
        # equal_var=False selects Welch's unequal-variance t-test.
        t_statistic, pvalue = stats.ttest_ind(group_A, group_B, equal_var=False, alternative='two-sided')
        print("Welch's T-Test: T statistic = {} pvalue = {}".format(t_statistic, pvalue))
    elif kind == 'C':
        # Categorical feature: one angina group per level that actually occurs.
        levels = []
        groups = []
        for variable_value in variable_values:
            group = df.loc[df[feature] == variable_value, 'angina']
            if len(group) > 0:
                levels.append(variable_value)
                groups.append(group)
        if len(groups) < 2:
            # The multi-group tests below need at least two samples.
            print("Skipped: fewer than two non-empty groups")
            continue
        # The Tukey HSD table labels groups by position; because empty levels
        # are dropped, print which feature value each position stands for.
        print("Group index -> {} value: {}".format(feature, dict(enumerate(levels))))
        result = stats.alexandergovern(*groups)
        print('Alexander Govern Test: A statistic = {}  pvalue = {}'.format(result.statistic, result.pvalue))
        h_statistic, pvalue = stats.kruskal(*groups)
        print(('Kruskal Test: H statistic = {}  pvalue = {}'.format(h_statistic, pvalue)))
        result = stats.tukey_hsd(*groups)
        print(result)
    elif kind == 'N':
        # Numerical features are not stratified in this section.
        continue
  

Feature: DPQ020
Alexander Govern Test: A statistic = 8.469024676734184  pvalue = 0.014486873167674675
Kruskal Test: H statistic = 7.9417100246351975  pvalue = 0.018857303014054323
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)     -0.058     0.705    -0.231     0.114
 (0 - 2)      0.173     0.046     0.002     0.344
 (1 - 0)      0.058     0.705    -0.114     0.231
 (1 - 2)      0.232     0.022     0.027     0.436
 (2 - 0)     -0.173     0.046    -0.344    -0.002
 (2 - 1)     -0.232     0.022    -0.436    -0.027

Feature: DPQ090
Alexander Govern Test: A statistic = 1.2784146804350067  pvalue = 0.5277105532364073
Kruskal Test: H statistic = 1.14874057731202  pvalue = 0.5630593220360587
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)      0.228     0.541    -0.289     0.745
 (0 - 2)      0.038     0.967    -0.330     0.405
 (1 - 0)

# Statistical Testing (Stratify Angina -> Compare Features)