In [22]:
import numpy as np
import pandas as pd
import scipy.stats as stats

In [23]:
# Columns removed from both CSVs before analysis (presumably a saved row index
# and a respondent identifier -- confirm against the data dictionary).
_non_feature_columns = ['Unnamed: 0', 'SEQN']

# Sample used by the "Grade 1" tests; echo shape and column names as a quick check.
df = pd.read_csv('dataset.csv')
df = df.drop(columns=_non_feature_columns)
print(df.shape)
print(df.columns)

# Sample used by the "Grade 2" tests, cleaned the same way.
df_grade_2 = pd.read_csv('dataset_grade_2.csv')
df_grade_2 = df_grade_2.drop(columns=_non_feature_columns)

(560, 13)
Index(['angina', 'DPQ020', 'DPQ090', 'OHQ850', 'OHQ835', 'OHQ620', 'SMQ020',
       'WHD020', 'PAQ650', 'BPQ020', 'RIAGENDR', 'RIDAGEYR', 'DBQ700'],
      dtype='object')


In [24]:
# Codebook for the survey features used in the statistical tests.
#
# Each entry maps a column name to a (kind, values) tuple:
#   kind   -- 'B' binary, 'C' categorical, 'N' numerical
#   values -- the admissible codes for 'B'/'C', or [min, max] for 'N'
variable_values_dict = {
    # Feeling down, depressed, or hopeless
    # 0: Not at all, 1: Several days, 2: More than half the days, 3: Nearly every day
    'DPQ020': ('C', [0, 1, 2, 3]),

    # Thought you would be better off dead
    # 0: Not at all, 1: Several days, 2: More than half the days, 3: Nearly every day
    'DPQ090': ('C', [0, 1, 2, 3]),

    # Ever had treatment for gum disease?  (Yes = 1 / No = 2)
    'OHQ850': ('B', [1, 2]),

    # Do you think you might have gum disease?  (Yes = 1 / No = 2)
    'OHQ835': ('B', [1, 2]),

    # How often last yr had aching in mouth?
    # 1: Very often, 2: Fairly often, 3: Occasionally, 4: Hardly ever, 5: Never
    'OHQ620': ('C', [1, 2, 3, 4, 5]),

    # Smoked at least 100 cigarettes in life  (Yes = 1 / No = 2)
    'SMQ020': ('B', [1, 2]),

    # Current self-reported weight (pounds), range 75-493
    'WHD020': ('N', [75, 493]),

    # Vigorous recreational activities  (Yes = 1 / No = 2)
    'PAQ650': ('B', [1, 2]),

    # Ever told you had high blood pressure  (Yes = 1 / No = 2)
    'BPQ020': ('B', [1, 2]),

    # Gender  (Male = 1 / Female = 2)
    'RIAGENDR': ('B', [1, 2]),

    # Age in years at screening, range 0-80
    'RIDAGEYR': ('N', [0, 80]),

    # How healthy is the diet
    # 1: Excellent, 2: Very good, 3: Good, 4: Fair, 5: Poor
    'DBQ700': ('C', [1, 2, 3, 4, 5]),
}

# Statistical Testing (Stratify Features -> Compare Angina) Grade 1

In [25]:
# Stratify the Grade 1 sample (`df`) by each survey feature and test whether
# the 'angina' outcome differs between the resulting strata.
#   'B' features: Welch's two-sample t-test between the two codes.
#   'C' features: Alexander-Govern, Kruskal-Wallis and Tukey HSD across codes.
#   'N' features: not tested in this cell.
outcome = 'angina'

# Exclude the outcome by name instead of relying on it being the first column.
for feature in df.columns.drop(outcome):
  # Named `kind` rather than `type` so the builtin is not rebound for the rest
  # of the notebook.
  kind, variable_values = variable_values_dict[feature]
  print("Feature: {}".format(feature))
  if kind == 'B':
    group_A = df.loc[df[feature] == variable_values[0], outcome]
    group_B = df.loc[df[feature] == variable_values[1], outcome]
    # equal_var=False selects Welch's variant of the t-test.
    t_statistic, pvalue = stats.ttest_ind(group_A, group_B, equal_var=False, alternative='two-sided')
    print("Welch's T-Test: T statistic = {} pvalue = {}".format(t_statistic, pvalue))
  elif kind == 'C':
    # Keep only the codes that actually occur, remembering which code each
    # retained group belongs to.
    present_values = []
    groups = []
    for variable_value in variable_values:
      group = df.loc[df[feature] == variable_value, outcome]
      if len(group) > 0:
        present_values.append(variable_value)
        groups.append(group)
    if len(groups) < 2:
      # The multi-sample tests below need at least two samples.
      print("Skipped: fewer than two non-empty groups for {}".format(feature))
      continue
    result = stats.alexandergovern(*groups)
    print('Alexander Govern Test: A statistic = {}  pvalue = {}'.format(result.statistic, result.pvalue))
    h_statistic, pvalue = stats.kruskal(*groups)
    print('Kruskal Test: H statistic = {}  pvalue = {}'.format(h_statistic, pvalue))
    # Tukey HSD labels comparisons by position in `groups`; because empty codes
    # were dropped, print the position -> code mapping so rows can be read.
    print("Tukey HSD group index -> {} code: {}".format(feature, dict(enumerate(present_values))))
    result = stats.tukey_hsd(*groups)
    print(result)
  elif kind == 'N':
    # Numerical features are intentionally left untested here.
    continue
  

Feature: DPQ020
Alexander Govern Test: A statistic = 8.469024676734184  pvalue = 0.014486873167674675
Kruskal Test: H statistic = 7.9417100246351975  pvalue = 0.018857303014054323
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)     -0.058     0.705    -0.231     0.114
 (0 - 2)      0.173     0.046     0.002     0.344
 (1 - 0)      0.058     0.705    -0.114     0.231
 (1 - 2)      0.232     0.022     0.027     0.436
 (2 - 0)     -0.173     0.046    -0.344    -0.002
 (2 - 1)     -0.232     0.022    -0.436    -0.027

Feature: DPQ090
Alexander Govern Test: A statistic = 1.2784146804350067  pvalue = 0.5277105532364073
Kruskal Test: H statistic = 1.14874057731202  pvalue = 0.5630593220360587
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)      0.228     0.541    -0.289     0.745
 (0 - 2)      0.038     0.967    -0.330     0.405
 (1 - 0)

# Statistical Testing (Stratify Features -> Compare Angina) Grade 2

In [26]:
# Stratify the Grade 2 sample (`df_grade_2`) by each survey feature and test
# whether the 'angina' outcome differs between the resulting strata.
#   'B' features: Welch's two-sample t-test between the two codes.
#   'C' features: Alexander-Govern, Kruskal-Wallis and Tukey HSD across codes.
#   'N' features: not tested in this cell.
outcome = 'angina'

# Exclude the outcome by name instead of relying on it being the first column.
for feature in df_grade_2.columns.drop(outcome):
  # Named `kind` rather than `type` so the builtin is not rebound for the rest
  # of the notebook.
  kind, variable_values = variable_values_dict[feature]
  print("Feature: {}".format(feature))
  if kind == 'B':
    group_A = df_grade_2.loc[df_grade_2[feature] == variable_values[0], outcome]
    group_B = df_grade_2.loc[df_grade_2[feature] == variable_values[1], outcome]
    # equal_var=False selects Welch's variant of the t-test.
    t_statistic, pvalue = stats.ttest_ind(group_A, group_B, equal_var=False, alternative='two-sided')
    print("Welch's T-Test: T statistic = {} pvalue = {}".format(t_statistic, pvalue))
  elif kind == 'C':
    # Keep only the codes that actually occur, remembering which code each
    # retained group belongs to.
    present_values = []
    groups = []
    for variable_value in variable_values:
      group = df_grade_2.loc[df_grade_2[feature] == variable_value, outcome]
      if len(group) > 0:
        present_values.append(variable_value)
        groups.append(group)
    if len(groups) < 2:
      # The multi-sample tests below need at least two samples.
      print("Skipped: fewer than two non-empty groups for {}".format(feature))
      continue
    result = stats.alexandergovern(*groups)
    print('Alexander Govern Test: A statistic = {}  pvalue = {}'.format(result.statistic, result.pvalue))
    h_statistic, pvalue = stats.kruskal(*groups)
    print('Kruskal Test: H statistic = {}  pvalue = {}'.format(h_statistic, pvalue))
    # Tukey HSD labels comparisons by position in `groups`; because empty codes
    # were dropped, print the position -> code mapping so rows can be read.
    print("Tukey HSD group index -> {} code: {}".format(feature, dict(enumerate(present_values))))
    result = stats.tukey_hsd(*groups)
    print(result)
  elif kind == 'N':
    # Numerical features are intentionally left untested here.
    continue

Feature: DPQ020
Alexander Govern Test: A statistic = 2.0269774551795976  pvalue = 0.36295053268812827
Kruskal Test: H statistic = 2.33645574481471  pvalue = 0.31091743871774197
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)     -0.008     0.990    -0.158     0.141
 (0 - 2)     -0.094     0.294    -0.242     0.054
 (1 - 0)      0.008     0.990    -0.141     0.158
 (1 - 2)     -0.085     0.493    -0.263     0.092
 (2 - 0)      0.094     0.294    -0.054     0.242
 (2 - 1)      0.085     0.493    -0.092     0.263

Feature: DPQ090
Alexander Govern Test: A statistic = nan  pvalue = nan
Kruskal Test: H statistic = 3.734693877551008  pvalue = 0.15453310422379654




Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)      0.316     0.217    -0.134     0.765
 (0 - 2)      0.173     0.400    -0.147     0.493
 (1 - 0)     -0.316     0.217    -0.765     0.134
 (1 - 2)     -0.143     0.771    -0.642     0.356
 (2 - 0)     -0.173     0.400    -0.493     0.147
 (2 - 1)      0.143     0.771    -0.356     0.642

Feature: OHQ850
Welsh's T-Test: T statistic = 0.39532863328730194 pvalue = 0.692965993205505
Feature: OHQ835
Welsh's T-Test: T statistic = 0.3704126898276123 pvalue = 0.7112988638287174
Feature: OHQ620
Alexander Govern Test: A statistic = 4.015630244701992  pvalue = 0.40389465897324145
Kruskal Test: H statistic = 4.235775078936377  pvalue = 0.3750365325415564
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)      0.093     0.753    -0.120     0.306
 (0 - 2)      0.110     0.444    -0.069     0.289
 

# Statistical Testing (Stratify Features -> Compare Gum Disease)

In [27]:
# Stratify the Grade 2 sample (`df_grade_2`) by each survey feature and test
# whether the gum-disease outcome 'OHQ850' differs between the resulting strata.
#   'B' features: Welch's two-sample t-test between the two codes.
#   'C' features: Alexander-Govern, Kruskal-Wallis and Tukey HSD across codes.
#   'N' features: not tested in this cell.
outcome = 'OHQ850'

# 'angina' has no codebook entry, and stratifying the outcome by itself would
# compare two constant groups, so both are excluded by name.
for feature in df_grade_2.columns.drop(['angina', outcome]):
  # Named `kind` rather than `type` so the builtin is not rebound for the rest
  # of the notebook.
  kind, variable_values = variable_values_dict[feature]
  print("Feature: {}".format(feature))
  if kind == 'B':
    group_A = df_grade_2.loc[df_grade_2[feature] == variable_values[0], outcome]
    group_B = df_grade_2.loc[df_grade_2[feature] == variable_values[1], outcome]
    # equal_var=False selects Welch's variant of the t-test.
    t_statistic, pvalue = stats.ttest_ind(group_A, group_B, equal_var=False, alternative='two-sided')
    print("Welch's T-Test: T statistic = {} pvalue = {}".format(t_statistic, pvalue))
  elif kind == 'C':
    # Keep only the codes that actually occur, remembering which code each
    # retained group belongs to.
    present_values = []
    groups = []
    for variable_value in variable_values:
      group = df_grade_2.loc[df_grade_2[feature] == variable_value, outcome]
      if len(group) > 0:
        present_values.append(variable_value)
        groups.append(group)
    if len(groups) < 2:
      # The multi-sample tests below need at least two samples.
      print("Skipped: fewer than two non-empty groups for {}".format(feature))
      continue
    result = stats.alexandergovern(*groups)
    print('Alexander Govern Test: A statistic = {}  pvalue = {}'.format(result.statistic, result.pvalue))
    h_statistic, pvalue = stats.kruskal(*groups)
    print('Kruskal Test: H statistic = {}  pvalue = {}'.format(h_statistic, pvalue))
    # Tukey HSD labels comparisons by position in `groups`; because empty codes
    # were dropped, print the position -> code mapping so rows can be read.
    print("Tukey HSD group index -> {} code: {}".format(feature, dict(enumerate(present_values))))
    result = stats.tukey_hsd(*groups)
    print(result)
  elif kind == 'N':
    # Numerical features are intentionally left untested here.
    continue

Feature: DPQ020
Alexander Govern Test: A statistic = 2.6818519579668307  pvalue = 0.2616033172177732
Kruskal Test: H statistic = 2.606439494366414  pvalue = 0.2716557206836349
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)     -0.096     0.315    -0.250     0.059
 (0 - 2)     -0.071     0.519    -0.224     0.082
 (1 - 0)      0.096     0.315    -0.059     0.250
 (1 - 2)      0.024     0.947    -0.159     0.208
 (2 - 0)      0.071     0.519    -0.082     0.224
 (2 - 1)     -0.024     0.947    -0.208     0.159

Feature: DPQ090
Alexander Govern Test: A statistic = 1.8105175692414068  pvalue = 0.4044372094295351
Kruskal Test: H statistic = 1.7503968253968312  pvalue = 0.41677931716520267
Tukey's HSD Pairwise Group Comparisons (95.0% Confidence Interval)
Comparison  Statistic  p-value  Lower CI  Upper CI
 (0 - 1)     -0.202     0.591    -0.697     0.293
 (0 - 2)     -0.154     0.546    -0.506     0.198
 (1 - 0) 