### Hypothesis testing

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
import warnings

# Ignore every warning raised from this point on.
# NOTE(review): this is a blanket filter, so it also hides any warnings emitted by the
# pandas / SciPy calls in later cells -- consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [14]:
# Load the dataset into `df`. The path is relative, so it resolves against the
# kernel's current working directory rather than the notebook file's location.
df = pd.read_csv('../data/asthma_disease_data.csv')

In [21]:
# Two-sample t-tests: compare every numerical feature between the two Diagnosis groups.
#
# NOTE(review): ExposureScore, HealthScore and FEV1_FVC_Ratio look like derived
# columns -- confirm the CSV loaded into `df` already contains them, otherwise this
# cell depends on feature-engineering code that is not visible here.

numerical_cols = [
    'Age', 'BMI', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
    'PollutionExposure', 'PollenExposure', 'DustExposure',
    'LungFunctionFEV1', 'LungFunctionFVC', 'SymptomScore',
    'ExposureScore', 'HealthScore', 'FEV1_FVC_Ratio',
]

print("T-tests for Numerical Variables (Asthma vs No Asthma):")
print("=" * 70)

# The group split does not depend on the feature, so it is done once up front.
# Rows with Diagnosis == 1 form the asthma group, Diagnosis == 0 the comparison group.
asthma_rows = df.loc[df['Diagnosis'] == 1]
no_asthma_rows = df.loc[df['Diagnosis'] == 0]

results = []
for feature in numerical_cols:
    # nan_policy='omit' makes SciPy leave NaN values out of the calculation.
    # NOTE(review): equal_var is left at SciPy's default (pooled-variance Student's
    # t-test). If the two groups differ a lot in size or spread, Welch's variant
    # (equal_var=False) may be the safer choice -- confirm before relying on these.
    t_stat, p_value = stats.ttest_ind(
        asthma_rows[feature],
        no_asthma_rows[feature],
        nan_policy='omit',
    )

    row = {'Feature': feature}
    row['T-Statistic'] = t_stat
    row['P-Value'] = p_value
    # Flagged per test at the 5% level; no multiple-comparison correction is applied.
    row['Significant'] = p_value < 0.05
    results.append(row)

# One row per feature, printed with the smallest p-values first.
results_df = pd.DataFrame(results)
print(results_df.sort_values(by='P-Value'))

T-tests for Numerical Variables (Asthma vs No Asthma):
              Feature  T-Statistic   P-Value  Significant
9     LungFunctionFVC     1.449126  0.147434        False
7        DustExposure    -1.270155  0.204153        False
8    LungFunctionFEV1     1.141140  0.253926        False
4        SleepQuality     0.881204  0.378296        False
0                 Age    -0.738812  0.460094        False
6      PollenExposure     0.738216  0.460456        False
1                 BMI    -0.612226  0.540446        False
11      ExposureScore    -0.427602  0.668979        False
10       SymptomScore    -0.411817  0.680511        False
12        HealthScore     0.392697  0.694579        False
13     FEV1_FVC_Ratio    -0.336486  0.736534        False
2    PhysicalActivity     0.247659  0.804420        False
5   PollutionExposure    -0.221717  0.824553        False
3         DietQuality    -0.153947  0.877664        False


In [22]:
# Chi-square tests of independence: each categorical feature against Diagnosis.
#
# NOTE(review): AgeGroup and BMICategory look like derived columns -- confirm the CSV
# loaded into `df` already contains them.

categorical_cols = [
    'Gender', 'Smoking', 'PetAllergy', 'FamilyHistoryAsthma',
    'HistoryOfAllergies', 'Eczema', 'HayFever', 'GastroesophagealReflux',
    'AgeGroup', 'BMICategory',
]

print("\nChi-square Tests for Categorical Variables:")
print("=" * 70)


def _chi2_row(feature):
    """Run a chi-square test of `feature` against Diagnosis and return one result row.

    The row is a dict with the keys 'Feature', 'Chi2-Statistic', 'P-Value' and
    'Significant' (True when the p-value is below 0.05).
    """
    # Observed counts: one row per category of the feature, one column per Diagnosis value.
    observed = pd.crosstab(df[feature], df['Diagnosis'])
    # chi2_contingency also returns the degrees of freedom and the expected counts;
    # they are not used here. With default arguments SciPy applies Yates' continuity
    # correction whenever the table has a single degree of freedom (2x2), which
    # shrinks the statistic for such features.
    statistic, p_value, _dof, _expected = stats.chi2_contingency(observed)
    return {
        'Feature': feature,
        'Chi2-Statistic': statistic,
        'P-Value': p_value,
        # Flagged per test at the 5% level; no multiple-comparison correction is applied.
        'Significant': p_value < 0.05,
    }


chi2_results = [_chi2_row(feature) for feature in categorical_cols]

# One row per feature, printed with the smallest p-values first.
chi2_df = pd.DataFrame(chi2_results)
print(chi2_df.sort_values(by='P-Value'))


Chi-square Tests for Categorical Variables:
                  Feature  Chi2-Statistic   P-Value  Significant
7  GastroesophagealReflux        0.974628  0.323529        False
6                HayFever        0.689073  0.406480        False
1                 Smoking        0.660570  0.416359        False
2              PetAllergy        0.265903  0.606093        False
9             BMICategory        1.350804  0.717106        False
8                AgeGroup        2.023988  0.731347        False
5                  Eczema        0.091887  0.761792        False
0                  Gender        0.003693  0.951543        False
4      HistoryOfAllergies        0.000002  0.998995        False
3     FamilyHistoryAsthma        0.000000  1.000000        False


In [23]:
# Save the t-test summary table as CSV; index=False keeps the row index out of the file.
results_df.to_csv('../data/asthma_disease_data_t_tests.csv', index=False)

In [24]:
# Save the chi-square summary table as CSV; index=False keeps the row index out of the file.
chi2_df.to_csv('../data/asthma_disease_data_chi2.csv', index=False)