In [1]:
%matplotlib inline

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Feature Manipulation

In [5]:
asthma_data = pd.read_csv("../data/asthma_data_reduced.csv")
asthma_data

Unnamed: 0,age,gender,ethnicity,education_level,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,...,gastroesophageal_reflux,lung_functionfev1,lung_functionfvc,wheezing,shortness_of_breath,chest_tightness,coughing,nighttime_symptoms,exercise_induced,diagnosis
0,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,...,0,1.369051,4.941206,0,0,1,0,0,1,0
1,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,1.969838,...,0,2.197767,1.702393,1,0,0,1,1,1,0
2,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,1.460593,...,0,1.698011,5.022553,1,1,1,0,1,1,0
3,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,...,0,3.032037,2.300159,1,0,1,1,1,0,0
4,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,0.980875,...,0,3.470589,3.067944,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,43,1,0,2,29.059613,0,3.019854,6.119637,8.300960,2.483829,...,0,3.125249,5.166032,0,1,0,0,0,1,1
2388,18,1,0,1,20.740850,0,5.805180,4.386992,7.731192,7.733983,...,0,1.132977,5.509502,0,0,0,1,1,0,1
2389,54,0,3,2,37.079560,0,4.735169,8.214064,7.483521,2.794847,...,0,1.685962,3.346877,1,0,1,1,0,1,1
2390,46,1,0,2,23.444712,0,9.672637,7.362861,6.717272,9.448862,...,1,3.481549,1.713274,0,1,1,0,1,1,0


In [9]:
# check the correlations of diagnosis with the other variables

correlation_of_diagnosis = asthma_data.corr()['diagnosis']

correlation_of_diagnosis = correlation_of_diagnosis.drop('diagnosis')
correlation_of_diagnosis

age                       -0.015111
gender                     0.003128
ethnicity                  0.017124
education_level            0.008185
bmi                       -0.012522
smoking                   -0.019321
physical_activity          0.005066
diet_quality              -0.003149
sleep_quality              0.018022
pollution_exposure        -0.004535
pollen_exposure            0.015099
dust_exposure             -0.025972
pet_allergy               -0.013078
family_history_asthma     -0.001334
history_of_allergies      -0.001951
eczema                    -0.008592
hay_fever                 -0.019141
gastroesophageal_reflux    0.022770
lung_functionfev1          0.023336
lung_functionfvc           0.029629
wheezing                   0.027197
shortness_of_breath       -0.015281
chest_tightness           -0.039278
coughing                  -0.024193
nighttime_symptoms        -0.021965
exercise_induced           0.053956
Name: diagnosis, dtype: float64

In [18]:
# check the significant differences

non_asthma_group = asthma_data[asthma_data.diagnosis == 0]
asthma_group = asthma_data[asthma_data.diagnosis == 1]

results = {}

# Perform t-tests for each column
for col in asthma_data.columns:
    if col != 'diagnosis':
        stat, p_value = stats.ttest_ind(non_asthma_group[col], asthma_group[col], equal_var = False)
        results[col] = {'t-statistic': stat, 'p-value': p_value}

# Display the results
significance_result = pd.DataFrame(results).T

In [19]:
significance_result

Unnamed: 0,t-statistic,p-value
age,0.703697,0.48283
gender,-0.152401,0.879095
ethnicity,-0.78856,0.431752
education_level,-0.371992,0.710483
bmi,0.634656,0.526704
smoking,1.031413,0.304128
physical_activity,-0.249439,0.803395
diet_quality,0.153697,0.878076
sleep_quality,-0.896841,0.371373
pollution_exposure,0.214447,0.83052


Significant differences between the non-asthmatic and asthmatic groups are observed only for the variable exrcise_induced.

In [29]:
# normalize the data
scaler = StandardScaler()
asthma_data_scaled = scaler.fit_transform(asthma_data)

In [30]:
pca = PCA(n_components = None)

# Fit and transform the data
principal_components = pca.fit_transform(asthma_data_scaled)

# Convert to DataFrame
asthma_data_pca = pd.DataFrame(principal_components)

In [31]:
asthma_data_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.634364,-0.365653,-0.426196,0.526713,-1.115539,0.694374,-2.808232,0.889435,-0.421353,-2.044391,...,-1.987158,-0.320239,-0.760775,-0.630635,0.354134,-1.278659,1.649465,-0.743742,0.294860,-0.252524
1,-0.361901,0.328149,0.562730,-0.854207,0.697920,-0.050123,0.190305,-1.072567,-1.313381,1.046570,...,-0.840383,0.707001,0.068987,0.551656,-1.183109,0.190642,-0.947397,0.723622,-1.392289,-1.413144
2,-0.847001,0.381893,0.184664,-0.541710,2.140504,-0.349391,-1.280278,2.138785,-0.871082,-0.668416,...,-1.205898,-0.014115,-1.399813,0.773335,-0.122183,0.130066,-0.574995,0.770973,0.413862,1.321873
3,-1.397493,0.419090,-0.986574,-1.186949,0.498224,0.461931,1.258881,-0.352078,-1.050594,1.076752,...,-1.150058,1.105754,0.689090,1.568256,-0.347667,1.076022,0.889669,0.196012,-0.100222,-0.572819
4,-0.465274,0.621505,0.383216,0.048108,-0.387749,0.737753,0.000281,0.669904,1.670122,0.878917,...,0.315426,1.053508,-3.058042,-0.579949,0.532191,-0.501867,-0.598035,0.690152,0.678437,0.990152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,2.659893,1.665025,1.606456,1.784586,1.152881,-0.245548,-1.454619,-0.614773,1.104595,-0.896707,...,1.094519,-0.633464,1.215438,-0.572630,-1.527057,-1.251675,1.243791,1.336178,0.864864,-0.873334
2388,0.670031,-0.412765,1.167198,-0.366568,-0.823472,-0.418119,-0.222473,-0.131497,1.233895,-1.942689,...,0.916195,-0.726668,1.913940,-0.524589,-0.907646,1.625587,0.545152,1.804341,1.764711,-0.076341
2389,-0.664479,2.566233,1.161496,-0.235539,0.037880,-0.564759,0.160632,1.619887,-1.069313,-0.503213,...,0.454712,2.365140,-0.130588,0.461483,1.000655,0.178340,2.140773,1.796794,1.863438,-0.732137
2390,0.127062,-1.131337,-0.187598,0.795730,1.863462,1.207429,0.239842,-0.932205,-1.029804,-1.291728,...,1.533225,-1.379875,-0.263802,-1.643304,1.026200,0.334277,-0.582179,-0.536382,-0.871235,-0.364018


In [32]:
asthma_data_pca.to_csv('../data/asthma_data_pca.csv', index = False)