In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
import warnings

# Install a blanket 'ignore' filter: no category/module is given, so every
# Python warning raised after this cell runs is hidden from the output.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries imported above; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the asthma CSV (path is relative to the notebook's working directory)
# into the working DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer='../data/asthma_disease_data.csv')

In [None]:
# Create age groups.
# pd.cut builds right-closed intervals, so the bands are
# [0, 18], (18, 35], (35, 50], (50, 65], (65, 100].
# include_lowest=True closes the first band on the left: without it an age of
# exactly 0 falls outside (0, 18] and silently gets NaN as its AgeGroup.
# Ages above 100 lie outside every bin and still map to NaN.
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 18, 35, 50, 65, 100],
                        labels=['Child', 'Young Adult', 'Adult', 'Middle Aged', 'Senior'],
                        include_lowest=True)

# Create BMI categories 
def categorize_bmi(bmi):
    """Map a single BMI value to a weight-status label.

    Bands are left-inclusive:
        bmi < 18.5        -> 'Underweight'
        18.5 <= bmi < 25  -> 'Normal'
        25 <= bmi < 30    -> 'Overweight'
        bmi >= 30         -> 'Obese'

    Parameters
    ----------
    bmi : float or None
        Scalar body-mass index; may be missing (None or NaN).

    Returns
    -------
    str or float
        One of the four labels above, or NaN when ``bmi`` is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # BMI would fall through all branches and be mislabelled 'Obese'
    # (and None would raise TypeError on the first comparison).
    if pd.isna(bmi):
        return np.nan
    if bmi < 18.5:
        return 'Underweight'
    # Reaching here implies bmi >= 18.5, so only the upper bound is checked.
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

# Label each row's BMI with categorize_bmi (defined earlier in this cell).
df['BMICategory'] = df['BMI'].apply(categorize_bmi)

# Create symptom score (row-wise sum of all respiratory symptom columns).
# DataFrame.sum skips NaN by default, so a missing symptom contributes 0.
symptom_cols = ['Wheezing', 'ShortnessOfBreath', 'ChestTightness', 'Coughing', 'NighttimeSymptoms', 'ExerciseInduced']
df['SymptomScore'] = df[symptom_cols].sum(axis=1)

# Create exposure score (row-wise sum of the three exposure columns; NaN skipped as above).
exposure_cols = ['PollutionExposure', 'PollenExposure', 'DustExposure']
df['ExposureScore'] = df[exposure_cols].sum(axis=1)

# Create overall health score.
# Unlike .sum(axis=1) above, `+` propagates NaN: a row missing any one of the
# three inputs gets a NaN HealthScore.
df['HealthScore'] = df['DietQuality'] + df['SleepQuality'] + df['PhysicalActivity']

# Create lung function ratio (FEV1/FVC).
# Dividing by a zero FVC would yield +/-inf, which distorts means/scaling and is
# not caught by isna()/dropna(). Mask zeros to NaN so such rows surface as
# ordinary missing values instead.
fvc = df['LungFunctionFVC']
df['FEV1_FVC_Ratio'] = df['LungFunctionFEV1'] / fvc.where(fvc != 0)

# Preview the engineered columns.
print("New features created:")
print(df[['AgeGroup', 'BMICategory', 'SymptomScore', 'ExposureScore', 'HealthScore', 'FEV1_FVC_Ratio']].head())