In [25]:
import pandas as pd
import numpy as np
import warnings
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

In [3]:
warnings.filterwarnings('ignore')

In [4]:
# Load the asthma dataset from the Kaggle input directory into `df`.
csv_path = '/kaggle/input/asthma/asthma_disease_data.csv'
df = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,PhysicalActivity,DietQuality,SleepQuality,...,LungFunctionFEV1,LungFunctionFVC,Wheezing,ShortnessOfBreath,ChestTightness,Coughing,NighttimeSymptoms,ExerciseInduced,Diagnosis,DoctorInCharge
0,5034,63,0,1,0,15.848744,0,0.894448,5.488696,8.701003,...,1.369051,4.941206,0,0,1,0,0,1,0,Dr_Confid
1,5035,26,1,2,2,22.757042,0,5.897329,6.341014,5.153966,...,2.197767,1.702393,1,0,0,1,1,1,0,Dr_Confid
2,5036,57,0,2,1,18.395396,0,6.739367,9.196237,6.840647,...,1.698011,5.022553,1,1,1,0,1,1,0,Dr_Confid
3,5037,40,1,2,1,38.515278,0,1.404503,5.826532,4.253036,...,3.032037,2.300159,1,0,1,1,1,0,0,Dr_Confid
4,5038,61,0,0,3,19.283802,0,4.604493,3.127048,9.625799,...,3.470589,3.067944,1,1,1,0,0,1,0,Dr_Confid


In [6]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   PatientID               2392 non-null   int64  
 1   Age                     2392 non-null   int64  
 2   Gender                  2392 non-null   int64  
 3   Ethnicity               2392 non-null   int64  
 4   EducationLevel          2392 non-null   int64  
 5   BMI                     2392 non-null   float64
 6   Smoking                 2392 non-null   int64  
 7   PhysicalActivity        2392 non-null   float64
 8   DietQuality             2392 non-null   float64
 9   SleepQuality            2392 non-null   float64
 10  PollutionExposure       2392 non-null   float64
 11  PollenExposure          2392 non-null   float64
 12  DustExposure            2392 non-null   float64
 13  PetAllergy              2392 non-null   int64  
 14  FamilyHistoryAsthma     2392 non-null   

In [7]:
# Count missing values per column (every column reported 0 in the recorded run).
df.isna().sum()

PatientID                 0
Age                       0
Gender                    0
Ethnicity                 0
EducationLevel            0
BMI                       0
Smoking                   0
PhysicalActivity          0
DietQuality               0
SleepQuality              0
PollutionExposure         0
PollenExposure            0
DustExposure              0
PetAllergy                0
FamilyHistoryAsthma       0
HistoryOfAllergies        0
Eczema                    0
HayFever                  0
GastroesophagealReflux    0
LungFunctionFEV1          0
LungFunctionFVC           0
Wheezing                  0
ShortnessOfBreath         0
ChestTightness            0
Coughing                  0
NighttimeSymptoms         0
ExerciseInduced           0
Diagnosis                 0
DoctorInCharge            0
dtype: int64

In [8]:
# Drop the PatientID and DoctorInCharge columns before modelling.
# The frame is reassigned instead of mutated in place, and errors='ignore'
# makes the cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
df = df.drop(columns=['DoctorInCharge', 'PatientID'], errors='ignore')

In [9]:
# Separate the target column ('Diagnosis') from the remaining feature columns.
y = df.loc[:, 'Diagnosis']
X = df.drop(columns=['Diagnosis'])

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Standardize the features: the scaler is fit on the training split only and
# the same fitted transform is then applied to both splits.
# NOTE(review): X_train / X_test are overwritten here, so running this cell a
# second time would re-fit the scaler on already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
# Candidate classifiers and the hyper-parameter grid searched for each one.
# Every entry maps a display name to {'model': estimator, 'params': grid};
# all estimators share random_state=1.
models = {}

models['RandomForest'] = {
    'model': RandomForestClassifier(random_state=1),
    'params': {
        'n_estimators': [50, 100],
        'max_depth': [None, 10, 20],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [2, 4],
    },
}

# probability=True is set on the SVC so that predict_proba is available.
models['SVM'] = {
    'model': SVC(probability=True, random_state=1),
    'params': {
        'C': [1, 10, 100],
        'gamma': [0.01, 0.001],
        'kernel': ['rbf', 'linear'],
    },
}

models['GradientBoosting'] = {
    'model': GradientBoostingClassifier(random_state=1),
    'params': {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
        'subsample': [0.8, 0.9],
    },
}

# Tune every candidate with a 5-fold grid search on the training split and
# record, per model name, the refit best estimator and its mean CV accuracy.
best_estimators = {}
best_scores = {}
best_cv_scores = {}

for model_name, model_data in models.items():
    clf = GridSearchCV(model_data['model'], model_data['params'], cv=5, scoring='accuracy', n_jobs=-1)
    clf.fit(X_train, y_train)
    best_estimators[model_name] = clf.best_estimator_
    best_scores[model_name] = clf.best_score_
    # best_score_ is already the mean 5-fold accuracy of the winning parameter
    # set (same cv, same metric, fixed seeds), so it is reused here instead of
    # re-fitting the winner five more times with cross_val_score.
    cv_score = clf.best_score_
    best_cv_scores[model_name] = cv_score
    print(f"Best parameters for {model_name}: {clf.best_params_}")
    print(f"Best cross-validation score for {model_name}: {cv_score}")

# NOTE(review): the recorded run printed the identical score for all three
# models; check the class balance of y to see whether accuracy is dominated by
# the majority class - TODO confirm.
# On a tie max() keeps the first key it meets, i.e. the insertion order of `models`.
best_model_name = max(best_cv_scores, key=best_cv_scores.get)
best_model = best_estimators[best_model_name]

print(f"Best model: {best_model_name} with CV score: {best_cv_scores[best_model_name]}")

# Recursive feature elimination: keep 10 features, removing one per iteration,
# using the selected model as the ranking estimator.
# NOTE(review): RFE presumably needs an estimator exposing coef_ or
# feature_importances_; confirm this step still works if an RBF-kernel SVM is
# ever the selected model.
selector = RFE(estimator=best_model, n_features_to_select=10, step=1).fit(X_train, y_train)

# Project both splits onto the retained feature subset.
X_train_rfe, X_test_rfe = (selector.transform(split) for split in (X_train, X_test))

# Fit the selected model on the reduced training matrix.
best_model.fit(X_train_rfe, y_train)

Best parameters for RandomForest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 50}
Best cross-validation score for RandomForest: 0.9472038057222534
Best parameters for SVM: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
Best cross-validation score for SVM: 0.9472038057222534
Best parameters for GradientBoosting: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
Best cross-validation score for GradientBoosting: 0.9472038057222534
Best model: RandomForest with CV score: 0.9472038057222534


In [21]:
# Predict labels for the RFE-reduced training and test matrices.
y_train_pred, y_test_pred = (
    best_model.predict(features) for features in (X_train_rfe, X_test_rfe)
)

In [22]:
# Accuracy on the training split and on the held-out test split.
train_accuracy, test_accuracy = (
    accuracy_score(truth, predicted)
    for truth, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

In [23]:
# Display (train, test) accuracy side by side to compare fit against generalization.
(train_accuracy, test_accuracy)

(0.9560899111343439, 0.9519832985386222)

In [26]:
# Persist the final classifier (file name and content unchanged).
with open('asthma.pkl', 'wb') as file:
    pickle.dump(best_model, file)

# best_model was fit on standardized, RFE-reduced features, so it cannot score
# raw rows on its own. Store the fitted scaler and selector next to it so the
# same preprocessing can be replayed at inference time:
#   selector.transform(scaler.transform(raw_features)) -> best_model.predict(...)
# NOTE: only unpickle these files from a trusted source; pickle can execute
# arbitrary code on load.
with open('asthma_preprocessing.pkl', 'wb') as file:
    pickle.dump({'scaler': scaler, 'selector': selector}, file)