In [14]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [15]:
# Read the dataset exported by the earlier preprocessing step into a DataFrame
df = pd.read_csv(filepath_or_buffer="after_pipelining2.csv")



In [16]:
# Quick sanity check: list the column names, then preview the top rows
print("DataFrame Columns:", list(df.columns))
df.head()

DataFrame Columns: ['Smoking_Status', 'Biomass_Fuel_Exposure', 'Occupational_Exposure', 'Family_History_COPD', 'Location', 'Respiratory_Infections_Childhood', 'COPD_Diagnosis', 'Age_Category', 'BMI_category', 'Gender_encoded', 'Occupation_Family_History_Interaction']


Unnamed: 0,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,Location,Respiratory_Infections_Childhood,COPD_Diagnosis,Age_Category,BMI_category,Gender_encoded,Occupation_Family_History_Interaction
0,Former,,,,Lalitpur,,0,adult,overweight,1,
1,Never,,,,Pokhara,,0,old,obese,1,
2,Former,,,,Pokhara,,0,adult,overweight,1,
3,Current,,,,Kathmandu,,1,adult,overweight,0,
4,Never,,,,Pokhara,,0,middle_aged,overweight,1,


In [17]:
# Separate the label from the predictors.
# Change 'COPD_Diagnosis' below if the target column has a different name.
y = df['COPD_Diagnosis']
X = df.drop('COPD_Diagnosis', axis=1)


In [18]:
# Load the preprocessing pipeline object that was pickled to disk earlier.
# The context manager guarantees the file handle is closed once unpickling
# finishes, so it is not leaked for the lifetime of the kernel.
# NOTE(review): pickle.load can execute arbitrary code while deserializing --
# only load 'pipeline2.pkl' if it comes from a trusted source.
with open('pipeline2.pkl', 'rb') as pipeline_file:
    pipeline = pickle.load(pipeline_file)

In [19]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [20]:
# Apply the loaded pipeline to both splits (training first, then test).
# Only transform() is called here, so the pipeline is not refitted in this cell.
X_train_transformed, X_test_transformed = (
    pipeline.transform(split) for split in (X_train, X_test)
)

 'Respiratory_Infections_Childhood'
 'Occupation_Family_History_Interaction']. At least one non-missing value is needed for imputation with strategy='mean'.
 'Respiratory_Infections_Childhood'
 'Occupation_Family_History_Interaction']. At least one non-missing value is needed for imputation with strategy='mean'.


In [21]:
# Candidate classifiers to compare, keyed by display name.
# This cell only constructs the estimators; nothing is fitted here.
# Every estimator with a random component is given a fixed seed so that a
# fresh Restart & Run All reproduces the same scores (a decision tree breaks
# ties between equally good splits at random when no seed is supplied).
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),  # Increased iterations for convergence
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

In [22]:
# Test-set accuracy of each candidate, keyed by model name
model_scores = {}

# Fit every candidate on the training split, then score it on the held-out split
for model_name, model in models.items():
    print(f"Training {model_name}...")
    y_pred = model.fit(X_train_transformed, y_train).predict(X_test_transformed)

    # Overall accuracy plus the per-class precision / recall / F1 breakdown
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    report = classification_report(y_true=y_test, y_pred=y_pred)

    # Record the score, then print the summary for this model
    model_scores[model_name] = accuracy
    print(f"Accuracy of {model_name}: {accuracy * 100:.2f}%")
    print(f"Classification Report for {model_name}:\n{report}")

# Side-by-side view of all accuracies
print("\nModel Scores:", model_scores)

Training Decision Tree...
Accuracy of Decision Tree: 85.00%
Classification Report for Decision Tree:
              precision    recall  f1-score   support

           0       0.93      0.84      0.88       134
           1       0.73      0.86      0.79        66

    accuracy                           0.85       200
   macro avg       0.83      0.85      0.84       200
weighted avg       0.86      0.85      0.85       200

Training Logistic Regression...
Accuracy of Logistic Regression: 85.00%
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.93      0.84      0.88       134
           1       0.73      0.86      0.79        66

    accuracy                           0.85       200
   macro avg       0.83      0.85      0.84       200
weighted avg       0.86      0.85      0.85       200

Training Random Forest...
Accuracy of Random Forest: 85.00%
Classification Report for Random Forest:
              precision  

In [23]:
# Hyperparameter search space for the Random Forest:
# 3 * 4 * 3 * 3 * 2 = 216 combinations in total
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Base estimator handed to the search; the seed is fixed for reproducibility
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by accuracy with 5-fold
# cross-validation, parallelised with n_jobs=-1 and verbose progress logging
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the transformed training split
# (left as the last expression so the notebook displays the fitted search object)
grid_search.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [24]:

# Take the winning estimator selected by the grid search
best_rf = grid_search.best_estimator_

# Score it on the held-out test split and print the summary
y_pred = best_rf.predict(X_test_transformed)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Best Random Forest Model Accuracy: {accuracy * 100:.2f}%")
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

Best Random Forest Model Accuracy: 85.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.84      0.88       134
           1       0.73      0.86      0.79        66

    accuracy                           0.85       200
   macro avg       0.83      0.85      0.84       200
weighted avg       0.86      0.85      0.85       200



In [25]:
# Persist the tuned Random Forest to disk so it can be reloaded later without retraining
with open('random_forest_model2.pkl', mode='wb') as out_file:
    pickle.dump(best_rf, out_file)