In [2]:
import numpy
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the pickled transformer that is applied later via pipe.transform
# (presumably a fitted scikit-learn pipeline — confirm against the notebook that wrote it).
# The context manager guarantees the file handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('pipe.pkl', 'rb') as pipe_file:
    pipe = pickle.load(pipe_file)

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Load the dataset; the COPD_Diagnosis column is used as the label further down
df=pd.read_csv('final_dataset_after_pipelining.csv')

In [6]:
# Separate the label column from the predictor columns
y = df['COPD_Diagnosis']
x = df.drop(columns=['COPD_Diagnosis'])

In [7]:
# Sanity check: (rows, columns) of the feature frame before transformation
x.shape

(1000, 10)

In [8]:
# Apply the loaded pipeline to the features (transform only; nothing is fitted in this notebook)
X_transformed = pipe.transform(x)

In [9]:
# Sanity check: (rows, columns) of the pipeline output
X_transformed.shape

(1000, 8)

In [10]:
# Inspect the transformed row at index 1
X_transformed[1]

array([0., 0., 0., 0., 0., 0., 1., 0.])

In [11]:
# First rows of the untransformed features
x.head()

Unnamed: 0,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,Location,Respiratory_Infections_Childhood,Age_Category,BMI_category,Air_Pollution_Level_category,Gender_encoded
0,Former,1,1,1,Lalitpur,0,adult,overweight,Satisfactory,1
1,Never,1,0,0,Pokhara,1,old,obese,Moderate,1
2,Former,0,0,1,Pokhara,1,adult,overweight,Moderate,1
3,Current,1,0,0,Kathmandu,0,adult,overweight,Poor,0
4,Never,0,0,0,Pokhara,1,middle_aged,overweight,Moderate,1


In [12]:
# Last rows of the untransformed features
x.tail()

Unnamed: 0,Smoking_Status,Biomass_Fuel_Exposure,Occupational_Exposure,Family_History_COPD,Location,Respiratory_Infections_Childhood,Age_Category,BMI_category,Air_Pollution_Level_category,Gender_encoded
995,Current,1,1,1,Bhaktapur,0,old,normal,Moderate,1
996,Never,0,1,0,Bhaktapur,0,middle_aged,overweight,Moderate,1
997,Current,0,0,1,Butwal,1,adult,obese,Moderate,0
998,Former,0,1,1,Kathmandu,0,too_old,normal,Poor,0
999,Former,0,0,0,Kathmandu,0,adult,overweight,Moderate,0


In [13]:
# First seven transformed rows, to compare against the raw rows shown by x.head()
X_transformed[0:7]

array([[1., 0., 0., 0., 1., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 1.],
       [2., 0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [2., 0., 0., 1., 0., 0., 0., 0.],
       [2., 0., 0., 0., 0., 0., 0., 0.]])

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Function to train, evaluate, and save models
def train_evaluate_save(model, model_name):
    """Fit ``model``, print its test-set metrics, and pickle it to disk.

    Fitting uses the notebook-level ``X_train``/``y_train`` split and
    evaluation uses ``X_test``/``y_test``; none of them are passed in.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Label used in the printed report and as the prefix of
            the output file ``<model_name>_model.pkl`` (a relative path).

    Returns:
        None. Metrics are printed and the fitted model is written with pickle.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Report accuracy first, then the per-class breakdown
    print(f"Results for {model_name}:")
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.7f}")
    print("Classification Report:")
    report = classification_report(y_test, predictions)
    print(report)

    # Persist the fitted estimator next to the notebook
    output_path = f'{model_name}_model.pkl'
    with open(output_path, 'wb') as handle:
        pickle.dump(model, handle)
    print(f"{model_name} saved as {output_path}\n")
   

In [16]:
# Decision Tree: seeded for repeatable results, then fitted, evaluated and
# pickled by train_evaluate_save (written to decision_tree_model.pkl)
# NOTE(review): the recorded run reports 1.00 test accuracy — worth checking for
# duplicated rows or label leakage across the train/test split before trusting it.
decision_tree = DecisionTreeClassifier(random_state=42)
train_evaluate_save(decision_tree, "decision_tree")

Results for decision_tree:
Accuracy: 1.0000000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

decision_tree saved as decision_tree_model.pkl



In [17]:
# Random Forest: seeded, then fitted, evaluated and pickled to Random_Forest_model.pkl
random_forest = RandomForestClassifier(random_state=42)
train_evaluate_save(random_forest, "Random_Forest")

# Logistic Regression: seeded, with max_iter raised to 1000 iterations;
# fitted, evaluated and pickled to Logistic_Regression_model.pkl
logistic_regression = LogisticRegression(random_state=42, max_iter=1000)
train_evaluate_save(logistic_regression, "Logistic_Regression")

Results for Random_Forest:
Accuracy: 1.0000000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       134
           1       1.00      1.00      1.00        66

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

Random_Forest saved as Random_Forest_model.pkl

Results for Logistic_Regression:
Accuracy: 0.9550000
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       134
           1       0.93      0.94      0.93        66

    accuracy                           0.95       200
   macro avg       0.95      0.95      0.95       200
weighted avg       0.96      0.95      0.96       200

Logistic_Regression saved as Logistic_Regression_model.pkl



In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
# Hyperparameter grid to search: 3 * 4 * 3 = 36 combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
)

In [20]:
# 5-fold cross-validated search over param_grid, scored by accuracy, with
# n_jobs=-1 to run candidate fits in parallel.
# The base forest is seeded so repeated runs select the same parameters,
# matching the random_state=42 used for the other models in this notebook.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='accuracy',
)

In [21]:
# Run the cross-validated search on the training split only; the test split is not touched here
grid_search.fit(X_train,y_train)

In [22]:
# Report the winning hyperparameters and keep the corresponding estimator
print(f"Best Parameters: {grid_search.best_params_}")
best_model = grid_search.best_estimator_

Best Prarmeters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}


In [23]:
# Persist the best estimator selected by the grid search as a pickle file
with open('Best_Random_Forest_Model.pkl','wb') as f:
    pickle.dump(best_model,f)

print('Model refinement completed and best model saved')

Model refinement completed and best model saved
