In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
import pickle

In [2]:
df= pd.read_csv('Lung Cancer Dataset.csv')
df.head()

Unnamed: 0,AGE,GENDER,SMOKING,FINGER_DISCOLORATION,MENTAL_STRESS,EXPOSURE_TO_POLLUTION,LONG_TERM_ILLNESS,ENERGY_LEVEL,IMMUNE_WEAKNESS,BREATHING_ISSUE,ALCOHOL_CONSUMPTION,THROAT_DISCOMFORT,OXYGEN_SATURATION,CHEST_TIGHTNESS,FAMILY_HISTORY,SMOKING_FAMILY_HISTORY,STRESS_IMMUNE,PULMONARY_DISEASE
0,68,1,1,1,1,1,0,57.831178,0,0,1,1,95.977287,1,0,0,0,NO
1,81,1,1,0,0,1,1,47.694835,1,1,0,1,97.184483,0,0,0,0,YES
2,58,1,1,0,0,0,0,59.577435,0,1,1,0,94.974939,0,0,0,0,NO
3,44,0,1,0,1,1,0,59.785767,0,1,0,1,95.1879,0,0,0,0,YES
4,72,0,1,1,1,1,1,59.733941,0,1,0,1,93.503008,0,0,0,0,YES


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   AGE                     5000 non-null   int64  
 1   GENDER                  5000 non-null   int64  
 2   SMOKING                 5000 non-null   int64  
 3   FINGER_DISCOLORATION    5000 non-null   int64  
 4   MENTAL_STRESS           5000 non-null   int64  
 5   EXPOSURE_TO_POLLUTION   5000 non-null   int64  
 6   LONG_TERM_ILLNESS       5000 non-null   int64  
 7   ENERGY_LEVEL            5000 non-null   float64
 8   IMMUNE_WEAKNESS         5000 non-null   int64  
 9   BREATHING_ISSUE         5000 non-null   int64  
 10  ALCOHOL_CONSUMPTION     5000 non-null   int64  
 11  THROAT_DISCOMFORT       5000 non-null   int64  
 12  OXYGEN_SATURATION       5000 non-null   float64
 13  CHEST_TIGHTNESS         5000 non-null   int64  
 14  FAMILY_HISTORY          5000 non-null   

In [4]:
df['PULMONARY_DISEASE']= df['PULMONARY_DISEASE'].map({'NO':0, 'YES':1})

In [5]:
X=df.drop(['PULMONARY_DISEASE'], axis=1)
y=df.PULMONARY_DISEASE

In [6]:
XTrain,XTest, yTrain , yTest= train_test_split(X, y, test_size= 0.3, random_state=42)

In [7]:
scaler= MinMaxScaler()
XTrainScaled=scaler.fit_transform(XTrain)
XTestScaled=scaler.transform(XTest)

In [8]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(XTrainScaled, yTrain)



# Logistic Regression

In [14]:
param_grid_lr = {
    'penalty': ['l1', 'l2', 'elasticnet', None],  
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  
    'solver': ['lbfgs', 'liblinear', 'saga'],  
    'max_iter': [100, 200, 500]
}

lr= LogisticRegression()
grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_lr.fit(X_train_resampled, y_train_resampled)

print("Best parameters:", grid_search_lr.best_params_)
print("Best score:", grid_search_lr.best_score_)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best parameters: {'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'saga'}
Best score: 0.8935417908213557


450 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DIPESH LOHCHAB\AppData\Roaming\Python\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DIPESH LOHCHAB\AppData\Roaming\Python\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DIPESH LOHCHAB\AppData\Roaming\Python\Python312\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.s

In [15]:
lr= LogisticRegression(C=1, max_iter= 100, penalty='l1', solver= 'saga')
lr.fit(X_train_resampled, y_train_resampled)
lr_pred= lr.predict(XTestScaled)
print(classification_report(yTest, lr_pred))

              precision    recall  f1-score   support

           0       0.93      0.86      0.89       859
           1       0.83      0.91      0.87       641

    accuracy                           0.88      1500
   macro avg       0.88      0.88      0.88      1500
weighted avg       0.89      0.88      0.88      1500



# Decision Tree

In [17]:
param_grid_dt = {
    'criterion': ['gini', 'entropy', 'log_loss'],  
    'max_depth': [None, 5, 10, 20, 50],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4],  
    'max_features': ['sqrt', 'log2', None]
}

dt= DecisionTreeClassifier()
grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_dt.fit(X_train_resampled, y_train_resampled)

print("Best parameters:", grid_search_dt.best_params_)
print("Best score:", grid_search_dt.best_score_)

Fitting 5 folds for each of 405 candidates, totalling 2025 fits
Best parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Best score: 0.8935474395654985


In [19]:
dt= DecisionTreeClassifier(criterion='gini', max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=10)
dt.fit(X_train_resampled, y_train_resampled)
dt_pred= dt.predict(XTestScaled)
print(classification_report(yTest, dt_pred))


              precision    recall  f1-score   support

           0       0.90      0.89      0.89       859
           1       0.85      0.87      0.86       641

    accuracy                           0.88      1500
   macro avg       0.88      0.88      0.88      1500
weighted avg       0.88      0.88      0.88      1500



# Random forest

In [20]:
param_grid_rf = {
    'n_estimators': [50, 100, 200, 500],  
    'criterion': ['gini', 'entropy'],  
    'max_depth': [None, 10, 20, 50],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4],  
    'max_features': ['sqrt', 'log2', None],  
    'bootstrap': [True, False]
}
rf= RandomForestClassifier()
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train_resampled, y_train_resampled)

print("Best parameters:", grid_search_rf.best_params_)
print("Best score:", grid_search_rf.best_score_)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits
Best parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Best score: 0.9251555522918367


In [22]:
rf= RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=20,
                           max_features='log2', min_samples_leaf=1, min_samples_split=2, n_estimators=500)
rf.fit(X_train_resampled, y_train_resampled)
rf_pred= rf.predict(XTestScaled)
print(classification_report(yTest, rf_pred))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       859
           1       0.90      0.88      0.89       641

    accuracy                           0.91      1500
   macro avg       0.91      0.90      0.90      1500
weighted avg       0.91      0.91      0.91      1500



In [23]:
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_classifier.fit(X_train_resampled, y_train_resampled)
xgb_pred = xgb_classifier.predict(XTestScaled)
print(classification_report(yTest, xgb_pred))

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.90      0.91      0.90       859
           1       0.88      0.87      0.87       641

    accuracy                           0.89      1500
   macro avg       0.89      0.89      0.89      1500
weighted avg       0.89      0.89      0.89      1500



In [24]:
catboost_classifier = CatBoostClassifier(iterations=1000, learning_rate=0.01, depth=10, verbose=100)
catboost_classifier.fit(X_train_resampled, y_train_resampled)
cat_pred = catboost_classifier.predict(XTestScaled)
print(classification_report(yTest, cat_pred))

0:	learn: 0.6823663	total: 206ms	remaining: 3m 25s
100:	learn: 0.2728336	total: 3.22s	remaining: 28.6s
200:	learn: 0.1980255	total: 6.27s	remaining: 24.9s
300:	learn: 0.1615456	total: 9.32s	remaining: 21.7s
400:	learn: 0.1337232	total: 12.4s	remaining: 18.5s
500:	learn: 0.1126284	total: 15.4s	remaining: 15.4s
600:	learn: 0.0971844	total: 18.4s	remaining: 12.2s
700:	learn: 0.0838729	total: 21.4s	remaining: 9.13s
800:	learn: 0.0734264	total: 24.3s	remaining: 6.05s
900:	learn: 0.0655711	total: 27.3s	remaining: 3s
999:	learn: 0.0592735	total: 30.3s	remaining: 0us
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       859
           1       0.90      0.88      0.89       641

    accuracy                           0.91      1500
   macro avg       0.90      0.90      0.90      1500
weighted avg       0.91      0.91      0.91      1500



In [25]:
import pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(rf, file)

In [26]:
import sklearn
sklearn.__version__

'1.6.0'