In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
data = pd.read_csv("/content/survey lung cancer.csv")
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [6]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

encoder = LabelEncoder()
scaler = MinMaxScaler(feature_range=(1, 2))

data["GENDER"] = encoder.fit_transform(data["GENDER"])
data["LUNG_CANCER"] = encoder.fit_transform(data["LUNG_CANCER"])

data["AGE"] = scaler.fit_transform(data[["AGE"]])

data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,1.727273,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,1,1.80303,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,0,1.575758,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,1,1.636364,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,0,1.636364,1,2,1,1,1,1,1,2,1,2,2,1,1,0


In [7]:
from sklearn.model_selection import train_test_split

X = data.drop(columns=['LUNG_CANCER'])
y = data['LUNG_CANCER']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=data["LUNG_CANCER"], random_state=40)

In [14]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
parameter = {'n_estimators':[10, 50, 100, 200],
             'max_depth':[None, 10, 20, 30, 40],
             'min_samples_split':[2, 5, 10],
             'min_samples_leaf':[1, 2, 4],
             'bootstrap':[True, False]}

In [15]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score , classification_report

grid = GridSearchCV(estimator=rf, param_grid=parameter, cv=5, verbose=2, n_jobs=1)
grid.fit(X_train,y_train)
best_grid=grid.best_estimator_
ypred_grid=best_grid.predict(X_test)

print("best parameters found by grid search:", grid.best_params_)
print("grid search accuracy :", accuracy_score(y_test,ypred_grid))
print("\ngrid search classification report :\n", classification_report(y_test,ypred_grid))

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=

In [17]:
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from collections import Counter


In [18]:
# Create a synthetic imbalanced dataset
X, y = make_classification(n_classes=2, class_sep=2,
                           weights=[0.9, 0.1],  # Imbalanced
                           n_informative=3, n_redundant=1,
                           flip_y=0, n_features=20,
                           n_clusters_per_class=1, n_samples=1000, random_state=42)

# Check the class distribution
print(f"Original class distribution: {Counter(y)}")


Original class distribution: Counter({0: 900, 1: 100})


In [19]:
# Initialize SMOTE
smote = SMOTE(random_state=42)

# Resample the training data
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Check the resampled class distribution
print(f"Resampled class distribution: {Counter(y_resampled)}")


Resampled class distribution: Counter({1: 216, 0: 216})


In [25]:
# Train a classifier
clf = RandomForestClassifier(random_state=30)
clf.fit(X_resampled, y_resampled)

# Evaluate on the original test set
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.62      0.59         8
           1       0.94      0.93      0.93        54

    accuracy                           0.89        62
   macro avg       0.75      0.78      0.76        62
weighted avg       0.89      0.89      0.89        62



In [26]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_resampled, y_resampled)

# Evaluate on the original test set
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.44      0.50      0.47         8
           1       0.92      0.91      0.92        54

    accuracy                           0.85        62
   macro avg       0.68      0.70      0.69        62
weighted avg       0.86      0.85      0.86        62



In [28]:
from imblearn.over_sampling import ADASYN
# Create a synthetic imbalanced dataset
X, y = make_classification(n_classes=2, class_sep=2,
                           weights=[0.9, 0.1],  # Imbalanced
                           n_informative=3, n_redundant=1,
                           flip_y=0, n_features=20,
                           n_clusters_per_class=1, n_samples=1000, random_state=42)

# Check the class distribution
print(f"Original class distribution: {Counter(y)}")

Original class distribution: Counter({0: 900, 1: 100})


In [30]:

ada = ADASYN(random_state=42)

# Resample the training data
X_resampled, y_resampled = ada.fit_resample(X_train, y_train)

# Check the resampled class distribution
print(f"Resampled class distribution: {Counter(y_resampled)}")


Resampled class distribution: Counter({0: 218, 1: 216})


In [35]:
clf2 = RandomForestClassifier(class_weight='balanced',random_state=38)
clf2.fit(X_resampled, y_resampled)

# Evaluate on the original test set
y_pred = clf2.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.62      0.59         8
           1       0.94      0.93      0.93        54

    accuracy                           0.89        62
   macro avg       0.75      0.78      0.76        62
weighted avg       0.89      0.89      0.89        62



In [36]:
import xgboost as xgb
from xgboost import XGBClassifier

X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=15, n_redundant=5,
                           n_classes=2, weights=[0.8, 0.2],
                           random_state=42)



In [37]:
# Initialize the XGBoost classifier
xgb_clf = XGBClassifier(
    scale_pos_weight=7,  # Adjust for class imbalance (majority/minority class ratio)
    use_label_encoder=False,  # Avoids a warning in XGBoost >= 1.3
    eval_metric='logloss',  # Evaluation metric
    random_state=42
)

# Train the classifier
xgb_clf.fit(X_train, y_train)
# Make predictions on the test set
y_pred = xgb_clf.predict(X_test)
y_pred_proba = xgb_clf.predict_proba(X_test)[:, 1]  # Probabilities for the positive class
# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8387096774193549
Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.12      0.17         8
           1       0.88      0.94      0.91        54

    accuracy                           0.84        62
   macro avg       0.56      0.53      0.54        62
weighted avg       0.80      0.84      0.81        62



Parameters: { "use_label_encoder" } are not used.

