In [1]:
import pandas as pd
import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from joblib import dump

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("Datasets/survey lung cancer.csv")
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [3]:
encoder = LabelEncoder()
scaler = StandardScaler()

data["GENDER"] = encoder.fit_transform(data["GENDER"])
data["LUNG_CANCER"] = encoder.fit_transform(data["LUNG_CANCER"])

data.head()
data.corr()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
GENDER,1.0,0.021306,0.036277,-0.212959,-0.152127,-0.275564,-0.204606,-0.08356,0.154251,0.141207,0.454268,0.133303,-0.064911,-0.078161,0.362958,0.067254
AGE,0.021306,1.0,-0.084475,0.005205,0.05317,0.018685,-0.012642,0.012614,0.02799,0.055011,0.058985,0.16995,-0.017513,-0.00127,-0.018104,0.089465
SMOKING,0.036277,-0.084475,1.0,-0.014585,0.160267,-0.042822,-0.141522,-0.029575,0.001913,-0.129426,-0.050623,-0.129471,0.061264,0.030718,0.120117,0.058179
YELLOW_FINGERS,-0.212959,0.005205,-0.014585,1.0,0.565829,0.323083,0.041122,-0.118058,-0.1443,-0.078515,-0.289025,-0.01264,-0.105944,0.345904,-0.104829,0.181339
ANXIETY,-0.152127,0.05317,0.160267,0.565829,1.0,0.216841,-0.009678,-0.188538,-0.16575,-0.191807,-0.16575,-0.225644,-0.144077,0.489403,-0.113634,0.144947
PEER_PRESSURE,-0.275564,0.018685,-0.042822,0.323083,0.216841,1.0,0.048515,0.078148,-0.0818,-0.068771,-0.159973,-0.089019,-0.220175,0.36659,-0.094828,0.186388
CHRONIC DISEASE,-0.204606,-0.012642,-0.141522,0.041122,-0.009678,0.048515,1.0,-0.110529,0.106386,-0.049967,0.00215,-0.175287,-0.026459,0.075176,-0.036938,0.110891
FATIGUE,-0.08356,0.012614,-0.029575,-0.118058,-0.188538,0.078148,-0.110529,1.0,0.003056,0.141937,-0.191377,0.146856,0.441745,-0.13279,-0.010832,0.150673
ALLERGY,0.154251,0.02799,0.001913,-0.1443,-0.16575,-0.0818,0.106386,0.003056,1.0,0.173867,0.344339,0.189524,-0.030056,-0.061508,0.239433,0.327766
WHEEZING,0.141207,0.055011,-0.129426,-0.078515,-0.191807,-0.068771,-0.049967,0.141937,0.173867,1.0,0.265659,0.374265,0.037834,0.069027,0.14764,0.2493


In [4]:
X = data.drop(columns=['LUNG_CANCER'])
y = data['LUNG_CANCER']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
dup = data[data.duplicated()].shape[0]
print(f"There are {dup} duplicate entries among {data.shape[0]} entries in this dataset.")

data.drop_duplicates(keep='first',inplace=True)
print(f"\nAfter removing duplicate entries there are {data.shape[0]} entries in this dataset.")

There are 33 duplicate entries among 309 entries in this dataset.

After removing duplicate entries there are 276 entries in this dataset.


In [6]:
regularization_parameter = 0.003

neural_model = Sequential([
    Input(shape=(X_train.shape[-1],)),
    Dense(units=32, activation="relu", kernel_regularizer=regularizers.l1(regularization_parameter)),
    Dropout(0.3),

    Dense(units=64, activation="relu", kernel_regularizer=regularizers.l1(regularization_parameter)),
    Dense(units=128, activation="relu", kernel_regularizer=regularizers.l1(regularization_parameter)),
    Dropout(0.3),

    Dense(units=16,activation="relu", kernel_regularizer=regularizers.l1(regularization_parameter)),
    Dense(units=1, activation="sigmoid"), ])

In [7]:
adam = Adam(learning_rate=0.002)
neural_model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
neural_model.summary()

In [8]:
early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)

history = neural_model.fit(X_train, y_train, epochs=200, verbose=1, batch_size=64,
                           validation_data=(X_test, y_test), callbacks=[early_stopping])

print(f"Best validation accuracy: {max(history.history['val_accuracy']) * 100:.2f}%")

Epoch 1/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 112ms/step - accuracy: 0.7922 - loss: 4.3493 - val_accuracy: 0.8710 - val_loss: 4.1308
Epoch 2/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8446 - loss: 4.0250 - val_accuracy: 0.8710 - val_loss: 3.8350
Epoch 3/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.8670 - loss: 3.7246 - val_accuracy: 0.8710 - val_loss: 3.5680
Epoch 4/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8670 - loss: 3.4746 - val_accuracy: 0.8710 - val_loss: 3.3333
Epoch 5/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8482 - loss: 3.2965 - val_accuracy: 0.8710 - val_loss: 3.1099
Epoch 6/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8696 - loss: 3.0293 - val_accuracy: 0.8710 - val_loss: 2.8935
Epoch 7/200
[1m4/4[0m [32m━━━━━━━━━━

In [10]:
y_pred = neural_model.predict(X_test)
pred = (y_pred > 0.5).astype(int)

accuracy = classification_report(y_test, pred, output_dict=True)['accuracy']
print("Test Accuracy:", accuracy, '\n', classification_report(y_test, pred))


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Test Accuracy: 0.9516129032258065 
               precision    recall  f1-score   support

           0       0.78      0.88      0.82         8
           1       0.98      0.96      0.97        54

    accuracy                           0.95        62
   macro avg       0.88      0.92      0.90        62
weighted avg       0.95      0.95      0.95        62



In [11]:
neural_model.save('Models/lung_classifier.h5')

metadata = {"std_scaler": scaler}
dump(metadata, "Models/lung_metadata.pkl")



['Models/lung_metadata.pkl']

#### Now let's save a few samples for testing later:   

In [12]:
subset = data.groupby('LUNG_CANCER', group_keys=False).apply(lambda x: x.sample(min(len(x), 5)))
subset = subset.drop(columns=['LUNG_CANCER'])

for i in range(subset.shape[0]):
    subset.iloc[i].to_csv(f"Test Samples/lung/sample_{i+1}.csv", index=False)

let's test the model on the saved samples:  

In [13]:
for i in range(subset.shape[0]):
    sample = pd.read_csv(f"Test Samples/lung/sample_{i+1}.csv")
    sample = scaler.transform(sample.to_numpy().reshape(1, -1))
    pred = neural_model.predict(sample)
    pred = (pred > 0.5).astype(int)
    print(f"Sample {i+1} prediction:", pred)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Sample 1 prediction: [[0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Sample 2 prediction: [[0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Sample 3 prediction: [[0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Sample 4 prediction: [[0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Sample 5 prediction: [[0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
Sample 6 prediction: [[1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Sample 7 prediction: [[1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Sample 8 prediction: [[1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Sample 9 prediction: [[1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Sample 10 prediction: [[1]]


#### It seems to be working correctly, than means our work here is finally done!  

### End of Notebook