In [1]:
import pandas as pd
import warnings

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from joblib import dump

warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv("Datasets/survey lung cancer.csv")
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [3]:
encoder = LabelEncoder()
scaler = StandardScaler()

data["GENDER"] = encoder.fit_transform(data["GENDER"])
data["LUNG_CANCER"] = encoder.fit_transform(data["LUNG_CANCER"])

data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,1,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,0,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,1,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,0,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0


In [4]:
X = data.drop(columns=['LUNG_CANCER'])
y = data['LUNG_CANCER']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=y, random_state=42)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
dup = data[data.duplicated()].shape[0]
print(f"There are {dup} duplicate entries among {data.shape[0]} entries in this dataset.")

data.drop_duplicates(keep='first',inplace=True)
print(f"\nAfter removing duplicate entries there are {data.shape[0]} entries in this dataset.")

There are 33 duplicate entries among 309 entries in this dataset.

After removing duplicate entries there are 276 entries in this dataset.


In [11]:
regularization_parameter = 0.003

neural_model = Sequential([
    Input(shape=(X_train.shape[-1],)),
    Dense(units=32, activation="relu", kernel_regularizer=regularizers.l1(regularization_parameter)),
    Dropout(0.3),

    Dense(units=64, activation="relu", kernel_regularizer=regularizers.l1(regularization_parameter)),
    Dense(units=128, activation="relu", kernel_regularizer=regularizers.l1(regularization_parameter)),
    Dropout(0.3),

    Dense(units=16,activation="relu", kernel_regularizer=regularizers.l1(regularization_parameter)),
    Dense(units=1, activation="sigmoid"), ])

In [12]:
adam = Adam(learning_rate=0.002)
neural_model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
neural_model.summary()

In [13]:
early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)

history = neural_model.fit(X_train, y_train, epochs=200, verbose=1, batch_size=64,
                           validation_data=(X_test, y_test), callbacks=[early_stopping])

print(f"Best validation accuracy: {max(history.history['val_accuracy']) * 100:.2f}%")

Epoch 1/200


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 230ms/step - accuracy: 0.5839 - loss: 4.4529 - val_accuracy: 0.8710 - val_loss: 4.1162
Epoch 2/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.8685 - loss: 4.0577 - val_accuracy: 0.8710 - val_loss: 3.8577
Epoch 3/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.8743 - loss: 3.7958 - val_accuracy: 0.8710 - val_loss: 3.6354
Epoch 4/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.8915 - loss: 3.5435 - val_accuracy: 0.8710 - val_loss: 3.4183
Epoch 5/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.8805 - loss: 3.3462 - val_accuracy: 0.8710 - val_loss: 3.2029
Epoch 6/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.8925 - loss: 3.1484 - val_accuracy: 0.8710 - val_loss: 2.9958
Epoch 7/200
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[

In [14]:
y_pred = neural_model.predict(X_test)
pred = (y_pred > 0.5).astype(int)

accuracy = classification_report(y_test, pred, output_dict=True)['accuracy']
print("Test Accuracy:", accuracy, '\n', classification_report(y_test, pred))


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 91ms/step
Test Accuracy: 0.9516129032258065 
               precision    recall  f1-score   support

           0       0.78      0.88      0.82         8
           1       0.98      0.96      0.97        54

    accuracy                           0.95        62
   macro avg       0.88      0.92      0.90        62
weighted avg       0.95      0.95      0.95        62



In [15]:
neural_model.save('Models/lung_calassifier.h5')

metadata = {"std_scaler": scaler}
dump(metadata, "Models/lung_metadata.pkl")



['Models/lung_metadata.pkl']

#### Now let's save a few samples for testing later:   

In [11]:
subset = data.groupby('LUNG_CANCER', group_keys=False).apply(lambda x: x.sample(min(len(x), 5)))
subset = subset.drop(columns=['LUNG_CANCER'])

for i in range(subset.shape[0]):
    subset.iloc[i].to_csv(f"Test Samples/lung/sample_{i+1}.csv", index=False)

let's test the model on the saved samples:  

In [12]:
for i in range(subset.shape[0]):
    sample = pd.read_csv(f"Test Samples/lung/sample_{i+1}.csv")
    sample = scaler.transform(sample.to_numpy().reshape(1, -1))
    pred = neural_model.predict(sample)
    pred = (pred > 0.5).astype(int)
    print(f"Sample {i+1} prediction:", pred)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Sample 1 prediction: [[0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Sample 2 prediction: [[0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Sample 3 prediction: [[1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Sample 4 prediction: [[0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Sample 5 prediction: [[0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Sample 6 prediction: [[1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Sample 7 prediction: [[1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Sample 8 prediction: [[1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Sample 9 prediction: [[1]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Sample 10 prediction: [[1]]


#### It seems to be working correctly, than means our work here is finally done!  

### End of Notebook