In [1]:
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv("Datasets/survey lung cancer.csv")
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [3]:
encoder = LabelEncoder()
scaler = StandardScaler()

data["GENDER"] = encoder.fit_transform(data["GENDER"])
data["LUNG_CANCER"] = encoder.fit_transform(data["LUNG_CANCER"])

data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,1,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,1,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,0,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,1,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,0,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0


In [4]:
X = data.drop(columns=['LUNG_CANCER'])
y = data['LUNG_CANCER']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    stratify=data["LUNG_CANCER"], random_state=42)
scaler.fit(X_train)
X_train = (X_train - scaler.mean_) / scaler.scale_
X_test  = (X_test - scaler.mean_)  / scaler.scale_

In [5]:
dup = data[data.duplicated()].shape[0]
print(f"There are {dup} duplicate entries among {data.shape[0]} entries in this dataset.")

data.drop_duplicates(keep='first',inplace=True)
print(f"\nAfter removing duplicate entries there are {data.shape[0]} entries in this dataset.")

There are 33 duplicate entries among 309 entries in this dataset.

After removing duplicate entries there are 276 entries in this dataset.


In [6]:
regularization_parameter = 0.003

neural_model = Sequential([tf.keras.layers.Dense(units=32, input_dim=(X_train.shape[-1]), activation="relu", kernel_regularizer = regularizers.l1(regularization_parameter)),
                    tf.keras.layers.Dropout(0.3),
                    tf.keras.layers.Dense(units=64, activation="relu", kernel_regularizer = regularizers.l1(regularization_parameter)),
                    tf.keras.layers.Dense(units=128, activation="relu", kernel_regularizer = regularizers.l1(regularization_parameter)),
                    tf.keras.layers.Dropout(0.3),
                    tf.keras.layers.Dense(units=16,activation="relu", kernel_regularizer = regularizers.l1(regularization_parameter)),
                    tf.keras.layers.Dense(units=1, activation="sigmoid")
                    ])

print(neural_model.summary())

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


None


In [7]:
early_stopping = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)

neural_model.compile(optimizer = Adam(learning_rate=0.001),
                     loss="binary_crossentropy",
                     metrics=["accuracy"])

history = neural_model.fit(X_train, y_train, epochs=150, verbose=1, batch_size=64,
                           validation_data=(X_test, y_test), callbacks=[early_stopping])

print(f"Best validation accuracy: {max(history.history['val_accuracy']) * 100:.2f}%")

Epoch 1/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 253ms/step - accuracy: 0.8282 - loss: 4.4068 - val_accuracy: 0.8710 - val_loss: 4.2470
Epoch 2/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.8680 - loss: 4.2241 - val_accuracy: 0.8710 - val_loss: 4.0820
Epoch 3/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.8972 - loss: 4.0285 - val_accuracy: 0.8710 - val_loss: 3.9389
Epoch 4/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.8956 - loss: 3.8975 - val_accuracy: 0.8710 - val_loss: 3.8121
Epoch 5/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.8628 - loss: 3.8061 - val_accuracy: 0.8710 - val_loss: 3.6935
Epoch 6/150
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.8743 - loss: 3.6687 - val_accuracy: 0.8710 - val_loss: 3.5790
Epoch 7/150
[1m4/4[0m [32m━━━━━━━━━━

In [12]:
y_pred = neural_model.predict(X_test)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 


In [13]:
pred = (y_pred > 0.5).astype(int)

accuracy = classification_report(y_test, pred, output_dict=True)['accuracy']
print("Test Accuracy:", accuracy, '\n', classification_report(y_test, pred))


Test Accuracy: 0.9354838709677419 
               precision    recall  f1-score   support

           0       0.75      0.75      0.75         8
           1       0.96      0.96      0.96        54

    accuracy                           0.94        62
   macro avg       0.86      0.86      0.86        62
weighted avg       0.94      0.94      0.94        62



In [14]:
from joblib import dump

neural_model.save('Models/Lung/model.h5')

metadata = {"age_mean": scaler.mean_, "age_std": scaler.scale_}
dump(metadata, "Models/Lung/metadata.pkl")



['Models/Lung/metadata.pkl']

### End of Notebook