**Importing the Dependencies**

In [62]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import warnings
warnings.filterwarnings('ignore')

**About Dataset**

Gender: M(male)=0, F(female)=1

Age: Age of the patient

Smoking: YES=2 , NO=1.

Yellow fingers: YES=2 , NO=1.

Anxiety: YES=2 , NO=1.

Peer_pressure: YES=2 , NO=1.

Chronic Disease: YES=2 , NO=1.

Fatigue: YES=2 , NO=1.

Allergy: YES=2 , NO=1.

Wheezing: YES=2 , NO=1.

Alcohol: YES=2 , NO=1.

Coughing: YES=2 , NO=1.

Shortness of Breath: YES=2 , NO=1.

Swallowing Difficulty: YES=2 , NO=1.

Chest pain: YES=2 , NO=1.

Lung Cancer: YES=1 , NO=0.

**Data Collection and Processing**

In [63]:
# loading the csv data to a Pandas DataFrame
data = pd.read_csv('survey lung cancer.csv')

In [64]:
# print first 5 rows of the dataset
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [65]:
# number of rows and columns in the dataset
data.shape

(309, 16)

In [66]:
# getting some info about the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [67]:
# checking for missing values
data.isnull().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

In [68]:
# checking number of rows and columns in the dataset again
data.shape

(309, 16)

In [69]:
# statistical measures about the data
data.describe()

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
count,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0
mean,62.673139,1.563107,1.569579,1.498382,1.501618,1.504854,1.673139,1.556634,1.556634,1.556634,1.579288,1.640777,1.469256,1.556634
std,8.210301,0.496806,0.495938,0.500808,0.500808,0.500787,0.469827,0.497588,0.497588,0.497588,0.494474,0.480551,0.499863,0.497588
min,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,57.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,62.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
75%,69.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,87.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [70]:
# checking the distribution of Target Variable
data['LUNG_CANCER'].value_counts()

LUNG_CANCER
YES    270
NO      39
Name: count, dtype: int64

In [71]:
# replacing Target values with 0 and 1 instead of YES and NO.
data['LUNG_CANCER'].replace('YES',1,inplace=True)
data['LUNG_CANCER'].replace('NO',0,inplace=True)

In [72]:
# replacing Gender values with 0 and 1 instead of male(M) and female(F).
data['GENDER'].replace('F',1,inplace=True)
data['GENDER'].replace('M',0,inplace=True)

In [73]:
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,0,69,1,2,2,1,1,2,1,2,2,2,2,2,2,1
1,0,74,2,1,1,1,2,2,2,1,1,1,2,2,2,1
2,1,59,1,1,1,2,1,2,1,2,1,2,2,1,2,0
3,0,63,2,2,2,1,1,1,1,1,2,1,1,2,2,0
4,1,63,1,2,1,1,1,1,1,2,1,2,2,1,1,0


In [74]:
data['LUNG_CANCER'].value_counts()

LUNG_CANCER
1    270
0     39
Name: count, dtype: int64

**Lung Cancer: YES(1), NO(0).**

**Splitting the Features and Target**

In [75]:
X = data.drop(columns='LUNG_CANCER', axis=1)
Y = data['LUNG_CANCER']

In [76]:
print(X)

     GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0         0   69        1               2        2              1   
1         0   74        2               1        1              1   
2         1   59        1               1        1              2   
3         0   63        2               2        2              1   
4         1   63        1               2        1              1   
..      ...  ...      ...             ...      ...            ...   
304       1   56        1               1        1              2   
305       0   70        2               1        1              1   
306       0   58        2               1        1              1   
307       0   67        2               1        2              1   
308       0   62        1               1        1              2   

     CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  \
0                  1         2         1         2                  2   
1                  2     

In [77]:
print(Y)

0      1
1      1
2      0
3      0
4      0
      ..
304    1
305    1
306    1
307    1
308    1
Name: LUNG_CANCER, Length: 309, dtype: int64


**Splitting the Data into Training data & Test Data**

In [78]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, test_size=0.1, random_state=2)

In [79]:
print(X.shape, X_train.shape, X_test.shape)

(309, 15) (278, 15) (31, 15)


In [80]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [81]:
print(X_train)

[[-0.95772719 -0.68687706 -1.17305941 ...  0.73786479  1.08253175
   0.90387691]
 [-0.95772719  0.03733971 -1.17305941 ...  0.73786479 -0.92376043
  -1.10634533]
 [-0.95772719 -0.32476867 -1.17305941 ...  0.73786479 -0.92376043
   0.90387691]
 ...
 [ 1.04413867 -0.92828265  0.85247174 ... -1.35526185 -0.92376043
  -1.10634533]
 [-0.95772719  0.76155649  0.85247174 ...  0.73786479  1.08253175
  -1.10634533]
 [-0.95772719 -0.32476867  0.85247174 ...  0.73786479 -0.92376043
   0.90387691]]


In [82]:
print(X_test)

[[ 1.04413867 -0.80757986 -1.17305941 -1.14746097  1.02920322  1.
   1.          0.69189841  0.89081333  0.91709302  0.89081333 -1.17305941
   0.73786479  1.08253175  0.90387691]
 [ 1.04413867 -0.56617426  0.85247174  0.87148934  1.02920322  1.
  -1.          0.69189841 -1.12256964 -1.09040193 -1.12256964  0.85247174
   0.73786479  1.08253175 -1.10634533]
 [-0.95772719  0.88225928  0.85247174 -1.14746097 -0.97162541 -1.
  -1.          0.69189841 -1.12256964  0.91709302  0.89081333  0.85247174
   0.73786479 -0.92376043  0.90387691]
 [ 1.04413867 -0.08336308  0.85247174  0.87148934  1.02920322  1.
   1.          0.69189841 -1.12256964 -1.09040193 -1.12256964  0.85247174
   0.73786479 -0.92376043 -1.10634533]
 [ 1.04413867  1.60647606 -1.17305941  0.87148934  1.02920322  1.
   1.          0.69189841  0.89081333  0.91709302 -1.12256964  0.85247174
   0.73786479  1.08253175  0.90387691]
 [ 1.04413867 -0.80757986 -1.17305941  0.87148934 -0.97162541 -1.
   1.          0.69189841  0.89081333  

In [83]:
print(Y_test)

158    1
285    1
264    1
276    1
76     1
66     1
65     1
46     1
259    1
159    0
27     0
153    0
235    1
211    1
161    1
48     1
113    1
303    1
58     1
93     1
291    1
55     1
40     1
160    1
61     0
150    1
287    1
87     1
100    1
90     1
247    1
Name: LUNG_CANCER, dtype: int64


**Model Training**

*Logistic Regression*

In [84]:
model = LogisticRegression(solver='liblinear')

In [85]:
# training the LogisticRegression model with Training data
model.fit(X_train, Y_train)

*xgboost Classifier*

In [86]:
# training Xgb classifier
"""import xgboost as xgb
xgb_model=xgb.XGBClassifier().fit(X_train,Y_train)
y_pred=xgb_model.predict(X_test)

print('Test accuracy: ',accuracy_score(Y_test,y_pred))
print('Confusion matrix:\n',confusion_matrix(Y_test,y_pred))"""

"import xgboost as xgb\nxgb_model=xgb.XGBClassifier().fit(X_train,Y_train)\ny_pred=xgb_model.predict(X_test)\n\nprint('Test accuracy: ',accuracy_score(Y_test,y_pred))\nprint('Confusion matrix:\n',confusion_matrix(Y_test,y_pred))"

*Naive Bayes*

In [87]:
gnb = GaussianNB()

In [88]:
# training the Naive Bayes model with Training data
gnb.fit(X_train, Y_train)

*Decision Tree Classifier*

In [89]:
criterions = ['gini', 'entropy']
parameters = dict(criterion=criterions)
dtc = GridSearchCV(
    DecisionTreeClassifier(), parameters, cv=5, scoring='accuracy'
)
dtc.fit(X, Y.ravel())
dtc_opt = dtc.best_estimator_
print(dtc.best_params_)
print(dtc.best_score_)

{'criterion': 'entropy'}
0.8836065573770492


In [90]:
dtc = DecisionTreeClassifier(criterion='gini')
dtc.fit(X_train, Y_train.ravel())
dtc_pred = dtc.predict(X_test)
score = accuracy_score(dtc_pred, Y_test)
print(score)

0.9354838709677419


**Model Evaluation**

**Accuracy Score**

In [91]:
# accuracy on training data in Logistic Regression
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [92]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.935251798561151


In [93]:
# accuracy on test data in Logistic Regression
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [94]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9354838709677419


In [95]:
Results = pd.DataFrame({'Actual':Y_test,'Predictions':X_test_prediction})
Results.head(13)

Unnamed: 0,Actual,Predictions
158,1,1
285,1,1
264,1,1
276,1,1
76,1,1
66,1,1
65,1,1
46,1,1
259,1,1
159,0,1


In [96]:
# accuracy on training data in xgb
"""X_train_prediction_xgb = xgb_model.predict(X_train)
training_data_accuracy_xgb = accuracy_score(X_train_prediction_xgb, Y_train)"""

'X_train_prediction_xgb = xgb_model.predict(X_train)\ntraining_data_accuracy_xgb = accuracy_score(X_train_prediction_xgb, Y_train)'

In [97]:
#print('Accuracy on Training data : ', training_data_accuracy_xgb)

In [98]:
# accuracy on test data in xgb
#X_test_prediction_xgb = xgb_model.predict(X_test)
#test_data_accuracy_xgb = accuracy_score(X_test_prediction_xgb, Y_test)

In [99]:
#print('Accuracy on Test data : ', test_data_accuracy_xgb)

In [100]:
# accuracy on training data in Naive Bayes
X_tr_predict_nb = gnb.predict(X_train)
training_data_accuracy_nb = accuracy_score(X_tr_predict_nb, Y_train)

In [101]:
print('Accuracy on Training data : ', training_data_accuracy_nb)

Accuracy on Training data :  0.9064748201438849


In [102]:
# accuracy on test data in Naive Bayes
X_test_predict_nb = gnb.predict(X_test)
test_data_accuracy_nb = accuracy_score(X_test_predict_nb, Y_test)

In [103]:
print('Accuracy on Test data : ', test_data_accuracy_nb)

Accuracy on Test data :  0.9354838709677419


In [104]:
Results = pd.DataFrame({'Actual':Y_test,'Predictions':X_test_predict_nb})
Results.head(13)

Unnamed: 0,Actual,Predictions
158,1,1
285,1,1
264,1,1
276,1,1
76,1,1
66,1,1
65,1,1
46,1,1
259,1,1
159,0,1


In [105]:
# accuracy on training data in Decision Tree Classifier
X_train_prediction_dtc = dtc.predict(X_train)
training_data_accuracy_dtc = accuracy_score(X_train_prediction_dtc, Y_train)

In [106]:
print('Accuracy on Training data : ', training_data_accuracy_dtc)

Accuracy on Training data :  0.9964028776978417


In [107]:
# accuracy on testing data in Decision Tree Classifier
X_test_prediction_dtc = dtc.predict(X_test)
test_data_accuracy_dtc = accuracy_score(X_test_prediction_dtc, Y_test)

In [108]:
print('Accuracy on Test data : ', test_data_accuracy_dtc)

Accuracy on Test data :  0.9354838709677419


In [109]:
Results = pd.DataFrame({'Actual':Y_test,'Predictions':X_test_prediction_dtc})
Results.head(13)

Unnamed: 0,Actual,Predictions
158,1,1
285,1,1
264,1,1
276,1,1
76,1,1
66,1,1
65,1,1
46,1,1
259,1,1
159,0,0


So, we find that Decision Tree Classifier model has give the maximum accuracy on this dataset. We will choose DTC model for our predictive system.

Building a Predictive System

In [110]:
input_data = (1,63,1,2,1,1,1,1,1,2,1,2,2,1,1)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scaler.transform(input_data_reshaped)

prediction = dtc.predict(std_data)

if (prediction == 0):
  print('No possibility of Lung cancer')
else:
  print('hign possibility of Lung cancer')

No possibility of Lung cancer


**Saving The Trained Model**

In [111]:
import pickle

In [112]:
file = 'trained_model_Lung_cancer.sav'
pickle.dump(dtc, open(file, 'wb'))

In [113]:
#loading the saved model
loaded_model = pickle.load(open('trained_model_Lung_cancer.sav', 'rb'))

In [114]:
input_data = (1,63,1,2,1,1,1,1,1,2,1,2,2,1,1)

# change the input data to a numpy array
input_data_as_numpy_array= np.asarray(input_data)

# reshape the numpy array as we are predicting for only on instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# standardize the input data
std_data = scaler.transform(input_data_reshaped)

prediction = loaded_model.predict(std_data)

if (prediction[0]== 0):
  print('No possibility of Lung cancer')
else:
  print('hign possibility of Lung cancer')

No possibility of Lung cancer
