In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

In [2]:
lung_cancer_data = pd.read_csv('survey_lung_cancer.csv')

In [3]:
lung_cancer_data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC_DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL_CONSUMING,COUGHING,SHORTNESS_OF_BREATH,SWALLOWING_DIFFICULTY,CHEST_PAIN,LUNG_CANCER
0,1,69,1,2,2,1,1,2,1,2,2,2,2,2,2,2
1,1,74,2,1,1,1,2,2,2,1,1,1,2,2,2,2
2,2,59,1,1,1,2,1,2,1,2,1,2,2,1,2,1
3,1,63,2,2,2,1,1,1,1,1,2,1,1,2,2,1
4,2,63,1,2,1,1,1,1,1,2,1,2,2,1,1,1


In [4]:
lung_cancer_data['LUNG_CANCER'] = lung_cancer_data['LUNG_CANCER'].replace(1, 'No Cancer')
lung_cancer_data['LUNG_CANCER'] = lung_cancer_data['LUNG_CANCER'].replace(2, 'Cancer')
fig = px.pie(lung_cancer_data, names='LUNG_CANCER', title='Pie Chart of LUNG CANCER', labels=['No Cancer', 'Cancer'], color_discrete_sequence=['purple', 'pink'])
fig.show()

In [5]:
X = lung_cancer_data.drop(columns='LUNG_CANCER' , axis=1)
Y = lung_cancer_data['LUNG_CANCER']

In [6]:
print(X)

     GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0         1   69        1               2        2              1   
1         1   74        2               1        1              1   
2         2   59        1               1        1              2   
3         1   63        2               2        2              1   
4         2   63        1               2        1              1   
..      ...  ...      ...             ...      ...            ...   
304       2   56        1               1        1              2   
305       1   70        2               1        1              1   
306       1   58        2               1        1              1   
307       1   67        2               1        2              1   
308       1   62        1               1        1              2   

     CHRONIC_DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL_CONSUMING  \
0                  1         2         1         2                  2   
1                  2     

In [7]:
print(Y)

0         Cancer
1         Cancer
2      No Cancer
3      No Cancer
4      No Cancer
         ...    
304       Cancer
305       Cancer
306       Cancer
307       Cancer
308       Cancer
Name: LUNG_CANCER, Length: 309, dtype: object


In [8]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [9]:
print(X.shape, X_train.shape, X_test.shape)

(309, 15) (247, 15) (62, 15)


In [10]:
model = LogisticRegression()

In [11]:
model.fit(X_train, Y_train)

In [12]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [13]:
print ('Training data accuracy :', training_data_accuracy)

Training data accuracy : 0.9554655870445344


In [14]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [15]:
print('Testing Data accuracy :' , test_data_accuracy)

Testing Data accuracy : 0.8548387096774194


In [16]:
input_data = (1,69,1,2,2,1,1,2,1,2,2,2,2,2,2) 
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]==2):
       print('Patient Not Affected By Lung Cancer')
else: 
    print('Patient Affected By Lung Cancer')


['Cancer']
Patient Affected By Lung Cancer


In [17]:
input_data = (2,68,2,1,2,1,1,2,1,1,1,1,1,1,1) 
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]==2):
       print(' Patient Not Affected By Lung Cancer')
else: 
    print(' Patient Affected By Lung Cancer')


['No Cancer']
 Patient Affected By Lung Cancer


In [18]:
import pickle

In [19]:
filename = 'lung_cancer.sav'
pickle.dump(model, open(filename, 'wb'))