#Cancer Detaction Model

In [6]:
import numpy as np
import pandas as pd

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [8]:
dataframe = pd.read_csv('survey lung cancer.csv')

In [9]:
dataframe.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


In [10]:
dataframe['LUNG_CANCER'].replace({'YES': 1, 'NO': 0}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataframe['LUNG_CANCER'].replace({'YES': 1, 'NO': 0}, inplace=True)
  dataframe['LUNG_CANCER'].replace({'YES': 1, 'NO': 0}, inplace=True)


In [11]:
y=dataframe['LUNG_CANCER']

In [12]:
y

0      1
1      1
2      0
3      0
4      0
      ..
304    1
305    1
306    1
307    1
308    1
Name: LUNG_CANCER, Length: 309, dtype: int64

In [13]:
dataframe.isnull().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

In [14]:
x=dataframe.iloc[:,:-1]

In [15]:
y

0      1
1      1
2      0
3      0
4      0
      ..
304    1
305    1
306    1
307    1
308    1
Name: LUNG_CANCER, Length: 309, dtype: int64

In [16]:
print(y)

0      1
1      1
2      0
3      0
4      0
      ..
304    1
305    1
306    1
307    1
308    1
Name: LUNG_CANCER, Length: 309, dtype: int64


In [17]:
y

0      1
1      1
2      0
3      0
4      0
      ..
304    1
305    1
306    1
307    1
308    1
Name: LUNG_CANCER, Length: 309, dtype: int64

In [18]:
x

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304,F,56,1,1,1,2,2,2,1,1,2,2,2,2,1
305,M,70,2,1,1,1,1,2,2,2,2,2,2,1,2
306,M,58,2,1,1,1,1,1,2,2,2,2,1,1,2
307,M,67,2,1,2,1,1,2,2,1,2,2,2,1,2


In [19]:
y.head()

0    1
1    1
2    0
3    0
4    0
Name: LUNG_CANCER, dtype: int64

In [20]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=43)

In [21]:
y_train

191    1
17     1
36     1
233    1
141    1
      ..
16     1
58     1
277    0
305    1
255    1
Name: LUNG_CANCER, Length: 247, dtype: int64

In [22]:
categorical_features=['GENDER']

In [23]:
categorical_transformer=Pipeline(
    steps=[('One_Hot_Encoder',OneHotEncoder(handle_unknown='ignore'))]
)

In [24]:
preprocessing = ColumnTransformer(
    transformers=[('cat',categorical_transformer,
                   categorical_features)])

In [25]:
tf1 = ColumnTransformer([('scale',MinMaxScaler(),slice(0,18))])

In [26]:
tf2=SelectKBest(score_func=chi2,k='all')

In [29]:
tf3=LogisticRegression()

In [30]:
pipe=Pipeline([('preprocessing',preprocessing),
               ('tf1',tf1),
               ('tf2',tf2),
               ('tf3',tf3)
               ])

In [31]:
pipe.fit(x_train,y_train)

In [32]:
y_pred=pipe.predict(x_test)

In [33]:
print(y_pred)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [35]:
accuracy_score(y_test,y_pred)

0.8709677419354839

In [36]:
import pickle

In [37]:
pickle.dump(pipe,open('pipe.pkl','wb'))

In [38]:
pipe = pickle.load(open('pipe.pkl','rb'))

In [42]:
input_2=pd.DataFrame([['F',59,1,1,1,2,1,2,1,2,1,2,2,1,2]],columns=['GENDER','AGE','SMOKING',
                                                                 'YELLOW_FINGERS','ANXIETY','PEER_PRESSURE','CHRONIC DISEASE'
                                                                 ,'FATIGUE','ALLERGY','WHEEZING','ALCOHOL CONSUMING','COUGHING'
                                                                 ,'SHORTNESS OF BREATH','SWALLOWING DIFFICULTY','CHEST PAIN'])

In [44]:
pipe.predict(input_2)

array([1], dtype=int64)