# Cancer Detection

### Import dependencies

In [96]:
from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [97]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

### Data Loading

In [98]:
data_df = pd.read_csv('data.csv')
data_df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,1,2,1,2,2,1,1,NO


The effectiveness of cancer prediction system helps the people to know their cancer risk with low cost and it also helps the people to take the appropriate decision based on their cancer risk status. The data is collected from the website online lung cancer prediction system .

**Attribute information:**

---



1. **Gender:** M(male), F(female)
2. **Age:** Age of the patient
3. **Smoking:** YES=2 , NO=1.
4. **Yellow fingers:** YES=2 , NO=1.
5. **Anxiety:** YES=2 , NO=1.
6. **Peer_pressure:** YES=2 , NO=1.
7. **Chronic Disease:** YES=2 , NO=1.
8. **Fatigue:** YES=2 , NO=1.
9. **Allergy:** YES=2 , NO=1.
10. **Wheezing:** YES=2 , NO=1.
11. **Alcohol:** YES=2 , NO=1.
12. **Coughing:** YES=2 , NO=1.
13. **Shortness of Breath:** YES=2 , NO=1.
14. **Swallowing Difficulty:** YES=2 , NO=1.
15. **Chest pain:** YES=2 , NO=1.
16. **Lung Cancer:** YES , NO.

In [99]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309 entries, 0 to 308
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   GENDER                 309 non-null    object
 1   AGE                    309 non-null    int64 
 2   SMOKING                309 non-null    int64 
 3   YELLOW_FINGERS         309 non-null    int64 
 4   ANXIETY                309 non-null    int64 
 5   PEER_PRESSURE          309 non-null    int64 
 6   CHRONIC DISEASE        309 non-null    int64 
 7   FATIGUE                309 non-null    int64 
 8   ALLERGY                309 non-null    int64 
 9   WHEEZING               309 non-null    int64 
 10  ALCOHOL CONSUMING      309 non-null    int64 
 11  COUGHING               309 non-null    int64 
 12  SHORTNESS OF BREATH    309 non-null    int64 
 13  SWALLOWING DIFFICULTY  309 non-null    int64 
 14  CHEST PAIN             309 non-null    int64 
 15  LUNG_CANCER            

In [100]:
columns_names = data_df.columns
print(columns_names)

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY',
       'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'LUNG_CANCER'],
      dtype='object')


In [101]:
data_df.describe()

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
count,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0,309.0
mean,62.673139,1.563107,1.569579,1.498382,1.501618,1.504854,1.673139,1.556634,1.556634,1.556634,1.579288,1.640777,1.469256,1.556634
std,8.210301,0.496806,0.495938,0.500808,0.500808,0.500787,0.469827,0.497588,0.497588,0.497588,0.494474,0.480551,0.499863,0.497588
min,21.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,57.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,62.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,2.0
75%,69.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
max,87.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


### Data Transformation

In [102]:
data_df.rename(columns={"FATIGUE ": "FATIGUE"}, inplace=True)

In [103]:
data_df.rename(columns={"ALLERGY ": "ALLERGY"}, inplace=True)

In [104]:
data_df.drop(columns=["ANXIETY"],  inplace=True)
data_df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,M,69,1,2,1,1,2,1,2,2,2,2,2,2,YES
1,M,74,2,1,1,2,2,2,1,1,1,2,2,2,YES
2,F,59,1,1,2,1,2,1,2,1,2,2,1,2,NO
3,M,63,2,2,1,1,1,1,1,2,1,1,2,2,NO
4,F,63,1,2,1,1,1,1,2,1,2,2,1,1,NO


In [105]:
# rename LUNG_CANCER to target column with yes to 1 and no to 0
data_df.rename(columns={"LUNG_CANCER": "target"}, inplace=True)
data_df["target"] = data_df["target"].map({"YES": 1, "NO": 0})
data_df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,target
0,M,69,1,2,1,1,2,1,2,2,2,2,2,2,1
1,M,74,2,1,1,2,2,2,1,1,1,2,2,2,1
2,F,59,1,1,2,1,2,1,2,1,2,2,1,2,0
3,M,63,2,2,1,1,1,1,1,2,1,1,2,2,0
4,F,63,1,2,1,1,1,1,2,1,2,2,1,1,0


In [106]:
# converting gender male to 1 and female to 0
data_df["GENDER"] = data_df["GENDER"].map({"M": 1, "F": 0})
data_df["SMOKING"] = data_df["SMOKING"].map({1: 0, 2: 1})
data_df["YELLOW_FINGERS"] = data_df["YELLOW_FINGERS"].map({1: 0, 2: 1})
data_df["PEER_PRESSURE"] = data_df["PEER_PRESSURE"].map({1: 0, 2: 1})

data_df["CHRONIC DISEASE"] = data_df["CHRONIC DISEASE"].map({1: 0, 2: 1})
data_df["FATIGUE"] = data_df["FATIGUE"].map({1: 0, 2: 1})
data_df["ALLERGY"] = data_df["ALLERGY"].map({1: 0, 2: 1})
data_df["WHEEZING"] = data_df["WHEEZING"].map({1: 0, 2: 1})

data_df["ALCOHOL CONSUMING"] = data_df["ALCOHOL CONSUMING"].map({1: 0, 2: 1})
data_df["COUGHING"] = data_df["COUGHING"].map({1: 0, 2: 1})
data_df["SHORTNESS OF BREATH"] = data_df["SHORTNESS OF BREATH"].map({
                                                                    1: 0, 2: 1})
data_df["SWALLOWING DIFFICULTY"] = data_df["SWALLOWING DIFFICULTY"].map({
                                                                        1: 0, 2: 1})
data_df["CHEST PAIN"] = data_df["CHEST PAIN"].map({1: 0, 2: 1})
data_df.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,target
0,1,69,0,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,1,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,0,0,0,0,0,1,0,0,1,1,0
4,0,63,0,1,0,0,0,0,1,0,1,1,0,0,0


### Data Modelling

In [107]:
x = data_df.drop(columns=['target'])
y = data_df['target']

In [108]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

In [110]:
xgb_classifier = XGBClassifier()
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

In [111]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9838709677419355


In [112]:
# classification_report
print("classification_report")
print(classification_report(y_test, y_pred))

classification_report
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.98      1.00      0.99        60

    accuracy                           0.98        62
   macro avg       0.99      0.75      0.83        62
weighted avg       0.98      0.98      0.98        62



In [113]:
data_df.columns

Index(['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'PEER_PRESSURE',
       'CHRONIC DISEASE', 'FATIGUE', 'ALLERGY', 'WHEEZING',
       'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH',
       'SWALLOWING DIFFICULTY', 'CHEST PAIN', 'target'],
      dtype='object')

In [114]:
# make new csv file with the new data
data_df.to_csv('new_data.csv', index=False)
print("new_data.csv created")

new_data.csv created


In [115]:
data= pd.read_csv('new_data.csv')
data.head()

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,target
0,1,69,0,1,0,0,1,0,1,1,1,1,1,1,1
1,1,74,1,0,0,1,1,1,0,0,0,1,1,1,1
2,0,59,0,0,1,0,1,0,1,0,1,1,0,1,0
3,1,63,1,1,0,0,0,0,0,1,0,0,1,1,0
4,0,63,0,1,0,0,0,0,1,0,1,1,0,0,0


In [116]:
# showing number of unique values in each column with values what it is
for column in data.columns:
    print(column, data[column].unique())

GENDER [1 0]
AGE [69 74 59 63 75 52 51 68 53 61 72 60 58 48 57 44 64 21 65 55 62 56 67 77
 70 54 49 73 47 71 66 76 78 81 79 38 39 87 46]
SMOKING [0 1]
YELLOW_FINGERS [1 0]
PEER_PRESSURE [0 1]
CHRONIC DISEASE [0 1]
FATIGUE [1 0]
ALLERGY [0 1]
WHEEZING [1 0]
ALCOHOL CONSUMING [1 0]
COUGHING [1 0]
SHORTNESS OF BREATH [1 0]
SWALLOWING DIFFICULTY [1 0]
CHEST PAIN [1 0]
target [1 0]


In [117]:
x = data.drop(columns=['target'])
y = data['target']

In [118]:
model = XGBClassifier()
model.fit(x, y)
print("model created")

model created


In [119]:
def predict_new_data(data_of_new_patient):
    data_of_new_patient = np.array(data_of_new_patient).reshape(1, -1)
    prediction = model.predict(data_of_new_patient)
    return prediction[0]

In [120]:
# Example dummy data
data_of_new_patient = [1, 21, 2, 2, 1, 2, 2, 0, 2, 1, 0, 2, 1, 1]

# Call the predict_new_data function
prediction = predict_new_data(data_of_new_patient)

# Print the prediction
print("Prediction:", prediction)

Prediction: 1
