In [1]:
import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')


In [2]:
## load dataset
df = pd.read_csv('../Dataset/Lung_Cancer_Dataset_med.csv')

In [3]:
df.shape

(890000, 17)

In [4]:
df.sample(4)

Unnamed: 0.1,Unnamed: 0,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
41372,41373,55,Female,Ireland,10/18/2015,Stage I,Yes,Former Smoker,28.9,169,1,1,0,0,Radiation,12/26/2016,1
542108,542109,37,Female,Spain,6/3/2017,Stage III,No,Current Smoker,18.1,188,1,1,0,1,Chemotherapy,11/1/2018,0
625290,625291,42,Male,Germany,10/23/2023,Stage II,Yes,Current Smoker,35.3,286,0,0,0,0,Radiation,9/22/2025,0
769189,769190,44,Female,Slovenia,4/1/2016,Stage IV,Yes,Passive Smoker,30.5,266,1,0,0,0,Chemotherapy,10/8/2016,0


In [5]:
## check null value
df.isna().sum()

Unnamed: 0            0
age                   0
gender                0
country               0
diagnosis_date        0
cancer_stage          0
family_history        0
smoking_status        0
bmi                   0
cholesterol_level     0
hypertension          0
asthma                0
cirrhosis             0
other_cancer          0
treatment_type        0
end_treatment_date    0
survived              0
dtype: int64

In [6]:
# Fill missing numerics with median
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# Fill missing categoricals with mode
df['smoking_status'] = df['smoking_status'].fillna(df['smoking_status'].mode()[0])


In [7]:
df.bmi

0         29.4
1         41.2
2         44.0
3         43.0
4         19.7
          ... 
889995    44.8
889996    21.6
889997    38.6
889998    18.6
889999    42.8
Name: bmi, Length: 890000, dtype: float64

In [8]:
df.smoking_status

0         Passive Smoker
1         Passive Smoker
2          Former Smoker
3         Passive Smoker
4         Passive Smoker
               ...      
889995    Passive Smoker
889996     Former Smoker
889997      Never Smoked
889998     Former Smoker
889999    Current Smoker
Name: smoking_status, Length: 890000, dtype: object

In [9]:
df

Unnamed: 0.1,Unnamed: 0,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64,Male,Sweden,4/5/2016,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,9/10/2017,0
1,2,50,Female,Netherlands,4/20/2023,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,6/17/2024,1
2,3,65,Female,Hungary,4/5/2023,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,4/9/2024,0
3,4,51,Female,Belgium,2/5/2016,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,4/23/2017,0
4,5,37,Male,Luxembourg,11/29/2023,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,1/8/2025,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889995,889996,40,Male,Malta,7/1/2022,Stage IV,No,Passive Smoker,44.8,243,1,1,1,0,Radiation,2/23/2023,0
889996,889997,62,Female,Cyprus,9/27/2015,Stage II,Yes,Former Smoker,21.6,240,0,0,0,0,Surgery,6/19/2017,0
889997,889998,48,Female,Estonia,3/27/2016,Stage III,Yes,Never Smoked,38.6,242,1,0,0,0,Combined,1/23/2017,1
889998,889999,67,Female,Slovakia,12/22/2015,Stage IV,Yes,Former Smoker,18.6,194,1,1,0,0,Chemotherapy,12/12/2017,0


In [10]:
# Drop Unnamed column
df.drop(columns=['Unnamed: 0'], inplace=True)

In [11]:
## Encode Categorical Variables
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()
label_cols = df_encoded.select_dtypes(include='object').columns

label_encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le


In [12]:
df_encoded

Unnamed: 0,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,64,1,26,2074,0,1,3,29.4,199,0,0,1,0,0,3877,0
1,50,0,19,1951,2,1,3,41.2,280,1,1,0,0,3,2949,1
2,65,0,12,2081,2,1,1,44.0,268,1,1,0,0,1,2480,0
3,51,0,1,1464,0,0,3,43.0,241,1,1,0,0,0,2305,0
4,37,1,17,839,0,0,3,19.7,178,0,0,0,0,1,358,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889995,40,1,18,2739,3,0,3,44.8,243,1,1,1,0,2,1600,0
889996,62,0,4,3542,1,1,1,21.6,240,0,0,0,0,3,2964,0
889997,48,0,7,1704,2,1,2,38.6,242,1,0,0,0,1,182,1
889998,67,0,23,1061,3,1,1,18.6,194,1,1,0,0,0,1081,0


In [13]:
# ## Handle Outliers 
# # Remove outliers in BMI > 60
# df_encoded = df_encoded[df_encoded['bmi'] < 60]

# # Or clip cholesterol to 1st-99th percentile
# df_encoded['cholesterol_level'] = df_encoded['cholesterol_level'].clip(lower=df_encoded['cholesterol_level'].quantile(0.01),
#                                                                        upper=df_encoded['cholesterol_level'].quantile(0.99))


In [14]:
## Feature Engineering
df_encoded['diagnosis_date'] = pd.to_datetime(df_encoded['diagnosis_date'])
df_encoded['end_treatment_date'] = pd.to_datetime(df_encoded['end_treatment_date'])

df_encoded['treatment_duration_days'] = (df_encoded['end_treatment_date'] - df_encoded['diagnosis_date']).dt.days

# Drop original date columns
df_encoded.drop(columns=['diagnosis_date', 'end_treatment_date'], inplace=True)


In [15]:
df_encoded

Unnamed: 0,age,gender,country,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived,treatment_duration_days
0,64,1,26,0,1,3,29.4,199,0,0,1,0,0,0,0
1,50,0,19,2,1,3,41.2,280,1,1,0,0,3,1,0
2,65,0,12,2,1,1,44.0,268,1,1,0,0,1,0,0
3,51,0,1,0,0,3,43.0,241,1,1,0,0,0,0,0
4,37,1,17,0,0,3,19.7,178,0,0,0,0,1,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889995,40,1,18,3,0,3,44.8,243,1,1,1,0,2,0,-1
889996,62,0,4,1,1,1,21.6,240,0,0,0,0,3,0,-1
889997,48,0,7,2,1,2,38.6,242,1,0,0,0,1,1,-1
889998,67,0,23,3,1,1,18.6,194,1,1,0,0,0,0,0


In [16]:
## save df_encoded for ml
df_encoded.to_csv('df_encoded.csv', index=False)


In [17]:
df_encoded

Unnamed: 0,age,gender,country,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,survived,treatment_duration_days
0,64,1,26,0,1,3,29.4,199,0,0,1,0,0,0,0
1,50,0,19,2,1,3,41.2,280,1,1,0,0,3,1,0
2,65,0,12,2,1,1,44.0,268,1,1,0,0,1,0,0
3,51,0,1,0,0,3,43.0,241,1,1,0,0,0,0,0
4,37,1,17,0,0,3,19.7,178,0,0,0,0,1,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
889995,40,1,18,3,0,3,44.8,243,1,1,1,0,2,0,-1
889996,62,0,4,1,1,1,21.6,240,0,0,0,0,3,0,-1
889997,48,0,7,2,1,2,38.6,242,1,0,0,0,1,1,-1
889998,67,0,23,3,1,1,18.6,194,1,1,0,0,0,0,0
