In [2]:
import pandas as pd
import kagglehub
import os
from sklearn.preprocessing import OneHotEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Download the dataset through kagglehub and show the directory it was stored in.
dataset_dir = kagglehub.dataset_download("khwaishsaxena/lung-cancer-dataset")
print(dataset_dir)

# The dataset directory holds a single CSV; `path` ends up pointing at that file.
path = os.path.join(dataset_dir, "Lung Cancer.csv")
df = pd.read_csv(path)

/home/bruno/.cache/kagglehub/datasets/khwaishsaxena/lung-cancer-dataset/versions/1


In [4]:
# Size of the freshly loaded frame as (rows, columns).
df.shape

(890000, 17)

In [5]:
# Per-column dtype and non-null count, to spot text columns that need encoding
# and columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890000 entries, 0 to 889999
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  890000 non-null  int64  
 1   age                 890000 non-null  float64
 2   gender              890000 non-null  object 
 3   country             890000 non-null  object 
 4   diagnosis_date      890000 non-null  object 
 5   cancer_stage        890000 non-null  object 
 6   family_history      890000 non-null  object 
 7   smoking_status      890000 non-null  object 
 8   bmi                 890000 non-null  float64
 9   cholesterol_level   890000 non-null  int64  
 10  hypertension        890000 non-null  int64  
 11  asthma              890000 non-null  int64  
 12  cirrhosis           890000 non-null  int64  
 13  other_cancer        890000 non-null  int64  
 14  treatment_type      890000 non-null  object 
 15  end_treatment_date  890000 non-nul

In [6]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,id,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,1,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,2,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,3,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,4,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,5,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [7]:
# `id` is only a row identifier, so it is removed before further processing.
# errors="ignore" keeps the cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns="id", errors="ignore")
df.head()

Unnamed: 0,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,64.0,Male,Sweden,2016-04-05,Stage I,Yes,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,50.0,Female,Netherlands,2023-04-20,Stage III,Yes,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,65.0,Female,Hungary,2023-04-05,Stage III,Yes,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,51.0,Female,Belgium,2016-02-05,Stage I,No,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,37.0,Male,Luxembourg,2023-11-29,Stage I,No,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [8]:
# Encode family history as a binary flag: 'Yes' -> 1, 'No' -> 0.
family_history_codes = {'Yes': 1, 'No': 0}

# Series.map turns every value that is not a key of the dict into NaN, so running
# this cell a second time on the already-encoded column would silently wipe it.
# Only apply the mapping while the column still holds the raw labels.
if not df['family_history'].isin(list(family_history_codes.values())).all():
    df['family_history'] = df['family_history'].map(family_history_codes)

# Fail loudly if any label fell outside the mapping instead of carrying NaN forward.
assert df['family_history'].notna().all(), "family_history has values other than 'Yes'/'No'"
df.head()

Unnamed: 0,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,64.0,Male,Sweden,2016-04-05,Stage I,1,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,50.0,Female,Netherlands,2023-04-20,Stage III,1,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,65.0,Female,Hungary,2023-04-05,Stage III,1,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,51.0,Female,Belgium,2016-02-05,Stage I,0,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,37.0,Male,Luxembourg,2023-11-29,Stage I,0,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [9]:
# Cancer stage is ordinal, so it is encoded as an integer rank (Stage I -> 1 ... Stage IV -> 4).
stage_mapping = {'Stage I': 1, 'Stage II': 2, 'Stage III': 3, 'Stage IV': 4}

# Series.map turns every value that is not a key of the dict into NaN, so running
# this cell a second time on the already-encoded column would silently wipe it.
# Only apply the mapping while the column still holds the raw labels.
if not df['cancer_stage'].isin(list(stage_mapping.values())).all():
    df['cancer_stage'] = df['cancer_stage'].map(stage_mapping)

# Fail loudly if any label fell outside the mapping instead of carrying NaN forward.
assert df['cancer_stage'].notna().all(), "cancer_stage has values outside stage_mapping"
df.head()

Unnamed: 0,age,gender,country,diagnosis_date,cancer_stage,family_history,smoking_status,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,treatment_type,end_treatment_date,survived
0,64.0,Male,Sweden,2016-04-05,1,1,Passive Smoker,29.4,199,0,0,1,0,Chemotherapy,2017-09-10,0
1,50.0,Female,Netherlands,2023-04-20,3,1,Passive Smoker,41.2,280,1,1,0,0,Surgery,2024-06-17,1
2,65.0,Female,Hungary,2023-04-05,3,1,Former Smoker,44.0,268,1,1,0,0,Combined,2024-04-09,0
3,51.0,Female,Belgium,2016-02-05,1,0,Passive Smoker,43.0,241,1,1,0,0,Chemotherapy,2017-04-23,0
4,37.0,Male,Luxembourg,2023-11-29,1,0,Passive Smoker,19.7,178,0,0,0,0,Combined,2025-01-08,0


In [10]:
# One-hot encode the nominal text columns. drop='first' leaves out the first
# category of each column, and sparse_output=False returns a dense array.
columns_to_encode = ["gender", "country", "smoking_status", "treatment_type"]
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded = encoder.fit_transform(df[columns_to_encode])

# Wrap the array in a frame that shares df's index so the rows line up when recombined.
encoded_df = pd.DataFrame(
    encoded,
    index=df.index,
    columns=encoder.get_feature_names_out(columns_to_encode),
)

# Replace the raw categorical columns with their indicator columns.
df = df.drop(columns=columns_to_encode).join(encoded_df)
df.head()


Unnamed: 0,age,diagnosis_date,cancer_stage,family_history,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,...,country_Slovakia,country_Slovenia,country_Spain,country_Sweden,smoking_status_Former Smoker,smoking_status_Never Smoked,smoking_status_Passive Smoker,treatment_type_Combined,treatment_type_Radiation,treatment_type_Surgery
0,64.0,2016-04-05,1,1,29.4,199,0,0,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,50.0,2023-04-20,3,1,41.2,280,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,65.0,2023-04-05,3,1,44.0,268,1,1,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,51.0,2016-02-05,1,0,43.0,241,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,37.0,2023-11-29,1,0,19.7,178,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [11]:
# The two date columns are still raw strings and are not used further, so drop them.
# Rebinding (rather than inplace=True) plus errors="ignore" keeps the cell re-runnable:
# a second execution is a no-op instead of raising KeyError on the missing columns.
df = df.drop(columns=['diagnosis_date', 'end_treatment_date'], errors="ignore")
df.head()

Unnamed: 0,age,cancer_stage,family_history,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,survived,...,country_Slovakia,country_Slovenia,country_Spain,country_Sweden,smoking_status_Former Smoker,smoking_status_Never Smoked,smoking_status_Passive Smoker,treatment_type_Combined,treatment_type_Radiation,treatment_type_Surgery
0,64.0,1,1,29.4,199,0,0,1,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,50.0,3,1,41.2,280,1,1,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,65.0,3,1,44.0,268,1,1,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,51.0,1,0,43.0,241,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,37.0,1,0,19.7,178,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [12]:
# Summary statistics for every column of the encoded frame, as a sanity check
# on value ranges after encoding.
df.describe()


Unnamed: 0,age,cancer_stage,family_history,bmi,cholesterol_level,hypertension,asthma,cirrhosis,other_cancer,survived,...,country_Slovakia,country_Slovenia,country_Spain,country_Sweden,smoking_status_Former Smoker,smoking_status_Never Smoked,smoking_status_Passive Smoker,treatment_type_Combined,treatment_type_Radiation,treatment_type_Surgery
count,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,...,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0,890000.0
mean,55.007008,2.500148,0.499797,30.494172,233.633916,0.750024,0.46974,0.225956,0.088157,0.220229,...,0.036913,0.037084,0.037126,0.03726,0.249642,0.250282,0.250753,0.250122,0.248166,0.250855
std,9.994485,1.118078,0.5,8.368539,43.432278,0.432999,0.499084,0.418211,0.283524,0.414401,...,0.18855,0.188968,0.18907,0.189397,0.432806,0.433176,0.433447,0.433084,0.431949,0.433505
min,4.0,1.0,0.0,16.0,150.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,1.0,0.0,23.3,196.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,55.0,3.0,0.0,30.5,242.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,62.0,4.0,1.0,37.7,271.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
max,104.0,4.0,1.0,45.0,300.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [13]:
# Count missing values per column; any non-zero entry means an encoding step left NaN behind.
df.isna().sum()

age                              0
cancer_stage                     0
family_history                   0
bmi                              0
cholesterol_level                0
hypertension                     0
asthma                           0
cirrhosis                        0
other_cancer                     0
survived                         0
gender_Male                      0
country_Belgium                  0
country_Bulgaria                 0
country_Croatia                  0
country_Cyprus                   0
country_Czech Republic           0
country_Denmark                  0
country_Estonia                  0
country_Finland                  0
country_France                   0
country_Germany                  0
country_Greece                   0
country_Hungary                  0
country_Ireland                  0
country_Italy                    0
country_Latvia                   0
country_Lithuania                0
country_Luxembourg               0
country_Malta       

In [14]:
# Number of rows that exactly repeat an earlier row.
# NOTE(review): `id` and both date columns were dropped above, so identical rows here
# are not necessarily repeated records — confirm against the raw data before dropping them.
df.duplicated().sum()

np.int64(8)

In [15]:
# Persist the encoded frame. The index is just the default RangeIndex, so it is left out;
# otherwise it is written as an unnamed first column that shows up as "Unnamed: 0"
# when the file is read back.
df.to_csv("Cleaned_Lung_Cancer.csv", index=False)