# XGBoost baseline using only tabular data

In [66]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

#### Read all the tabular data from the participants and only keep the most relevant information

In [67]:
# Participant-level table; the path has no directory part, so it is resolved
# against the current working directory.
file_path = 'participant_d040722.csv'
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types
# (the situation pandas reports as a DtypeWarning).
df = pd.read_csv(file_path, low_memory=False)

  df = pd.read_csv(file_path)


In [68]:
# Columns kept from the participant table, grouped by theme.
_core_cols = ['gender', 'age', 'pkyr', 'weight', 'age_quit']
# The two columns later used as classification targets.
_outcome_cols = ['lung_cancer', 'cancyr']
_smoking_cols = ['cigsmok', 'smokelive', 'smokework', 'smokeyr', 'smokeage', 'smokeday']
# Family-history columns ('fambrother', 'famchild', 'famsister', 'famfather',
# 'fammother') are currently left out.
_disease_cols = [
    'ageadas', 'ageasbe', 'agebron', 'agechas', 'agechro', 'agecopd',
    'agediab', 'ageemph', 'agefibr', 'agehear', 'agehype', 'agepneu',
    'agesarc', 'agesili', 'agestro', 'agetube',
]
features = _core_cols + _outcome_cols + _smoking_cols + _disease_cols

# Work on an independent copy so later edits never touch the raw table.
df_selected = df.loc[:, features].copy()

# One row per selected column: descriptive statistics, plus the number of
# missing entries and the column dtype.
summary = df_selected.describe(include='all').T
summary['missing_values'] = df_selected.isna().sum()
summary['data_type'] = df_selected.dtypes

summary

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,missing_values,data_type
gender,53452.0,1.410125,0.491861,1.0,1.0,1.0,2.0,2.0,0,int64
age,53452.0,61.417926,5.023972,43.0,57.0,60.0,65.0,79.0,0,int64
pkyr,53452.0,55.974973,23.928785,15.0,39.0,48.0,66.0,567.6,0,float64
weight,53124.0,183.457289,39.404345,75.0,155.0,180.0,208.0,446.0,328,float64
age_quit,27528.0,54.706081,6.43489,11.0,50.0,55.0,59.0,74.0,25924,float64
lung_cancer,53452.0,0.038502,0.192406,0.0,0.0,0.0,0.0,1.0,0,int64
cancyr,2058.0,2.397473,1.970229,0.0,1.0,2.0,4.0,7.0,51394,float64
cigsmok,53452.0,0.481928,0.499678,0.0,0.0,0.0,1.0,1.0,0,int64
smokelive,53104.0,0.875301,0.33038,0.0,1.0,1.0,1.0,1.0,348,float64
smokework,52999.0,0.85979,0.347208,0.0,1.0,1.0,1.0,1.0,453,float64


In [69]:
# --- Missing-value handling -------------------------------------------------
# Every fill below assigns the result back to the column. Calling
# `fillna(..., inplace=True)` on `df_selected.loc[:, col]` only mutates an
# intermediate Series: it is deprecated chained assignment and has no effect
# on the DataFrame once pandas Copy-on-Write is active.

# Missing weight values are replaced with the column median.
weight_imputer = SimpleImputer(strategy='median')
# fit_transform returns an (n_rows, 1) array; flatten it to one column.
df_selected['weight'] = weight_imputer.fit_transform(df_selected[['weight']]).ravel()

# Missing age_quit is encoded with the sentinel value -1.
# NOTE(review): presumably "missing" means the participant never quit -- confirm.
df_selected['age_quit'] = df_selected['age_quit'].fillna(-1)

# The family-history columns are not part of the selected features, so there
# is nothing to impute for them here.

disease = ['ageadas', 'ageasbe', 'agebron', 'agechas', 'agechro', 'agecopd', 'agediab', 'ageemph', 'agefibr', 'agehear', 'agehype', 'agepneu', 'agesarc', 'agesili', 'agestro', 'agetube']

smoke = ['smokelive', 'smokework', 'smokeage']

# Missing disease values are replaced with 0, which indicates no disease;
# the missing smoking-exposure values are filled with 0 as well.
zero_fill_cols = disease + smoke
df_selected[zero_fill_cols] = df_selected[zero_fill_cols].fillna(0)

# --- Recode cancyr into a binary label ---------------------------------------
# cancyr < 4 becomes 1, cancyr > 3 becomes 0, and missing values stay NaN for
# now. Both masks are computed before any assignment, so the second one is
# evaluated on the raw values.
# NOTE(review): cancyr presumably is the study year of diagnosis -- confirm.
cancer_t3 = df['cancyr'].copy()
mask_t3 = cancer_t3 < 4
mask_t3_larger = cancer_t3 > 3
cancer_t3.loc[mask_t3] = 1
cancer_t3.loc[mask_t3_larger] = 0
# Number of participants without any cancyr entry.
cancer_negative = cancer_t3.isnull().sum()

df_selected['cancyr'] = cancer_t3
# Drop the rows recoded to 0 (cancyr > 3). Rows with NaN are kept because
# NaN != 0 evaluates to True. `.copy()` makes the filtered frame independent
# so the assignment below is not a write to a slice of the old frame.
df_selected = df_selected[df_selected['cancyr'] != 0].copy()
# The remaining NaN rows (no cancyr entry) become the negative class 0.
df_selected['cancyr'] = df_selected['cancyr'].fillna(0)

In [70]:
# Class balance of the recoded cancyr label.
cancyr_counts = df_selected['cancyr'].value_counts()
cancyr_counts

cancyr
0.0    51394
1.0     1376
Name: count, dtype: int64

#### Undersample the negative cases so that the numbers of positive and negative cases are equal

In [71]:
# Switch between the two targets: True -> recoded 'cancyr' label,
# False -> 'lung_cancer'.
t3 = False

target_col = 'cancyr' if t3 else 'lung_cancer'

# Number of rows carrying the positive label (1) in the chosen target.
positive_cancer = df_selected[target_col].value_counts().loc[1]

# Keep every positive row and draw an equally sized random sample of the
# negative rows (fixed seed so the sample is repeatable).
positive_cases = df_selected[df_selected[target_col] == 1]
negative_cases = df_selected[df_selected[target_col] == 0].sample(
    n=positive_cancer, random_state=1
)

# Negatives first, then positives.
df_selected = pd.concat([negative_cases, positive_cases], axis=0)

In [72]:
# Descriptive statistics of the balanced data set, one row per column.
df_selected.describe(include='all').T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
gender,2752.0,1.397892,0.489552,1.0,1.0,1.0,2.0,2.0
age,2752.0,62.53016,5.275662,43.0,58.0,62.0,66.0,75.0
pkyr,2752.0,60.075799,25.186398,29.25,42.0,52.5,74.0,232.0
weight,2752.0,179.889172,38.684057,78.0,153.0,178.0,203.0,390.0
age_quit,2752.0,25.647892,28.842556,-1.0,-1.0,-1.0,55.0,74.0
lung_cancer,2752.0,0.5,0.500091,0.0,0.0,0.5,1.0,1.0
cancyr,2752.0,0.5,0.500091,0.0,0.0,0.5,1.0,1.0
cigsmok,2752.0,0.530523,0.499158,0.0,0.0,1.0,1.0,1.0
smokelive,2752.0,0.873183,0.332828,0.0,1.0,1.0,1.0,1.0
smokework,2752.0,0.858285,0.348821,0.0,1.0,1.0,1.0,1.0


#### Training the XGBoost classifier

In [73]:
# The target follows the same t3 switch that selected the column used for
# class balancing.
y = df_selected['cancyr'] if t3 else df_selected['lung_cancer']

# Both outcome columns are removed from the predictors, whichever one is
# the target.
X = df_selected.drop(columns=['lung_cancer', 'cancyr'])

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.head()

Unnamed: 0,gender,age,pkyr,weight,age_quit,cigsmok,smokelive,smokework,smokeyr,smokeage,...,agediab,ageemph,agefibr,agehear,agehype,agepneu,agesarc,agesili,agestro,agetube
8999,1,73,53.0,180.0,-1.0,1,1.0,0.0,53,20.0,...,0.0,0.0,0.0,53.0,0.0,7.0,0.0,0.0,0.0,0.0
16802,1,72,54.0,170.0,-1.0,1,1.0,1.0,54,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48329,2,57,30.0,101.0,-1.0,1,1.0,1.0,40,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32817,2,66,35.0,160.0,60.0,0,1.0,1.0,35,15.0,...,0.0,0.0,0.0,0.0,0.0,19.0,0.0,0.0,0.0,0.0
4193,1,71,55.0,218.0,-1.0,1,1.0,0.0,55,16.0,...,0.0,0.0,0.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
# XGBoost classifier evaluated with log-loss; all other hyper-parameters
# stay at the library defaults. The `use_label_encoder` argument is no longer
# passed: it is deprecated, and recent xgboost releases ignore it with a
# "parameters are not used" warning.
xgb_model = XGBClassifier(eval_metric='logloss')

# Fit on the training split and predict hard class labels for the test split.
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

# Overall accuracy plus the per-class precision / recall / F1 report.
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [75]:
# Show the per-class precision / recall / F1 report for the test split.
print(class_report)

              precision    recall  f1-score   support

           0       0.63      0.57      0.60       283
           1       0.59      0.65      0.61       268

    accuracy                           0.61       551
   macro avg       0.61      0.61      0.61       551
weighted avg       0.61      0.61      0.61       551



In [76]:
# Show the overall accuracy on the test split.
print(accuracy)

0.6061705989110708
