In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/competitions/playground-series-s6e2/sample_submission.csv
/kaggle/input/competitions/playground-series-s6e2/train.csv
/kaggle/input/competitions/playground-series-s6e2/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/competitions/playground-series-s6e2/train.csv')
test = pd.read_csv('/kaggle/input/competitions/playground-series-s6e2/test.csv')

In [3]:
train.head()

Unnamed: 0,id,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,0,58,1,4,152,239,0,0,158,1,3.6,2,2,7,Presence
1,1,52,1,1,125,325,0,2,171,0,0.0,1,0,3,Absence
2,2,56,0,2,160,188,0,2,151,0,0.0,1,0,3,Absence
3,3,44,0,3,134,229,0,2,150,0,1.0,2,0,3,Absence
4,4,58,1,4,140,234,0,2,125,1,3.8,2,3,3,Presence


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630000 entries, 0 to 629999
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       630000 non-null  int64  
 1   Age                      630000 non-null  int64  
 2   Sex                      630000 non-null  int64  
 3   Chest pain type          630000 non-null  int64  
 4   BP                       630000 non-null  int64  
 5   Cholesterol              630000 non-null  int64  
 6   FBS over 120             630000 non-null  int64  
 7   EKG results              630000 non-null  int64  
 8   Max HR                   630000 non-null  int64  
 9   Exercise angina          630000 non-null  int64  
 10  ST depression            630000 non-null  float64
 11  Slope of ST              630000 non-null  int64  
 12  Number of vessels fluro  630000 non-null  int64  
 13  Thallium                 630000 non-null  int64  
 14  Hear

In [5]:
train['Heart Disease'] = train['Heart Disease'].map({
    "Absence" : 0,
    "Presence" : 1
})

In [6]:
df = train.copy()
df1 = test.copy()

In [7]:

# Feature Engineering
df["Age_BP"] = df["Age"] * df["BP"]
df["Age_Chol"] = df["Age"] * df["Cholesterol"]
df["Chol_BP"] = df["Cholesterol"] * df["BP"]
df["ST_Exercise"] = df["ST depression"] * df["Exercise angina"]
df["HR_Exercise"] = df["Max HR"] * df["Exercise angina"]

## Ratio Features
df["Chol_Age_Ratio"] = df["Cholesterol"] / (df["Age"] + 1)
df["BP_Age_Ratio"] = df["BP"] / (df["Age"] + 1)
df["HR_Age_Ratio"] = df["Max HR"] / (df["Age"] + 1)

# Binning
df["Age_Group"] = pd.cut(df["Age"],
                         bins=[0, 40, 55, 65, 100],
                         labels=[0,1,2,3]).astype(int)

df["Chol_Level"] = pd.cut(df["Cholesterol"],
                          bins=[0, 200, 240, 600],
                          labels=[0,1,2]).astype(int)


# Risk Score

df["Risk_Score"] = (
    df["Age"] * 0.03 +
    df["Cholesterol"] * 0.002 +
    df["BP"] * 0.01 +
    df["ST depression"] * 2 +
    df["Exercise angina"] * 5
)


In [8]:
# Feature Engineering
df1["Age_BP"] = df1["Age"] * df1["BP"]
df1["Age_Chol"] = df1["Age"] * df1["Cholesterol"]
df1["Chol_BP"] = df1["Cholesterol"] * df1["BP"]
df1["ST_Exercise"] = df1["ST depression"] * df1["Exercise angina"]
df1["HR_Exercise"] = df1["Max HR"] * df1["Exercise angina"]

# Ratio Features
df1["Chol_Age_Ratio"] = df1["Cholesterol"] / (df1["Age"] + 1)
df1["BP_Age_Ratio"] = df1["BP"] / (df1["Age"] + 1)
df1["HR_Age_Ratio"] = df1["Max HR"] / (df1["Age"] + 1)

# Binning
df1["Age_Group"] = pd.cut(
    df1["Age"],
    bins=[0, 40, 55, 65, 100],
    labels=[0, 1, 2, 3]
).astype(int)

df1["Chol_Level"] = pd.cut(
    df1["Cholesterol"],
    bins=[0, 200, 240, 600],
    labels=[0, 1, 2]
).astype(int)

# Risk Score
df1["Risk_Score"] = (
    df1["Age"] * 0.03 +
    df1["Cholesterol"] * 0.002 +
    df1["BP"] * 0.01 +
    df1["ST depression"] * 2 +
    df1["Exercise angina"] * 5
)


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,StandardScaler

In [10]:
num_pipeline = Pipeline([
    ('imputer',SimpleImputer(strategy = 'median')),
    ('scaler',StandardScaler())
])

cat_pipeline = Pipeline([
    ('ordinal_encoder',OrdinalEncoder()),
    ('imputer',SimpleImputer(strategy = 'most_frequent')),
    ('cat_encoder',OneHotEncoder(sparse_output = False))
])

In [11]:
df.columns

Index(['id', 'Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol',
       'FBS over 120', 'EKG results', 'Max HR', 'Exercise angina',
       'ST depression', 'Slope of ST', 'Number of vessels fluro', 'Thallium',
       'Heart Disease', 'Age_BP', 'Age_Chol', 'Chol_BP', 'ST_Exercise',
       'HR_Exercise', 'Chol_Age_Ratio', 'BP_Age_Ratio', 'HR_Age_Ratio',
       'Age_Group', 'Chol_Level', 'Risk_Score'],
      dtype='object')

In [12]:
for col in df.columns:
    if df[col].dtype == bool:
        df[col] = df[col].astype(int)
        df1[col] = df1[col].astype(int)

In [13]:
def separateColumn(df,include_bool = False):
    if include_bool:
        num_cols = df.select_dtypes(include = ['int64','float64','bool']).columns.tolist()
        cat_cols = df.select_dtypes(exclude = ['int64','float64','bool']).columns.tolist()
    else:
        num_cols = df.select_dtypes(include = ['int64','float64']).columns.tolist()
        cat_cols = df.select_dtypes(exclude = ['int64','float64']).columns.tolist()
    return num_cols,cat_cols
        

In [14]:
from sklearn.compose import ColumnTransformer

In [15]:
X = train.drop('Heart Disease', axis=1)

num_attrib, cat_attrib = separateColumn(X)
del num_attrib[-1]
preprocessPipeline = ColumnTransformer([
    ('num',num_pipeline,num_attrib),
    ('cat',cat_pipeline,cat_attrib)
])

In [16]:
df.columns.tolist()

['id',
 'Age',
 'Sex',
 'Chest pain type',
 'BP',
 'Cholesterol',
 'FBS over 120',
 'EKG results',
 'Max HR',
 'Exercise angina',
 'ST depression',
 'Slope of ST',
 'Number of vessels fluro',
 'Thallium',
 'Heart Disease',
 'Age_BP',
 'Age_Chol',
 'Chol_BP',
 'ST_Exercise',
 'HR_Exercise',
 'Chol_Age_Ratio',
 'BP_Age_Ratio',
 'HR_Age_Ratio',
 'Age_Group',
 'Chol_Level',
 'Risk_Score']

In [17]:
X_train = preprocessPipeline.fit_transform(df.drop('Heart Disease',axis=1))

In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_train = le.fit_transform(df['Heart Disease'])


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier

In [20]:
Xtrain,xtest,ytrain,ytest = train_test_split(X_train,y_train,test_size = 0.2,random_state=2026)

In [21]:
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    use_label_encoder=False,
    early_stopping_rounds=20, 
    eval_metric='logloss'
)

In [22]:
model.fit(
    Xtrain, ytrain,
    eval_set=[(xtest, ytest)],
    verbose=True
)

[0]	validation_0-logloss:0.66562
[1]	validation_0-logloss:0.64433
[2]	validation_0-logloss:0.62368
[3]	validation_0-logloss:0.60482
[4]	validation_0-logloss:0.58773
[5]	validation_0-logloss:0.57187


Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


[6]	validation_0-logloss:0.55737
[7]	validation_0-logloss:0.54382
[8]	validation_0-logloss:0.53117
[9]	validation_0-logloss:0.51931
[10]	validation_0-logloss:0.50831
[11]	validation_0-logloss:0.49809
[12]	validation_0-logloss:0.48863
[13]	validation_0-logloss:0.47975
[14]	validation_0-logloss:0.47142
[15]	validation_0-logloss:0.46384
[16]	validation_0-logloss:0.45646
[17]	validation_0-logloss:0.44943
[18]	validation_0-logloss:0.44296
[19]	validation_0-logloss:0.43689
[20]	validation_0-logloss:0.43110
[21]	validation_0-logloss:0.42560
[22]	validation_0-logloss:0.42041
[23]	validation_0-logloss:0.41558
[24]	validation_0-logloss:0.41096
[25]	validation_0-logloss:0.40669
[26]	validation_0-logloss:0.40256
[27]	validation_0-logloss:0.39873
[28]	validation_0-logloss:0.39507
[29]	validation_0-logloss:0.39156
[30]	validation_0-logloss:0.38825
[31]	validation_0-logloss:0.38500
[32]	validation_0-logloss:0.38193
[33]	validation_0-logloss:0.37905
[34]	validation_0-logloss:0.37625
[35]	validation_0-

In [23]:
X_test = preprocessPipeline.fit_transform(df1)

In [24]:
y_pred2 = model.predict_proba(X_test)

In [25]:
len(y_pred2)

270000

In [26]:
len(test['id'])

270000

In [27]:
submission = pd.DataFrame({
    'id': test['id'],
    'Heart Disease': y_pred2[:,1] 
})
submission.to_csv('submission.csv', index=False)