In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import Normalizer
from catboost import CatBoostClassifier
import random
import re
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Function

In [2]:
def read_all_file_paths(root_directory='kaggle/input'):
    data_dict = {}
    for root, _, files in os.walk(root_directory):
        for filename in files:
            extracted_text = re.match(r'^([^.]*)\.', filename)
            if extracted_text:
                data_dict[extracted_text.group(1)] = os.path.join(root, filename)
    return data_dict


In [3]:
def preprocess_min_max(data):
    scaler = MinMaxScaler()
    data_scaled = scaler.fit_transform(data)
    return data_scaled
    

In [4]:
def preprocess_normalize(data):
    scaler = Normalizer()
    data_scaled = scaler.fit_transform(data)
    return data_scaled
    

## Feature engineering function

In [6]:
def categorize_age(age):
    if age >= 18 and age <= 30:
        return 1  # Group 1: 18-30
    elif age > 30 and age <= 45:
        return 2  # Group 2: 31-45
    elif age > 45 and age <= 60:
        return 3  # Group 3: 46-60
    else:
        return 4  #
# Create the 'AgeGroup' column based on age ranges


In [7]:
def categorize_blood_status(systolic_value):
    normal_threshold = 120
    if systolic_value < normal_threshold:
        return 1 #normal
    else:
        return 2 #abnormal

# Create the 'blood status' column based on 'systolic'


In [8]:
def calculate_lifestyle_factors(row):
    age_score = (row['age'] <= 30).astype(int)  # Age 30 or younger
    bmi_score = (row['BMI'] <= 25).astype(int)  # BMI 25 or lower
    blood_status_score = (row['blood status'] == 1).astype(int)  # Blood status 1
    cholesterol_ratio_score = (row['CholesterolRatio'] <= 4.0).astype(int)  # Cholesterol ratio 4.0 or lower
    kidney_function_score = (row['KidneyFunctionIndicator'] <= 0.2).astype(int)  # Kidney function <= 0.2

    return age_score + bmi_score + blood_status_score + cholesterol_ratio_score + kidney_function_score

## Read File and Info

In [9]:
data_dict = read_all_file_paths()

In [10]:
data_dict

{'sample_submission': 'kaggle/input\\sample_submission.csv',
 'test': 'kaggle/input\\test.csv',
 'train': 'kaggle/input\\train.csv'}

In [11]:
data = pd.read_csv(data_dict["train"]).drop(['id'],axis=1)


In [12]:
correlation_matrix = data.corr()
correlation_with_target = correlation_matrix['smoking']
correlation_with_target

age                   -0.206033
height(cm)             0.447111
weight(kg)             0.351748
waist(cm)              0.262715
eyesight(left)         0.100420
eyesight(right)        0.109781
hearing(left)         -0.038219
hearing(right)        -0.036858
systolic               0.058642
relaxation             0.109501
fasting blood sugar    0.096534
Cholesterol           -0.051896
triglyceride           0.331975
HDL                   -0.271186
LDL                   -0.072285
hemoglobin             0.450679
Urine protein         -0.028548
serum creatinine       0.272979
AST                    0.059394
ALT                    0.163016
Gtp                    0.305561
dental caries          0.106636
smoking                1.000000
Name: smoking, dtype: float64

## Add Feature

In [13]:
data['BMI'] = data['weight(kg)'] / ((data['height(cm)']/100) ** 2)
data['blood status'] = data['systolic'].apply(categorize_blood_status)
data['CholesterolRatio'] = data['HDL'] / data['LDL']
data['KidneyFunctionIndicator'] = data['serum creatinine'] * ((data['AST'] + data['ALT']) / 2)
data['LifestyleFactors'] = data.apply(calculate_lifestyle_factors, axis=1)



## Training DataSet

In [14]:
X, y = data.drop(['smoking'],axis=1), data['smoking']
# X = preprocess_min_max(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [15]:
#{'random_state': 20, 'learning_rate': 0.1, 'iterations': 1000, 'depth': 6}
param_dist = {
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [6, 8, 10],
    'iterations': [100, 200, 300,1000],
    'random_state':[20,24,42]
    
}
# Create the CatBoostClassifier
catboost = CatBoostClassifier()

# Perform Randomized Search with cross-validation
random_search = RandomizedSearchCV(catboost, param_distributions=param_dist, n_iter=10, cv=5, scoring='accuracy', random_state=42)
random_search.fit(X_train, y_train)

# Get the best parameters and the best estimator
best_params = random_search.best_params_
best_estimator = random_search.best_estimator_

0:	learn: 0.6397731	total: 200ms	remaining: 19.8s
1:	learn: 0.6011040	total: 267ms	remaining: 13.1s
2:	learn: 0.5711612	total: 335ms	remaining: 10.8s
3:	learn: 0.5484385	total: 394ms	remaining: 9.46s
4:	learn: 0.5308496	total: 452ms	remaining: 8.58s
5:	learn: 0.5167594	total: 506ms	remaining: 7.93s
6:	learn: 0.5052844	total: 561ms	remaining: 7.45s
7:	learn: 0.4968644	total: 615ms	remaining: 7.08s
8:	learn: 0.4896813	total: 672ms	remaining: 6.8s
9:	learn: 0.4834758	total: 739ms	remaining: 6.65s
10:	learn: 0.4782585	total: 795ms	remaining: 6.43s
11:	learn: 0.4738708	total: 855ms	remaining: 6.27s
12:	learn: 0.4698202	total: 918ms	remaining: 6.15s
13:	learn: 0.4665620	total: 976ms	remaining: 5.99s
14:	learn: 0.4637441	total: 1.03s	remaining: 5.87s
15:	learn: 0.4614135	total: 1.09s	remaining: 5.73s
16:	learn: 0.4595092	total: 1.14s	remaining: 5.59s
17:	learn: 0.4574058	total: 1.2s	remaining: 5.46s
18:	learn: 0.4556275	total: 1.25s	remaining: 5.34s
19:	learn: 0.4539682	total: 1.31s	remaining

In [26]:
best_params

{'random_state': 20, 'learning_rate': 0.1, 'iterations': 1000, 'depth': 6}

## Accuracy

In [22]:
from sklearn.metrics import roc_curve, roc_auc_score
test_predictions = best_estimator.predict_proba(X_test)[:, 1]
y_pred_binary = best_estimator.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_binary)
roc_auc #0.8634032005983142

0.8640529725548854

## TEST

In [23]:
best_estimator.fit(X,y)


0:	learn: 0.6465200	total: 22.6ms	remaining: 22.6s
1:	learn: 0.6100053	total: 60.7ms	remaining: 30.3s
2:	learn: 0.5813123	total: 84.7ms	remaining: 28.2s
3:	learn: 0.5587548	total: 108ms	remaining: 26.8s
4:	learn: 0.5432521	total: 130ms	remaining: 25.9s
5:	learn: 0.5299815	total: 151ms	remaining: 25s
6:	learn: 0.5193257	total: 171ms	remaining: 24.3s
7:	learn: 0.5114576	total: 192ms	remaining: 23.8s
8:	learn: 0.5046072	total: 213ms	remaining: 23.4s
9:	learn: 0.4987839	total: 249ms	remaining: 24.6s
10:	learn: 0.4948685	total: 267ms	remaining: 24s
11:	learn: 0.4909078	total: 285ms	remaining: 23.5s
12:	learn: 0.4879470	total: 303ms	remaining: 23s
13:	learn: 0.4845699	total: 321ms	remaining: 22.6s
14:	learn: 0.4815824	total: 341ms	remaining: 22.4s
15:	learn: 0.4798920	total: 361ms	remaining: 22.2s
16:	learn: 0.4778766	total: 381ms	remaining: 22s
17:	learn: 0.4758613	total: 399ms	remaining: 21.8s
18:	learn: 0.4743281	total: 417ms	remaining: 21.6s
19:	learn: 0.4727781	total: 434ms	remaining: 2

<catboost.core.CatBoostClassifier at 0x2786f350>

In [24]:
df_test = pd.read_csv(data_dict["test"])
X_test = df_test.drop(['id'], axis = 1)
X_test['BMI'] = X_test['weight(kg)'] / ((X_test['height(cm)']/100) ** 2)
X_test['blood status'] = X_test['systolic'].apply(categorize_blood_status)
X_test['CholesterolRatio'] = X_test['HDL'] / X_test['LDL']
X_test['KidneyFunctionIndicator'] = X_test['serum creatinine'] * ((X_test['AST'] + X_test['ALT']) / 2)
X_test['LifestyleFactors'] = X_test.apply(calculate_lifestyle_factors, axis=1)



# X['AgeGroup'] = X['age'].apply(categorize_age)
test_predictions =  best_estimator.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({'id': df_test['id'], 'smoking': test_predictions})
submission.to_csv('submission.csv', index=False)

In [25]:
submission.head()

Unnamed: 0,id,smoking
0,159256,0.592616
1,159257,0.292607
2,159258,0.335983
3,159259,0.013545
4,159260,0.645693
