In [1]:
import pandas as pd

from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('train.csv', sep=',', index_col=0)

In [3]:
data['age'] = round(data['age'] / 365)
target = data['cardio']
data.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
66728,60.0,1,156,64.0,140,80,2,1,0,0,1,1
69098,60.0,1,170,85.0,160,90,1,1,0,0,1,1
59185,64.0,1,151,90.0,130,80,1,1,0,0,1,1
49288,54.0,1,159,97.0,120,80,1,1,0,0,1,1
62481,50.0,1,164,68.0,120,80,1,1,0,0,1,0


In [4]:
x_train, x_val, y_train, y_val = train_test_split(data.drop('cardio', axis=1), data['cardio'], test_size=0.2, random_state=0)

In [5]:
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(x_train, y_train)

dummy_clf.score(x_val, y_val)

0.4988392857142857

In [6]:
#Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()
rf_clf.fit(x_train, y_train)

rf_clf.score(x_val, y_val)

0.7125

In [7]:

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

var_to_be_used = ['gluc', 'cholesterol', 'ap_hi', 'age', 'ap_lo', 'weight']
numerical_features = ['ap_hi', 'ap_lo', 'age', 'weight']
categorical_features = ['gluc', 'cholesterol']

# create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

x_train = preprocessor.fit_transform(x_train)
x_val = preprocessor.transform(x_val)

In [10]:
import optuna

from sklearn.metrics import accuracy_score
def objective(trial):
    """Define the objective function"""

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 20),
        'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5),
    }


    optuna_model = RandomForestClassifier(**params)
    optuna_model.fit(x_train, y_train)


    y_pred = optuna_model.predict(x_val)


    precision = precision_score(y_val, y_pred)
    return accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=8, timeout=600 * 3)

  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2023-04-19 17:31:32,031][0m A new study created in memory with name: no-name-69238499-6bc6-429a-83d1-e7974b09f196[0m
  warn(
[32m[I 2023-04-19 17:31:34,747][0m Trial 2 finished with value: 0.7221428571428572 and parameters: {'n_estimators': 157, 'max_depth': 13, 'min_samples_split': 6, 'min_samples_leaf': 10, 'max_features': 'sqrt', 'bootstrap': False, 'criterion': 'gini', 'max_leaf_nodes': 6, 'min_weight_fraction_leaf': 0.3661706702205361}. Best is trial 2 with value: 0.7221428571428572.[0m
[32m[I 2023-04-19 17:31:35,069][0m Trial 1 finished with value: 0.7197321428571428 and parameters: {'n_estimators': 139, 'max_depth': 11, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'bootstrap': True, 'criterion': 'gini', 'max_leaf_nodes': 20, 'min_weight_fraction_leaf': 0.27462269158877056}. Best is trial 2 with value: 0.7221428571428572.[0m
[32m[I 2023-04-19 17:31:36,459][0m Trial 3 finished with value:

In [8]:
trial = study.best_trial
params = trial.params

rf_clf = RandomForestClassifier(**params)

In [9]:
params

{'n_estimators': 158,
 'max_depth': 7,
 'min_samples_split': 10,
 'min_samples_leaf': 7,
 'max_features': 'auto',
 'bootstrap': False,
 'criterion': 'gini',
 'max_leaf_nodes': 14,
 'min_weight_fraction_leaf': 0.007871254791640473}

In [10]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=0)

from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf_clf, x_train, y_train, cv=kf, scoring='accuracy', n_jobs=8)
precision_scores = cross_val_score(rf_clf, x_train, y_train, cv=kf, scoring='precision', n_jobs=8)

print('Accuracy: {:.2f} +/- {:.2f}'.format(scores.mean(), scores.std()))
print('Precision: {:.2f} +/- {:.2f}'.format(precision_scores.mean(), precision_scores.std()))

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Accuracy: 0.73 +/- 0.01
Precision: 0.76 +/- 0.01


In [11]:
full_data = preprocessor.transform(data.drop('cardio', axis=1))
rf_clf.fit(full_data, data['cardio'])
random_forest_pred = rf_clf.predict(x_val)

pd.DataFrame(random_forest_pred).to_csv('random_forest.csv', index=False, header=False)

  warn(


In [18]:
rf_clf.score(x_val, y_val)


0.7358928571428571

In [19]:
data_test = pd.read_csv('test.csv')
test_target = data_test['cardio']
data_test['age'] = round(data_test['age'] / 365)
data_test.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,2,52.0,1,165,64.0,130,70,3,1,0,0,0,1
1,15,62.0,1,169,80.0,120,80,1,1,0,0,1,0
2,18,41.0,2,165,60.0,120,80,1,1,0,0,0,0
3,24,46.0,2,172,112.0,120,80,1,1,0,0,0,1
4,31,59.0,1,157,69.0,130,80,1,1,0,0,1,0


In [20]:
x_test = preprocessor.transform(data_test)
y_test = rf_clf.predict(x_test)

In [21]:
from sklearn.metrics import accuracy_score
accuracy_score(data_test['cardio'], y_test)

0.7250714285714286