In [1]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()
data['age'] = data['age'] / 365

In [4]:
x_train, x_val, y_train, y_val = train_test_split(data.drop('cardio', axis=1), data['cardio'], test_size=0.2, random_state=0)

In [5]:
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(x_train, y_train)

dummy_clf.score(x_val, y_val)

0.4988392857142857

In [6]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_train, y_train)

dt_clf.score(x_val, y_val)

0.631875

In [7]:
#import min max scaler, one hot encoder, and column transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
# pode deixar dessa forma, se tiver mt ruim a gente muda
x_train, x_val, y_train, y_val = train_test_split(data.drop('cardio', axis=1), data['cardio'], test_size=0.2, random_state=0)
var_to_be_used = ['ap_hi', 'ap_lo', 'age', 'weight', 'gluc', 'cholesterol', 'smoke', 'alco', 'active']

numerical_features = ['ap_hi', 'ap_lo', 'weight', 'age']
categorical_features = ['gluc', 'cholesterol', 'smoke', 'alco', 'active',]

# create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

data_transformed = preprocessor.fit_transform(x_train)
val_transformed = preprocessor.transform(x_val)

In [10]:
# xgb_clf = xgb.XGBClassifier()
import xgboost as xgb
import optuna
#import accuracy score
from sklearn.metrics import accuracy_score, precision_score
def objective(trial):
    """Define the objective function"""

    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 1, 100),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 100),
        'splitter': trial.suggest_categorical('splitter', ['best', 'random']),
        'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None]),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 2, 100),
        'max_features' : trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2', None]),
    }

    # Fit the model
    optuna_model = DecisionTreeClassifier(**params)
    optuna_model.fit(data_transformed, y_train)

    # Make predictions
    y_pred = optuna_model.predict(val_transformed)

    # Evaluate predictions
    accuracy = precision_score(y_val, y_pred)

    return accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=8, timeout=600 * 3)

[32m[I 2023-04-20 17:16:54,556][0m A new study created in memory with name: no-name-40b374a9-97d0-4bed-bfa9-945d0a0af13b[0m
[32m[I 2023-04-20 17:16:54,621][0m Trial 1 finished with value: 0.6813868613138686 and parameters: {'criterion': 'gini', 'max_depth': 80, 'min_samples_split': 21, 'min_samples_leaf': 25, 'splitter': 'random', 'max_features': 'auto', 'max_leaf_nodes': 13}. Best is trial 1 with value: 0.6813868613138686.[0m
[32m[I 2023-04-20 17:16:54,633][0m Trial 2 finished with value: 0.6199807877041307 and parameters: {'criterion': 'gini', 'max_depth': 98, 'min_samples_split': 73, 'min_samples_leaf': 38, 'splitter': 'random', 'max_features': 'sqrt', 'max_leaf_nodes': 20}. Best is trial 1 with value: 0.6813868613138686.[0m
[32m[I 2023-04-20 17:16:54,643][0m Trial 4 finished with value: 0.6142160540135033 and parameters: {'criterion': 'gini', 'max_depth': 16, 'min_samples_split': 23, 'min_samples_leaf': 29, 'splitter': 'random', 'max_features': 'auto', 'max_leaf_nodes': 

In [11]:
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.7906976744186046
  Params: 
    criterion: gini
    max_depth: 12
    min_samples_split: 91
    min_samples_leaf: 84
    splitter: best
    max_features: log2
    max_leaf_nodes: 43


In [12]:
trial.params

{'criterion': 'gini',
 'max_depth': 12,
 'min_samples_split': 91,
 'min_samples_leaf': 84,
 'splitter': 'best',
 'max_features': 'log2',
 'max_leaf_nodes': 43}

In [8]:
params = trial.params

model = DecisionTreeClassifier(**params)

In [9]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=0)

from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, data_transformed, y_train, cv=kf, scoring='precision', n_jobs=8)
scores_acc = cross_val_score(model, data_transformed, y_train, cv=kf, scoring='accuracy', n_jobs=8)

print('precision: {:.2f} +/- {:.2f}'.format(scores.mean(), scores.std()))
print('accuracy: {:.2f} +/- {:.2f}'.format(scores_acc.mean(), scores_acc.std()))

precision: 0.75 +/- 0.02
accuracy: 0.72 +/- 0.01


In [13]:
full_data = preprocessor.transform(data.drop('cardio', axis=1))
model.fit(full_data, data['cardio'])

In [14]:
decison_tree_pred = model.predict(val_transformed)

pd.DataFrame(decison_tree_pred).to_csv('decision_tree.csv', index=False, header=False)

In [16]:
model.fit(data_transformed, y_train)
precision_score(y_val, model.predict(val_transformed))

0.7631211335062862

In [18]:
data_test = pd.read_csv('test.csv')
data_test.head()
data_test['age'] = round(data_test['age'] / 365)

In [19]:
data_test_transformed = preprocessor.transform(data_test.drop('cardio', axis=1))
data_test_transformed

array([[0.01670792, 0.01390268, 0.28421053, ..., 0.        , 1.        ,
        0.        ],
       [0.01608911, 0.01489573, 0.36842105, ..., 0.        , 0.        ,
        1.        ],
       [0.01608911, 0.01489573, 0.26315789, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.01670792, 0.01489573, 0.48421053, ..., 1.        , 1.        ,
        0.        ],
       [0.0154703 , 0.01390268, 0.25789474, ..., 0.        , 0.        ,
        1.        ],
       [0.01608911, 0.01390268, 0.38421053, ..., 0.        , 0.        ,
        1.        ]])

In [21]:
y_pred = model.predict(data_test_transformed)
test_target = data_test['cardio']
precision_score(test_target, y_pred)

0.7819645143588806