In [1]:
import pandas as pd

from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled training table from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Rescale `age` by 365 (presumably days -> years; confirm against the dataset description).
# NOTE: this overwrites the column in place, so running the cell twice divides twice;
# re-run the cell that loads `data` before re-running this one.
data['age'] = data['age'] / 365

# The preview has to be the last expression of the cell, otherwise its result is
# discarded and nothing is rendered.
data.head()

In [4]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    data.drop(columns='cardio'),
    data['cardio'],
    test_size=0.2,
    random_state=0,
)

In [5]:
# Baseline: a classifier that always predicts the most frequent training label.
dummy_clf = DummyClassifier(strategy='most_frequent').fit(x_train, y_train)

# Accuracy of that baseline on the validation split.
dummy_clf.score(x_val, y_val)

0.4988392857142857

In [6]:

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, trained on the unscaled feature columns.
nb_clf = GaussianNB().fit(x_train, y_train)

# Validation accuracy, for comparison with the dummy baseline.
nb_clf.score(x_val, y_val)

0.5578571428571428

In [13]:

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import CategoricalNB

# Rebuild the 80/20 split (same seed as the earlier split cell) so this cell is self-contained.
x_train, x_val, y_train, y_val = train_test_split(
    data.drop(columns='cardio'),
    data['cardio'],
    test_size=0.2,
    random_state=0,
)

# Column groups. Only `numerical_features` is passed to the transformer below;
# the other two lists are defined but not used in this cell.
numerical_features = ['ap_hi', 'ap_lo', 'weight', 'age']
categorical_features = ['gluc', 'cholesterol', 'smoke', 'alco', 'active']
var_to_be_used = ['ap_hi', 'ap_lo', 'age', 'weight', 'gluc', 'cholesterol', 'smoke', 'alco', 'active']

# Standardise the numerical columns.
preprocessor = ColumnTransformer(
    transformers=[('num', StandardScaler(), numerical_features)],
)

# Fit the scaler on the training split only, then apply the same scaling to validation.
data_transformed = preprocessor.fit_transform(x_train)
val_transformed = preprocessor.transform(x_val)

In [8]:
# Sanity check: (rows in the training split, number of columns produced by the preprocessor).
data_transformed.shape

(44800, 4)

In [85]:

import xgboost as xgb
import optuna

from sklearn.metrics import accuracy_score, precision_score
def objective(trial):
    """Optuna objective: validation precision of a GaussianNB for one sampled setting.

    Args:
        trial: Optuna trial used to sample ``var_smoothing`` from [0.0, 1.0].

    Returns:
        Precision of the fitted model's predictions for ``val_transformed``
        measured against ``y_val``.
    """
    # `suggest_float` is the supported replacement for the deprecated
    # `suggest_uniform`; it samples from the same range.
    params = {
        'var_smoothing': trial.suggest_float('var_smoothing', 0.0, 1.0),
    }

    # Fit on the scaled training split (notebook-level globals).
    optuna_model = GaussianNB(**params)
    optuna_model.fit(data_transformed, y_train)

    # Score the validation predictions; this value is what the study optimises.
    y_pred = optuna_model.predict(val_transformed)
    return precision_score(y_val, y_pred)
# Study that maximises the value returned by `objective` (validation precision).
study = optuna.create_study(direction='maximize')
# Budget: at most 1000 trials and a 1800 s (600 * 3) timeout, with 8 parallel jobs.
study.optimize(func=objective, n_trials=1000, timeout=1800, n_jobs=8)

[32m[I 2023-04-20 17:04:39,081][0m A new study created in memory with name: no-name-0d198b1a-48c1-4da4-8024-ed8a6916f26b[0m
  'var_smoothing': trial.suggest_uniform('var_smoothing', 0.0, 1.0),
  'var_smoothing': trial.suggest_uniform('var_smoothing', 0.0, 1.0),
  'var_smoothing': trial.suggest_uniform('var_smoothing', 0.0, 1.0),
  'var_smoothing': trial.suggest_uniform('var_smoothing', 0.0, 1.0),
  'var_smoothing': trial.suggest_uniform('var_smoothing', 0.0, 1.0),
  'var_smoothing': trial.suggest_uniform('var_smoothing', 0.0, 1.0),
  'var_smoothing': trial.suggest_uniform('var_smoothing', 0.0, 1.0),
  'var_smoothing': trial.suggest_uniform('var_smoothing', 0.0, 1.0),
[32m[I 2023-04-20 17:04:39,118][0m Trial 5 finished with value: 0.7389558232931727 and parameters: {'var_smoothing': 0.06440806151646028}. Best is trial 5 with value: 0.7389558232931727.[0m
  'var_smoothing': trial.suggest_uniform('var_smoothing', 0.0, 1.0),
[32m[I 2023-04-20 17:04:39,123][0m Trial 0 finished with 

In [86]:
# Summarise the search: trial count first, then the best trial's score and settings.
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial  # left as a notebook-level name; a later cell reads trial.params

print('  Value: {}'.format(trial.value))
print('  Params: ')
for item in trial.params.items():
    # Each item is a (name, value) pair, unpacked straight into the template.
    print('    {}: {}'.format(*item))

Number of finished trials: 1000
Best trial:
  Value: 0.7439271255060729
  Params: 
    var_smoothing: 0.6906875591579865


In [9]:
# Hyper-parameters of the best trial (`trial` is set by the best-trial report cell).
params = trial.params

# Fresh, unfitted GaussianNB configured with those hyper-parameters.
nb_clf = GaussianNB(**params)

In [10]:
# Show the hyper-parameters that were passed to GaussianNB.
params

{'var_smoothing': 0.6906875591579865}

In [11]:
from sklearn.model_selection import KFold, cross_val_score

# Shuffled 5-fold splitter with a fixed seed, reused for both metrics.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# NOTE(review): `data_transformed` was scaled before the folds were made, so each fold's
# held-out part influenced the scaler statistics; confirm this is acceptable.
cv_options = {'cv': kf, 'n_jobs': 8}
scores = cross_val_score(nb_clf, data_transformed, y_train, scoring='precision', **cv_options)
scores_acc = cross_val_score(nb_clf, data_transformed, y_train, scoring='accuracy', **cv_options)

# Mean and standard deviation across the folds.
print('precision: {:.2f} +/- {:.2f}'.format(scores.mean(), scores.std()))
print('accuracy: {:.2f} +/- {:.2f}'.format(scores_acc.mean(), scores_acc.std()))

precision: 0.75 +/- 0.01
accuracy: 0.55 +/- 0.01


In [16]:
# Predict the validation rows with a model that has only seen the training split.
# Previously the model was refitted on ALL rows of `data` (validation rows included)
# before predicting them, so the exported predictions were made on data the model
# had already been trained on.
nb_clf.fit(data_transformed, y_train)
naive_bayes_pred = nb_clf.predict(val_transformed)

# One prediction per validation row, written without index or header.
pd.DataFrame(naive_bayes_pred).to_csv('naive_bayes.csv', index=False, header=False)

# Refit on every labelled row for the test-set cells that follow.
# The scaler itself stays fitted on x_train only.
full_data = preprocessor.transform(data.drop('cardio', axis=1))
nb_clf.fit(full_data, data['cardio'])

In [107]:
# Load the test table and rescale `age` by 365, mirroring the training data preparation.
data_test = pd.read_csv('test.csv').assign(age=lambda frame: frame['age'] / 365)

In [108]:
# Ground-truth labels of the test table, used later to score the predictions.
test_target = data_test['cardio']

In [109]:
# Number of labelled rows in the test table.
data_test['cardio'].shape

(14000,)

In [110]:
# Apply the scaler fitted on the training split. `data_test` still contains `cardio`,
# but the preprocessor was built with the numerical feature columns only.
data_test_transformed = preprocessor.transform(data_test)
# Display the transformed array.
data_test_transformed

array([[ 0.00704968, -0.14038232, -0.70968504, -0.24983882],
       [-0.05777892, -0.08786342,  0.39849638,  1.23871443],
       [-0.05777892, -0.08786342, -0.98673039, -1.89766282],
       ...,
       [ 0.00704968, -0.08786342,  1.92224583, -0.51853221],
       [-0.12260752, -0.14038232, -1.05599173,  1.01500591],
       [-0.05777892, -0.14038232,  0.6062804 ,  0.62392127]])

In [111]:
# Predict the test rows with the current `nb_clf` and report precision against the true labels.
y_pred = nb_clf.predict(data_test_transformed)
precision_score(y_true=test_target, y_pred=y_pred)

0.7299444003177125

In [112]:
# Inspect the raw predicted labels for the test rows.
y_pred

array([0, 0, 0, ..., 0, 0, 0])