In [1]:
import pandas as pd
# import dummy classifier
from sklearn.dummy import DummyClassifier
# import train_test_split
from sklearn.model_selection import train_test_split
data = pd.read_csv('train.csv', sep=',', index_col=0)

In [2]:
data['age'] = round(data['age'] / 365)
target = data['cardio']
data.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
66728,60.0,1,156,64.0,140,80,2,1,0,0,1,1
69098,60.0,1,170,85.0,160,90,1,1,0,0,1,1
59185,64.0,1,151,90.0,130,80,1,1,0,0,1,1
49288,54.0,1,159,97.0,120,80,1,1,0,0,1,1
62481,50.0,1,164,68.0,120,80,1,1,0,0,1,0


In [3]:
x_train, x_val, y_train, y_val = train_test_split(data.drop('cardio', axis=1), data['cardio'], test_size=0.2, random_state=0)

In [4]:
#import min max scaler, one hot encoder, and column transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

var_to_be_used = ['gluc', 'cholesterol', 'ap_hi', 'age', 'ap_lo', 'weight']
numerical_features = ['ap_hi', 'ap_lo', 'age', 'weight']
categorical_features = ['gluc', 'cholesterol']

# create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

preprocessor_naive = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # ('cat', OneHotEncoder(), categorical_features)
    ]
)

x_train_all = preprocessor.fit_transform(data.drop('cardio', axis=1))
x_naive = preprocessor_naive.fit_transform(data.drop('cardio', axis=1))

In [5]:
random_forest_params = {'n_estimators': 158,
 'max_depth': 7,
 'min_samples_split': 10,
 'min_samples_leaf': 7,
 'max_features': 'auto',
 'bootstrap': False,
 'criterion': 'gini',
 'max_leaf_nodes': 14,
 'min_weight_fraction_leaf': 0.007871254791640473}

knn_params = {'n_neighbors': 18,
 'weights': 'uniform',
 'algorithm': 'ball_tree',
 'leaf_size': 100,
 'p': 9}

logistic_regression_params = {'C': 57.937754555854134,
 'penalty': 'l1',
 'solver': 'liblinear',
 'max_iter': 443}

xgboost_params = {'max_depth': 56,
 'learning_rate': 0.8362189302931353,
 'n_estimators': 469,
 'min_child_weight': 37,
 'gamma': 1.2174695610100952e-05,
 'subsample': 0.3973219220957762,
 'colsample_bytree': 0.055240716417472226,
 'reg_alpha': 0.0003095006245940739,
 'reg_lambda': 0.004388821550143689}

naive_bayes_params = {'var_smoothing': 0.6906875591579865}

decision_tree_params = {'criterion': 'gini',
 'max_depth': 12,
 'min_samples_split': 91,
 'min_samples_leaf': 84,
 'splitter': 'best',
 'max_features': 'log2',
 'max_leaf_nodes': 43}

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# create the models
random_forest = RandomForestClassifier(**random_forest_params)
knn = KNeighborsClassifier(**knn_params)
logistic_regression = LogisticRegression(**logistic_regression_params)
xgboost = XGBClassifier(**xgboost_params)
naive_bayes = GaussianNB(**naive_bayes_params)
decision_tree = DecisionTreeClassifier(**decision_tree_params)

# fit the models
random_forest.fit(x_train_all, data['cardio'])
knn.fit(x_train_all, data['cardio'])
logistic_regression.fit(x_train_all, data['cardio'])
xgboost.fit(x_train_all, data['cardio'])
naive_bayes.fit(x_naive, data['cardio'])
decision_tree.fit(x_train_all, data['cardio'])


# predict the models
random_forest_pred = random_forest.predict(x_train_all)
knn_pred = knn.predict(x_train_all)
logistic_regression_pred = logistic_regression.predict(x_train_all)
xgboost_pred = xgboost.predict(x_train_all)
naive_bayes_pred = naive_bayes.predict(x_naive)
decision_tree_pred = decision_tree.predict(x_train_all)

  warn(


In [6]:
models = [random_forest, knn, logistic_regression, xgboost]

In [7]:
results = pd.DataFrame({'Random Forest': random_forest_pred, 'KNN': knn_pred, 'Logistic Regression': logistic_regression_pred, 'XGBoost': xgboost_pred, 'Naive Bayes': naive_bayes_pred, 'Decision Tree': decision_tree_pred, 'Target': data['cardio']})

In [8]:
results.head()

Unnamed: 0_level_0,Random Forest,KNN,Logistic Regression,XGBoost,Naive Bayes,Decision Tree,Target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
66728,1,0,1,1,0,1,1
69098,1,1,1,1,0,1,1
59185,1,1,1,1,1,1,1
49288,0,1,0,0,0,1,1
62481,0,0,0,0,0,0,0


In [9]:
x_train, x_val, y_train, y_val = train_test_split(results.drop('Target', axis=1), results['Target'], test_size=0.2, random_state=0)

In [11]:
# create a logistic regression model to predict the target
from sklearn.linear_model import LogisticRegression
import optuna
from sklearn.metrics import accuracy_score, precision_score

# ensemble_model = LogisticRegression()
def objective(trial):
    """Define the objective function"""

    params = {
        'C': trial.suggest_loguniform('C', 1e-3, 1e3),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'max_iter': trial.suggest_int('max_iter', 100, 1000)
    }

    # Fit the model
    optuna_model = LogisticRegression(**params)
    optuna_model.fit(x_train, y_train)

    # Make predictions
    y_pred = optuna_model.predict(x_val)

    # Evaluate predictions
    accuracy = precision_score(y_val, y_pred)
    return accuracy
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, n_jobs=8, timeout=600 * 3)

[32m[I 2023-04-21 15:51:11,465][0m A new study created in memory with name: no-name-1045bc70-1938-47f1-9bf9-f35ba9b00b57[0m
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
[32m[I 2023-04-21 15:51:11,537][0m Trial 1 finished with value: 0.7584818861414606 and parameters: {'C': 0.007734154396819996, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 861}. Best is trial 1 with value: 0.7584818861414606.[0m
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
[32m[I 2023-04-21 15:51:11,569][0m Trial 0 finished with value: 0.7584355828220859 and parameters: {'C': 64.47640792854759, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 784}. Best is t

In [12]:
# train lr with the best params
trial = study.best_trial
params = trial.params
lr_clf = LogisticRegression(**params)
# lr_clf.fit(x_train, y_train)

In [13]:
params

{'C': 0.0023524548179123697,
 'penalty': 'l2',
 'solver': 'saga',
 'max_iter': 222}

In [14]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# import cross_val_score
from sklearn.model_selection import cross_val_score

# evaluate the model using 10-fold cross-validation
scores = cross_val_score(lr_clf, x_train, y_train, cv=kf, scoring='precision', n_jobs=8)

#compute the mean of the scores and the standard deviation
print('precision: {:.2f} +/- {:.2f}'.format(scores.mean(), scores.std()))

precision: 0.76 +/- 0.01


In [15]:
lr_clf.fit(results.drop('Target', axis=1), results['Target'])
# lr_clf.score(x_val, y_val)

In [31]:
data_test = pd.read_csv('test.csv')
test_target = data_test['cardio']
data_test['age'] = round(data_test['age'] / 365)
data_test.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,2,52.0,1,165,64.0,130,70,3,1,0,0,0,1
1,15,62.0,1,169,80.0,120,80,1,1,0,0,1,0
2,18,41.0,2,165,60.0,120,80,1,1,0,0,0,0
3,24,46.0,2,172,112.0,120,80,1,1,0,0,0,1
4,31,59.0,1,157,69.0,130,80,1,1,0,0,1,0


In [32]:
data_test['cardio'].shape

(14000,)

In [33]:
data_test.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,2,52.0,1,165,64.0,130,70,3,1,0,0,0,1
1,15,62.0,1,169,80.0,120,80,1,1,0,0,1,0
2,18,41.0,2,165,60.0,120,80,1,1,0,0,0,0
3,24,46.0,2,172,112.0,120,80,1,1,0,0,0,1
4,31,59.0,1,157,69.0,130,80,1,1,0,0,1,0


In [34]:
x_test = preprocessor.transform(data_test)
x_naive = preprocessor_naive.transform(data_test)

In [35]:
x_test.shape

(14000, 10)

In [37]:
random_forest_pred = random_forest.predict(x_test)
knn_pred = knn.predict(x_test)
logistic_regression_pred = logistic_regression.predict(x_test)
xgboost_pred = xgboost.predict(x_test)
naive_bayes_pred = naive_bayes.predict(x_naive)
decision_tree_pred = decision_tree.predict(x_test)

In [38]:

test_results = pd.DataFrame({'Random Forest': random_forest_pred, 'KNN': knn_pred, 'Logistic Regression': logistic_regression_pred, 'XGBoost': xgboost_pred, 'Naive Bayes': naive_bayes_pred, 'Decision Tree': decision_tree_pred, 'Target': data_test['cardio']})

In [39]:
test_results.head()

Unnamed: 0,Random Forest,KNN,Logistic Regression,XGBoost,Naive Bayes,Decision Tree,Target
0,1,1,1,1,0,1,1
1,0,1,1,0,0,1,0
2,0,0,0,0,0,0,0
3,0,1,0,0,0,0,1
4,1,0,1,1,0,1,0


In [40]:
precision_score(test_target, lr_clf.predict(test_results.drop('Target', axis=1)))

0.7491360351869306