In [1]:
import pandas as pd

from sklearn.dummy import DummyClassifier

from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()
data['age'] = round(data['age'] / 365)
target = data['cardio']

In [4]:
x_train, x_val, y_train, y_val = train_test_split(data.drop('cardio', axis=1), data['cardio'], test_size=0.2, random_state=0)

In [5]:
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(x_train, y_train)

dummy_clf.score(x_val, y_val)

0.4988392857142857

In [6]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()
lr_clf.fit(x_train, y_train)

lr_clf.score(x_val, y_val)

0.6954464285714286

In [7]:

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


var_to_be_used = ['gluc', 'cholesterol', 'ap_hi', 'age', 'ap_lo', 'weight']
numerical_features = ['ap_hi', 'ap_lo', 'age', 'weight']
categorical_features = ['gluc', 'cholesterol']


preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

x_train = preprocessor.fit_transform(x_train)
x_val = preprocessor.transform(x_val)

In [8]:
import optuna

from sklearn.metrics import accuracy_score, precision_score
def objective(trial):

    params = {
        'C': trial.suggest_loguniform('C', 1e-3, 1e3),
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
        'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
        'max_iter': trial.suggest_int('max_iter', 100, 1000)
    }


    optuna_model = LogisticRegression(**params)
    optuna_model.fit(x_train, y_train)


    y_pred = optuna_model.predict(x_val)


    precision = precision_score(y_val, y_pred)
    return precision
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=8, timeout=600 * 3)

  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2023-04-20 17:31:56,364][0m A new study created in memory with name: no-name-5a433d3e-3f76-4946-b28b-af1700e465af[0m
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
[32m[I 2023-04-20 17:31:56,621][0m Trial 2 finished with value: 0.6500377928949358 and parameters: {'C': 0.12565452644044972, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 378}. Best is trial 2 with value: 0.6500377928949358.[0m
  'C': trial.suggest_loguniform('C', 1e-3, 1e3),
[32m[I 2023-04-20 17:31:56,694][0m Trial 7 finished with value: 0.6505295007564297 and parameters: {'C': 0.13311171213926917, 'penalty': 'l2', 'solver': 'saga', 'max_iter': 375}. Best is trial 7 w

In [8]:
trial = study.best_trial
params = trial.params

lr_clf = LogisticRegression(**params)

In [9]:
params

{'C': 57.937754555854134,
 'penalty': 'l1',
 'solver': 'liblinear',
 'max_iter': 443}

In [10]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=0)


from sklearn.model_selection import cross_val_score


scores = cross_val_score(lr_clf, x_train, y_train, cv=kf, scoring='precision', n_jobs=8)
scores_accuracy = cross_val_score(lr_clf, x_train, y_train, cv=kf, scoring='accuracy', n_jobs=8)


print('precision: {:.2f} +/- {:.2f}'.format(scores.mean(), scores.std()))
print('accuracy: {:.2f} +/- {:.2f}'.format(scores_accuracy.mean(), scores_accuracy.std()))

precision: 0.74 +/- 0.01
accuracy: 0.72 +/- 0.00


In [12]:
full_data = preprocessor.transform(data.drop('cardio', axis=1))
lr_clf.fit(full_data, data['cardio'])
logistic_regression_pred = lr_clf.predict(x_val)

pd.DataFrame(logistic_regression_pred).to_csv('logistic_regression.csv', index=False, header=False)

In [17]:
data_test = pd.read_csv('test.csv')
test_target = data_test['cardio']
data_test['age'] = round(data_test['age'] / 365)
data_test.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,2,52.0,1,165,64.0,130,70,3,1,0,0,0,1
1,15,62.0,1,169,80.0,120,80,1,1,0,0,1,0
2,18,41.0,2,165,60.0,120,80,1,1,0,0,0,0
3,24,46.0,2,172,112.0,120,80,1,1,0,0,0,1
4,31,59.0,1,157,69.0,130,80,1,1,0,0,1,0


In [18]:
data_test['cardio'].shape

(14000,)

In [19]:
x_test = preprocessor.transform(data_test)
y_test = lr_clf.predict(x_test)

In [20]:
from sklearn.metrics import accuracy_score
precision_score(data_test['cardio'], y_test)

0.7444515921518173