In [42]:
import pandas as pd
# import dummy classifier
from sklearn.dummy import DummyClassifier
# import train_test_split
from sklearn.model_selection import train_test_split

In [43]:
data = pd.read_csv('cardio_train.csv', sep=',', index_col=0)

In [44]:
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [45]:
x_train, x_val, y_train, y_val = train_test_split(data.drop('cardio', axis=1), data['cardio'], test_size=0.2, random_state=0)

In [46]:
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(x_train, y_train)

dummy_clf.score(x_val, y_val)

0.49507142857142855

In [47]:
#Random Forest algorithm
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()
rf_clf.fit(x_train, y_train)

rf_clf.score(x_val, y_val)

0.7241428571428571

In [48]:
#import min max scaler, one hot encoder, and column transformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

var_to_be_used = ['gluc', 'cholesterol', 'ap_hi', 'age', 'ap_lo', 'weight']
numerical_features = ['ap_hi', 'ap_lo', 'age', 'weight']
categorical_features = ['gluc', 'cholesterol']

# create column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

x_train = preprocessor.fit_transform(x_train)
x_val = preprocessor.transform(x_val)

In [49]:
# import ransearchcv
from sklearn.model_selection import RandomizedSearchCV
# define the parameters
params = {
    'max_depth': [3, 5, 7, 9, 11, 13, 15],
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'criterion':['gini', 'entropy', 'log_loss']
}
# create the model
rf_clf = RandomForestClassifier()
# create the random search cv
random_search = RandomizedSearchCV(rf_clf, param_distributions=params, n_iter=5, scoring='roc_auc', n_jobs=-1, cv=5, verbose=3)
# fit the model
random_search.fit(x_train, y_train)

random_search.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


{'n_estimators': 500,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 9,
 'criterion': 'log_loss'}

In [50]:
# train randomforest with the best params
rf_clf = RandomForestClassifier(**random_search.best_params_)
rf_clf.fit(x_train, y_train)

In [51]:
rf_clf.score(x_train, y_train)


0.7420535714285714

In [52]:
rf_clf.score(x_val, y_val)


0.7317142857142858

In [53]:
data_test = pd.read_csv('test.csv')


In [54]:
x_test = preprocessor.transform(data_test)
y_test = rf_clf.predict(x_test)

In [55]:
rf_clf.score

<bound method ClassifierMixin.score of RandomForestClassifier(criterion='log_loss', max_depth=9, min_samples_leaf=4,
                       min_samples_split=5, n_estimators=500)>

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(data_test['cardio'], y_test)

0.7352142857142857