In [13]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, log_loss, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier

import warnings
warnings.filterwarnings("ignore")

In [14]:
df = pd.read_csv('Kyphosis.csv')
df.sample(5)

Unnamed: 0,Kyphosis,Age,Number,Start
41,absent,35,3,13
31,absent,125,2,11
39,present,91,5,12
17,absent,175,5,13
21,present,105,6,5


In [15]:
X = df.drop('Kyphosis', axis=1)
y = df['Kyphosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=24)

In [16]:
lr = LogisticRegression()
bagg = BaggingClassifier(estimator=lr, n_estimators=15, random_state=24, oob_score=True)
bagg.fit(X_train, y_train)

In [17]:
y_pred = bagg.predict(X_test)
y_pred_prob = bagg.predict_proba(X_test)[:, 1]
print(accuracy_score(y_test, y_pred))
print(log_loss(y_test, y_pred_prob))
print(bagg.oob_score)

0.76
0.4406069140915052
True


In [18]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=24)
bagg = BaggingClassifier(estimator=lr, n_estimators=15, random_state=24, oob_score=True)
params = {
    'estimator__penalty': ['l2', None],
    'estimator__C': [0.001, 10, 5],
    'estimator__solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag']
}
gcv = GridSearchCV(bagg, param_grid=params, cv=kfold, scoring='neg_log_loss')
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

{'estimator__C': 5, 'estimator__penalty': 'l2', 'estimator__solver': 'sag'}
-0.4156956709419767


In [22]:
lr = LogisticRegression()
nb = GaussianNB()
svm = SVC(random_state=24, probability=True)
dtc = DecisionTreeClassifier(random_state=24)

bagg = BaggingClassifier(random_state=24)
params = {
    'estimator': [lr, nb, svm, dtc],
    'n_estimators': [10,15]
}
gcv = GridSearchCV(bagg, param_grid=params, cv=kfold, scoring='neg_log_loss')
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

{'estimator': GaussianNB(), 'n_estimators': 15}
-0.39593694921625355
