보팅분류기

In [112]:
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [113]:
cancer = load_breast_cancer()
data_df = pd.DataFrame(cancer.data, columns= cancer.feature_names)

In [114]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(cancer.data
                                                    , cancer.target
                                                    , test_size = 0.2
                                                    , random_state= 11)

In [115]:
# 분류기 생성하기
lr_clf = LogisticRegression(solver='liblinear')
knn_clf = KNeighborsClassifier(n_neighbors=7)
vo_clf = VotingClassifier(estimators=[('LR',lr_clf), ('KNN', knn_clf)]
                          , voting='soft')

In [116]:
# 로지스틱
lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
acc = accuracy_score(y_test, lr_pred)
print(confusion_matrix(y_test, lr_pred))
print(f'정확도 :{acc:.2%}')

[[34  4]
 [ 2 74]]
정확도 :94.74%


In [None]:
# 최근접이웃분석
knn_clf.fit(X_train, y_train)
knn_pred = knn_clf.predict(X_test)
acc = accuracy_score(y_test, knn_pred)
print(confusion_matrix(y_test, knn_pred))
print(f'정확도 :{acc:.2%}')

[[33  5]
 [ 0 76]]
정확도 :95.61%


In [118]:
# 분류기 학습
vo_clf.fit(X_train, y_train)
vo_pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test, vo_pred)
print(confusion_matrix(y_test, vo_pred))
print(f'정확도 :{acc:.2%}')

[[34  4]
 [ 1 75]]
정확도 :95.61%


In [119]:
# 랜덤포레스트 모델 만들기
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=0, max_depth=8)
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
acc = accuracy_score(y_test, rf_pred)
print(acc)

0.9824561403508771


랜덤 포레스트 모델의 최적의 하이퍼파라미터 조합 찾기

In [120]:
# GridSerchCV - 하이퍼파라미터 + 교차검증
from sklearn.model_selection import GridSearchCV
rf_clf_2 = RandomForestClassifier()
params = {"max_depth":[1,2,3,8],'min_samples_split':[2,3]}
grid_dtree = GridSearchCV(rf_clf_2, param_grid= params, cv=3, refit=True)
grid_dtree.fit(X_train, y_train)

In [121]:
grid_dtree.best_params_

{'max_depth': 8, 'min_samples_split': 3}

In [122]:
grid_dtree.best_estimator_

In [123]:
b_model = grid_dtree.best_estimator_
pred = b_model.predict(X_test)
accuracy_score(y_test,pred)

0.9736842105263158

In [129]:
import re
def clean_feature_name(name):
    # 특수문자 제거: 괄호, 콤마, 하이픈 등
    return re.sub('[^A-Za-z0-9_]+', '_', name)

In [130]:
def get_new_feature_name_df(old_feature_name_df):
    feature_dup_df = pd.DataFrame(data=old_feature_name_df.groupby('column_name').cumcount(),
                                  columns=['dup_cnt'])
    feature_dup_df = feature_dup_df.reset_index()
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, how='outer')
    # new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x[0]+'_'+str(x[1]) 
    #   
    # 중복 이름 처리 + 특수문자 제거
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(
        lambda x: clean_feature_name(x[0] + ('_' + str(x[1]) if x[1] > 0 else '')), axis=1
    )
    new_feature_name_df = new_feature_name_df.drop(['index'], axis=1)
    return new_feature_name_df

In [131]:
feature_name_df = pd.read_csv('./data/UCI-HAR_Dataset/features.txt', sep='\s+' ,
                              header=None, names=['column_index', 'column_name'])

new_feature_name_df = get_new_feature_name_df(feature_name_df)

X_train = pd.read_csv('./data/UCI-HAR_Dataset/train/X_train.txt')
X_test =  pd.read_csv('./data/UCI-HAR_Dataset/test/X_test.txt')

  lambda x: clean_feature_name(x[0] + ('_' + str(x[1]) if x[1] > 0 else '')), axis=1


In [134]:
import pandas as pd

def get_human_dataset( ):
  feature_name_df = pd.read_csv('./data/UCI-HAR_Dataset/features.txt',sep='\s+',
                        header=None, names=['column_index','column_name'])
  new_feature_name_df = get_new_feature_name_df(feature_name_df)
  feature_name = new_feature_name_df.iloc[:, 1].values.tolist()
  X_train = pd.read_csv('./data/UCI-HAR_Dataset/train/X_train.txt',sep='\s+', names=feature_name )
  X_test = pd.read_csv('./data/UCI-HAR_Dataset/test/X_test.txt',sep='\s+', names=feature_name)
  y_train = pd.read_csv('./data/UCI-HAR_Dataset/train/y_train.txt',sep='\s+',header=None,names=['action'])
  y_test = pd.read_csv('./data/UCI-HAR_Dataset/test/y_test.txt',sep='\s+',header=None,names=['action'])
  
  return X_train, X_test, y_train, y_test

In [135]:
X_train, X_test, y_train, y_test = get_human_dataset()

print(X_train.shape, X_test.shape)
print(y_train['action'].value_counts())

  lambda x: clean_feature_name(x[0] + ('_' + str(x[1]) if x[1] > 0 else '')), axis=1


(7352, 561) (2947, 561)
action
6    1407
5    1374
4    1286
1    1226
2    1073
3     986
Name: count, dtype: int64


In [126]:
# GridSerchCV - 하이퍼파라미터 + 교차검증
from sklearn.model_selection import GridSearchCV
rf_clf_2 = RandomForestClassifier(random_state=2024)
params = {"max_depth":[20],'min_samples_split':[30],'min_samples_leaf': [3]}
grid_dtree = GridSearchCV(rf_clf_2, param_grid= params, cv=3, refit=True)
grid_dtree.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


KeyboardInterrupt: 

In [None]:
grid_dtree.best_params_

{'max_depth': 20, 'min_samples_leaf': 3, 'min_samples_split': 30}

In [None]:
# {'max_depth': 15, 'min_samples_split': 3} random=42
b_model = grid_dtree.best_estimator_
pred = b_model.predict(X_test)
accuracy_score(y_test,pred)

0.9256871394638616

In [None]:
# {'max_depth': 20, 'min_samples_split': 30} random=42
b_model = grid_dtree.best_estimator_
pred = b_model.predict(X_test)
accuracy_score(y_test,pred)

0.9314557176789956

GBM

In [None]:
# GBM
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(random_state= 0)
gb_clf.fit(X_train, y_train)
pred = gb_clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(acc)

  y = column_or_1d(y, warn=True)


0.9392602646759416


XGBOOST

In [None]:
import xgboost

In [None]:
from sklearn.preprocessing import LabelEncoder
X_train, X_test, y_train, y_test = get_human_dataset()
X_train.shape, y_train.shape

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)  # Series → 1D array
y_test_encoded = le.transform(y_test)

from xgboost import XGBClassifier
evals = [(X_test, y_test_encoded)]
xgb = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3, use_label_encoder=False)
xgb.fit(X_train, y_train_encoded, early_stopping_rounds=40, 
        eval_set=evals, eval_metric='mlogloss', verbose=True)
xgb_pred = xgb.predict(X_test)
print(classification_report(y_test, xgb_pred))
print(accuracy_score(y_test, xgb_pred))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[0]	validation_0-mlogloss:1.58932
[1]	validation_0-mlogloss:1.43267
[2]	validation_0-mlogloss:1.30322
[3]	validation_0-mlogloss:1.19383
[4]	validation_0-mlogloss:1.10067
[5]	validation_0-mlogloss:1.01957
[6]	validation_0-mlogloss:0.94662
[7]	validation_0-mlogloss:0.88249
[8]	validation_0-mlogloss:0.82727
[9]	validation_0-mlogloss:0.77601
[10]	validation_0-mlogloss:0.72925
[11]	validation_0-mlogloss:0.68936
[12]	validation_0-mlogloss:0.65128
[13]	validation_0-mlogloss:0.61844
[14]	validation_0-mlogloss:0.58748
[15]	validation_0-mlogloss:0.55894
[16]	validation_0-mlogloss:0.53410
[17]	validation_0-mlogloss:0.50956
[18]	validation_0-mlogloss:0.48868
[19]	validation_0-mlogloss:0.46809
[20]	validation_0-mlogloss:0.45075
[21]	validation_0-mlogloss:0.43385
[22]	validation_0-mlogloss:0.41768
[23]	validation_0-mlogloss:0.40316
[24]	validation_0-mlogloss:0.38937
[25]	validation_0-mlogloss:0.37645
[26]	validation_0-mlogloss:0.36394
[27]	validation_0-mlogloss:0.35407
[28]	validation_0-mlogloss:0.3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LGBM

In [None]:
import lightgbm

In [127]:
X_train, X_test, y_train, y_test = get_human_dataset()
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)  # Series → 1D array
y_test_encoded = le.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [136]:
import numpy as np
from lightgbm import LGBMClassifier
evals = [(X_test, y_test_encoded)]
lgb = LGBMClassifier(n_estimators=400, objective='multiclass', 
                     num_class=len(np.unique(y_train_encoded)))
lgb.fit(X_train, y_train_encoded, early_stopping_rounds=40, 
        eval_set=evals, eval_metric='multi_logloss', verbose=True)
lgb_pred = lgb.predict(X_test)



[1]	valid_0's multi_logloss: 1.4404
[2]	valid_0's multi_logloss: 1.21574
[3]	valid_0's multi_logloss: 1.04795
[4]	valid_0's multi_logloss: 0.913299
[5]	valid_0's multi_logloss: 0.812686
[6]	valid_0's multi_logloss: 0.725964
[7]	valid_0's multi_logloss: 0.652995
[8]	valid_0's multi_logloss: 0.591598
[9]	valid_0's multi_logloss: 0.539383
[10]	valid_0's multi_logloss: 0.499944
[11]	valid_0's multi_logloss: 0.462273
[12]	valid_0's multi_logloss: 0.429676
[13]	valid_0's multi_logloss: 0.401908
[14]	valid_0's multi_logloss: 0.377718
[15]	valid_0's multi_logloss: 0.357455
[16]	valid_0's multi_logloss: 0.339918
[17]	valid_0's multi_logloss: 0.325799
[18]	valid_0's multi_logloss: 0.314716
[19]	valid_0's multi_logloss: 0.301914
[20]	valid_0's multi_logloss: 0.292755
[21]	valid_0's multi_logloss: 0.284754
[22]	valid_0's multi_logloss: 0.276745
[23]	valid_0's multi_logloss: 0.270387
[24]	valid_0's multi_logloss: 0.265765
[25]	valid_0's multi_logloss: 0.260089
[26]	valid_0's multi_logloss: 0.256178

In [137]:
print(classification_report(y_test_encoded, lgb_pred))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94       496
           1       0.93      0.90      0.92       471
           2       0.95      0.92      0.94       420
           3       0.92      0.82      0.87       491
           4       0.85      0.94      0.89       532
           5       1.00      1.00      1.00       537

    accuracy                           0.93      2947
   macro avg       0.93      0.92      0.93      2947
weighted avg       0.93      0.93      0.93      2947

