In [1]:
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,KFold, RandomizedSearchCV,StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.preprocessing import StandardScaler
from pycaret.classification import *
#import xgboost as xgb
from xgboost import XGBClassifier,XGBRFClassifier
from tune_sklearn import TuneSearchCV,TuneGridSearchCV

from data_prepare_func import convert_to_array

In [2]:
x_kit,y_kit = convert_to_array("data_fr_kittinan/",size=28)
x_diy,y_diy = convert_to_array('data_writing_diy/',size=28)

In [3]:
x = np.append(x_kit,x_diy,axis=0)

y = np.append(y_kit,y_diy,axis=0)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42,stratify=y)

In [5]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [6]:
xgb = XGBClassifier()
xgbrf = XGBRFClassifier()

#### XGB

In [10]:
xgb = XGBClassifier(objective="multi:softprob", random_state=59)
xgb.fit(x_train,y_train)

y_pred = xgb.predict(x_train)
print(confusion_matrix(y_train, y_pred))

[[185   0   0   0   0   0   0   0   0   0]
 [  0 185   0   0   0   0   0   0   0   0]
 [  0   0 187   0   0   0   0   0   0   0]
 [  0   0   0 187   0   0   1   0   0   0]
 [  0   0   0   1 183   0   1   0   0   0]
 [  0   0   0   1   0 184   0   0   0   0]
 [  0   0   0   0   0   0 185   0   0   0]
 [  0   0   0   0   0   0   0 185   0   0]
 [  0   0   0   0   0   0   1   0 183   0]
 [  0   0   0   0   1   0   0   0   0 189]]


In [6]:
xgb.get_params()

{'objective': 'multi:softprob',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': 59,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [7]:
xgb = XGBClassifier(n_estimators=100)
scores = cross_val_score(xgb, x_train, y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.8125     0.83522727 0.85795455 0.90285714 0.86857143 0.86857143
 0.82285714 0.86857143 0.87428571 0.94285714]
Mean: 0.8654253246753246
Standard Deviation: 0.036174442458010554


In [11]:
xgb.fit(x_train,y_train)

train_yhat = xgb.predict(x_train)
train_accuracy = accuracy_score(train_yhat,y_train)

test_yhat = xgb.predict(x_test)
test_accuracy = accuracy_score(test_yhat,y_test)

print('Train Score\n',classification_report(y_train,train_yhat))
print('Test Score\n',classification_report(y_test,test_yhat))
print(f"Accuracy Score Train: {train_accuracy}\nAccuracy Score Test: {test_accuracy}")

Train Score
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       185
           1       1.00      1.00      1.00       185
           2       1.00      1.00      1.00       187
           3       0.99      0.99      0.99       188
           4       0.99      0.99      0.99       185
           5       1.00      0.99      1.00       185
           6       0.98      1.00      0.99       185
           7       1.00      1.00      1.00       185
           8       1.00      0.99      1.00       184
           9       1.00      0.99      1.00       190

    accuracy                           1.00      1859
   macro avg       1.00      1.00      1.00      1859
weighted avg       1.00      1.00      1.00      1859

Test Score
               precision    recall  f1-score   support

           0       0.97      0.76      0.85        79
           1       0.81      0.94      0.87        80
           2       0.90      0.70      0.79        80

In [9]:
parameters = {
    'max_depth' : [ i for i in range(1,11,1)],
}
tune = GridSearchCV(XGBRFClassifier(),cv=5,param_grid=parameters,n_jobs=-1,scoring='accuracy')

In [10]:
tune.fit(x_train, y_train)
print(tune.best_params_)

{'max_depth': 10}


In [11]:
parameters = {
    'learning_rate': [0.01,0.05,0.1,0.2],
}
tune = GridSearchCV(XGBRFClassifier(),cv=5,param_grid=parameters,n_jobs=-1,scoring='accuracy')

In [12]:
tune.fit(x_train, y_train)
print(tune.best_params_)

{'learning_rate': 0.01}


In [13]:
parameters = {
    'gamma': [0, 0.5, 1],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [0.5, 1, 5],
}
tune = GridSearchCV(XGBRFClassifier(),cv=5,param_grid=parameters,n_jobs=-1,scoring='accuracy')

In [14]:
tune.fit(x_train, y_train)
print(tune.best_params_)

{'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 0.5}


In [15]:
parameters = {
    'subsample' : [0.5,0.7,0.9],
    'colsample_bytree': [0.5,0.7,0.9],
}
tune = GridSearchCV(XGBRFClassifier(),cv=5,param_grid=parameters,n_jobs=-1,scoring='accuracy')

In [16]:
tune.fit(x_train, y_train)
print(tune.best_params_)

{'colsample_bytree': 0.5, 'subsample': 0.7}


In [17]:
parameters = {
    'subsample' : [0.5,0.7,0.9],
    'colsample_bytree': [0.5,0.7,0.9],
    'tree_method': ['approx','hist','gpu_hist']
}
tune = GridSearchCV(XGBRFClassifier(),cv=5,param_grid=parameters,n_jobs=-1,scoring='accuracy')

In [18]:
tune.fit(x_train, y_train)
print(tune.best_params_)

{'colsample_bytree': 0.5, 'subsample': 0.9, 'tree_method': 'hist'}


In [32]:
xgb = XGBClassifier(max_depth=4,learning_rate=0.001,gamma=0,reg_alpha=1,reg_lambda=1,colsample_bytree=0.5,
        subsample=0.7,tree_method='hist',n_estimators=140,eval_metric='merror',
        colsample_bylevel=0.5,colsample_bynode=0.5)

In [33]:
xgb.fit(x_train,y_train)

train_yhat = xgb.predict(x_train)
train_accuracy = accuracy_score(train_yhat,y_train)

test_yhat = xgb.predict(x_test)
test_accuracy = accuracy_score(test_yhat,y_test)

print('Train Score\n',classification_report(y_train,train_yhat))
print('Test Score\n',classification_report(y_test,test_yhat))
print(f"Accuracy Score Train: {train_accuracy}\nAccuracy Score Test: {test_accuracy}")

Train Score
               precision    recall  f1-score   support

           0       0.91      0.94      0.92       185
           1       0.98      0.87      0.92       185
           2       0.94      0.82      0.88       187
           3       0.90      0.93      0.91       188
           4       0.82      0.63      0.71       185
           5       0.69      0.82      0.75       185
           6       0.90      0.82      0.86       185
           7       0.82      0.82      0.82       185
           8       0.79      0.65      0.72       184
           9       0.60      0.89      0.72       190

    accuracy                           0.82      1859
   macro avg       0.84      0.82      0.82      1859
weighted avg       0.84      0.82      0.82      1859

Test Score
               precision    recall  f1-score   support

           0       0.89      0.85      0.87        79
           1       0.89      0.90      0.89        80
           2       0.82      0.51      0.63        80

In [34]:
scores = cross_val_score(xgb, x_train, y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.68817204 0.66129032 0.71505376 0.66666667 0.69892473 0.70430108
 0.70967742 0.71505376 0.66666667 0.68648649]
Mean: 0.691229293809939
Standard Deviation: 0.019597100744145397
