In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# import dataset
filename = '/kaggle/input/body-performance-data/bodyPerformance.csv'
df = pd.read_csv(filename)
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [3]:
df.select_dtypes(include=['object']).columns

Index(['gender', 'class'], dtype='object')

In [4]:
df.select_dtypes(include=['int','float']).columns

Index(['age', 'height_cm', 'weight_kg', 'body fat_%', 'diastolic', 'systolic',
       'gripForce', 'sit and bend forward_cm', 'sit-ups counts',
       'broad jump_cm'],
      dtype='object')

In [5]:
(df.isna().sum()/df.isna().count() * 100).sort_values(ascending= False).head(25)

age                        0.0
gender                     0.0
height_cm                  0.0
weight_kg                  0.0
body fat_%                 0.0
diastolic                  0.0
systolic                   0.0
gripForce                  0.0
sit and bend forward_cm    0.0
sit-ups counts             0.0
broad jump_cm              0.0
class                      0.0
dtype: float64

In [6]:
category_features = ['gender']

numeric_features = ['age', 'height_cm', 'weight_kg', 'body fat_%', 'diastolic', 'systolic',
       'gripForce', 'sit and bend forward_cm', 'sit-ups counts',
       'broad jump_cm']

target = ['class']

In [7]:
print('category feature number = ', len(category_features))
print('numeric feature number = ', len(numeric_features))

category feature number =  1
numeric feature number =  10


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=target), df[target], test_size = 0.2, random_state= 14)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [11]:
numeric_transformer = Pipeline(steps =[('imputer_numeric', SimpleImputer(missing_values=np.nan, strategy='mean')),
                                       ('scaler', StandardScaler())])

In [12]:
category_transformer = Pipeline(steps=[('imputer_category', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
                                    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [13]:
preprocessor = ColumnTransformer(
    transformers=[
        ('category_transformer', category_transformer, category_features),
        ('numeric_transfomer', numeric_transformer, numeric_features)
    ], remainder='drop'
)

In [14]:
import optuna
from sklearn import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [15]:
# pipeline_rf = make_pipeline(clone(preprocessor), RandomForestClassifier())
# pipeline_rf = make_pipeline(clone(preprocessor), XGBClassifier())
pipeline_rf = make_pipeline(clone(preprocessor), LGBMClassifier())

In [16]:
def objective(trail):
#     parameter for the random forest classfication    
#     params = {
#         'randomforestclassifier__n_estimators': trail.suggest_int('randomforestclassifier__n_estimators', 10, 20, log=True),
#         'randomforestclassifier__max_depth': trail.suggest_int("randomforestclassifier__max_depth", 3, 10, log=True) ,
#         'randomforestclassifier__max_features': trail.suggest_categorical('randomforestclassifier__max_features', ['auto', 'sqrt']),
#         'randomforestclassifier__min_samples_split': trail.suggest_int("randomforestclassifier__min_samples_split", 2, 10, log=True) ,
#         'randomforestclassifier__min_samples_leaf': trail.suggest_int("randomforestclassifier__min_samples_leaf", 1, 4, log=True),
#         'randomforestclassifier__bootstrap': trail.suggest_categorical('randomforestclassifier__bootstrap', [True, False])
#     }
    
#     params = {
#         'xgbclassifier__n_estimators': trail.suggest_int('xgbclassifier__n_estimators', 10, 20, log=True),
#         'xgbclassifier__max_depth': trail.suggest_int("xgbclassifier__max_depth", 3, 20, log=True) ,
#         'xgbclassifier__eta': trail.suggest_float('xgbclassifier__eta', 0.1, 0.3, log=True),
#         'xgbclassifier__subsample': trail.suggest_float("xgbclassifier__subsample", 0.4, 0.8, log=True) ,
#         'xgbclassifier__colsample_bytree': trail.suggest_float("xgbclassifier__colsample_bytree", 0.4, 0.8, log=True),
# #     }

    params = {
        'lgbmclassifier__learning_rate': trail.suggest_float('lgbmclassifier__learning_rate', 0.1, 1.0, log=True),
        'lgbmclassifier__boosting_type"': trail.suggest_categorical("lgbmclassifier__boosting_type", ['gbdt', 'dart', 'goss']) ,
        'lgbmclassifier__sub_feature': trail.suggest_float('lgbmclassifier__sub_feature', 0.1, 1.0, log=True),
        'lgbmclassifier__num_leaves': trail.suggest_int("lgbmclassifier__num_leaves", 10, 20, log=True)
    }
    
    model = pipeline_rf.set_params(**params)
    model.fit(X_train, y_train)
    
    
    score = cross_val_score(model, X_train, y_train, scoring = 'accuracy', n_jobs = -1, cv = 3).mean()
    
    return score

In [17]:
pipeline_rf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'columntransformer', 'lgbmclassifier', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransformer__transformer_weights', 'columntransformer__transformers', 'columntransformer__verbose', 'columntransformer__category_transformer', 'columntransformer__numeric_transfomer', 'columntransformer__category_transformer__memory', 'columntransformer__category_transformer__steps', 'columntransformer__category_transformer__verbose', 'columntransformer__category_transformer__imputer_category', 'columntransformer__category_transformer__onehot', 'columntransformer__category_transformer__imputer_category__add_indicator', 'columntransformer__category_transformer__imputer_category__copy', 'columntransformer__category_transformer__imputer_category__fill_value', 'columntransformer__category_transformer__imputer_category__missing_values', 'columntransformer__category_transformer__imputer_category__strategy', 

In [18]:
study_rfr = optuna.create_study(direction = "maximize")
study_rfr.optimize(objective, n_trials = 5)


[32m[I 2022-03-12 08:01:30,618][0m A new study created in memory with name: no-name-e7715a23-443d-4971-bea4-cdd4606d8c6d[0m
  return f(**kwargs)




[32m[I 2022-03-12 08:01:34,498][0m Trial 0 finished with value: 0.7260597139517885 and parameters: {'lgbmclassifier__learning_rate': 0.36734285611976747, 'lgbmclassifier__boosting_type': 'goss', 'lgbmclassifier__sub_feature': 0.3611947243920232, 'lgbmclassifier__num_leaves': 10}. Best is trial 0 with value: 0.7260597139517885.[0m
  return f(**kwargs)
[32m[I 2022-03-12 08:01:37,310][0m Trial 1 finished with value: 0.7243798259150561 and parameters: {'lgbmclassifier__learning_rate': 0.40524707222597, 'lgbmclassifier__boosting_type': 'goss', 'lgbmclassifier__sub_feature': 0.7795448393151112, 'lgbmclassifier__num_leaves': 16}. Best is trial 0 with value: 0.7260597139517885.[0m
  return f(**kwargs)
[32m[I 2022-03-12 08:01:38,076][0m Trial 2 finished with value: 0.4889868344485026 and parameters: {'lgbmclassifier__learning_rate': 0.5926501790241678, 'lgbmclassifier__boosting_type': 'gbdt', 'lgbmclassifier__sub_feature': 0.12220621292027992, 'lgbmclassifier__num_leaves': 12}. Best is 

In [19]:
print(study_rfr.best_trial)

FrozenTrial(number=3, values=[0.7396873888397776], datetime_start=datetime.datetime(2022, 3, 12, 8, 1, 38, 77509), datetime_complete=datetime.datetime(2022, 3, 12, 8, 1, 39, 222431), params={'lgbmclassifier__learning_rate': 0.11765661503351789, 'lgbmclassifier__boosting_type': 'goss', 'lgbmclassifier__sub_feature': 0.7225985574581698, 'lgbmclassifier__num_leaves': 13}, distributions={'lgbmclassifier__learning_rate': LogUniformDistribution(high=1.0, low=0.1), 'lgbmclassifier__boosting_type': CategoricalDistribution(choices=('gbdt', 'dart', 'goss')), 'lgbmclassifier__sub_feature': LogUniformDistribution(high=1.0, low=0.1), 'lgbmclassifier__num_leaves': IntLogUniformDistribution(high=20, low=10, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=3, state=TrialState.COMPLETE, value=None)


In [20]:
print(study_rfr.best_params)

{'lgbmclassifier__learning_rate': 0.11765661503351789, 'lgbmclassifier__boosting_type': 'goss', 'lgbmclassifier__sub_feature': 0.7225985574581698, 'lgbmclassifier__num_leaves': 13}


In [21]:
print(study_rfr.best_value)

0.7396873888397776


In [22]:
pipeline_rf.set_params(**study_rfr.best_params)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('category_transformer',
                                                  Pipeline(steps=[('imputer_category',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['gender']),
                                                 ('numeric_transfomer',
                                                  Pipeline(steps=[('imputer_numeric',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                  

In [23]:
pipeline_rf.fit(X_train, y_train)

  return f(**kwargs)


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('category_transformer',
                                                  Pipeline(steps=[('imputer_category',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['gender']),
                                                 ('numeric_transfomer',
                                                  Pipeline(steps=[('imputer_numeric',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                  

In [24]:
y_train_pred = pipeline_rf.predict(X_train)
y_test_pred = pipeline_rf.predict(X_test)

In [25]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))

[[2442  213   27    7]
 [ 470 1935  259   36]
 [ 174  410 1976   88]
 [  32  123  216 2306]]
[[532 102  23   2]
 [140 380 109  18]
 [ 64 132 461  44]
 [ 11  39  73 549]]


In [26]:
from sklearn.metrics import classification_report
print(classification_report(y_train, y_train_pred))
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           A       0.78      0.91      0.84      2689
           B       0.72      0.72      0.72      2700
           C       0.80      0.75      0.77      2648
           D       0.95      0.86      0.90      2677

    accuracy                           0.81     10714
   macro avg       0.81      0.81      0.81     10714
weighted avg       0.81      0.81      0.81     10714

              precision    recall  f1-score   support

           A       0.71      0.81      0.76       659
           B       0.58      0.59      0.58       647
           C       0.69      0.66      0.67       701
           D       0.90      0.82      0.85       672

    accuracy                           0.72      2679
   macro avg       0.72      0.72      0.72      2679
weighted avg       0.72      0.72      0.72      2679

