In [17]:
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
import xgboost as xg
import lightgbm as lg
import pandas as pd
import joblib

In [3]:
df=pd.read_csv("Multiclass Diabetes Dataset.csv")
df.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,Class
0,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
1,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0
2,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0
3,0,45,2.3,24,4.0,2.9,1.0,1.0,1.5,0.4,21.0,0
4,0,50,2.0,50,4.0,3.6,1.3,0.9,2.1,0.6,24.0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Gender  264 non-null    int64  
 1   AGE     264 non-null    int64  
 2   Urea    264 non-null    float64
 3   Cr      264 non-null    int64  
 4   HbA1c   264 non-null    float64
 5   Chol    264 non-null    float64
 6   TG      264 non-null    float64
 7   HDL     264 non-null    float64
 8   LDL     264 non-null    float64
 9   VLDL    264 non-null    float64
 10  BMI     264 non-null    float64
 11  Class   264 non-null    int64  
dtypes: float64(8), int64(4)
memory usage: 24.9 KB


In [5]:
params_grid = [
    {
        "model": OneVsRestClassifier(GaussianNB()),  
        "params": {
            "estimator__var_smoothing": [1e-11, 1e-10, 1e-9, 1e-8, 1e-7],
            "estimator__priors": [None]
        }
    },
    
    {
        "model": OneVsRestClassifier(LogisticRegression()), 
        "params": {
            "estimator__penalty": ["l2"],
            "estimator__tol": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
            "estimator__C": [ 0.8, 0.6, 0.4, 0.2],
            "estimator__fit_intercept": [True, False],
            "estimator__max_iter": [100, 200, 300,400,500],
            "estimator__solver": ['lbfgs', 'newton-cg', 'newton-cholesky'],
        }
    },
    
    {
        "model": OneVsRestClassifier(SGDClassifier()), 
        "params": {
            "estimator__penalty": ['elasticnet', 'l1', 'l2'],
            "estimator__alpha": [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
            "estimator__fit_intercept": [True, False],
            "estimator__max_iter": [500, 1000, 1500],
            "estimator__tol": [1e-2, 1e-3],
            "estimator__learning_rate": ['adaptive', 'constant', 'invscaling', 'optimal'],
            "estimator__eta0": [0.2, 0.4, 0.6, 0.8],
            "estimator__early_stopping": [True, False]
        }
    },
    
    {
        "model": OneVsRestClassifier(SVC()),  
        "params": {
            "estimator__max_iter":  [-1, 1000, 2000] ,
            "estimator__tol": [ 1e-3, 1e-4],
            "estimator__C": [1.0, 0.8, 0.6, 0.4, 0.2],
            "estimator__gamma": ['scale', 'auto']
        }
    },

    {
        "model": DecisionTreeClassifier(),  
        "params": {
            "criterion": ["gini", "entropy", "log_loss"],
            "splitter": ["best", "random"],
            "max_depth": [None, 1, 5, 10, 15, 20, 25, 30],
            "max_features": [None, 1, 2, 3, 4, 5]
        }
    },
    
    {
        "model": AdaBoostClassifier(),  
        "params": {
            "n_estimators": [50, 55, 60, 65, 70, 75],  
            "learning_rate": [1, 0.8, 0.6, 0.4, 0.2]
        }
    },
    
    {
        "model": GradientBoostingClassifier(), 
        "params": {
            "learning_rate": [0.1, 0.3, 0.5, 0.7, 0.9],
            "n_estimators": [100, 110, 120, 130, 140, 150],
            "max_depth":  [1, 5, 10, 15, 20, 25, 30], 
            "criterion": ['friedman_mse', 'squared_error'],
            "tol": [ 1e-3, 1e-4],
            "subsample": [1.0, 0.8, 0.6, 0.4, 0.2]
        }
    },
    
    {
        "model": xg.XGBClassifier(), 
        "params": {
            "n_estimators": [10, 20, 30, 40, 50],
            "max_depth": [3, 5, 7, 9],
            "learning_rate": [1e-4, 1e-3, 1e-2, 0.1, 1.0]
        }
    },
    
    {
        "model": lg.LGBMClassifier(), 
        "params": {
            "n_estimators": [10, 20, 30, 40, 50],
            "num_leaves": [5, 10, 15, 20, 25],
            "learning_rate": [1e-4, 1e-3, 1e-2,1e-5, 1]
        }
    }
]


In [6]:
x=df[['Gender', 'AGE', 'Urea', 'Cr', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL',
       'VLDL', 'BMI']]
y=df["Class"]

In [7]:
ss=StandardScaler()
x=ss.fit_transform(x)
x

array([[-1.09544512,  0.04721685, -0.24316765, ..., -1.13282337,
        -0.31647483, -0.51669126],
       [ 0.91287093, -2.32711614, -0.29322711, ..., -0.431615  ,
        -0.28415399, -0.71338694],
       [ 0.91287093, -1.63460235,  0.35754594, ..., -0.53178762,
        -0.34879566, -1.10677832],
       ...,
       [ 0.91287093,  1.03652226,  3.78661933, ..., -1.63368649,
        -0.31647483,  1.25356992],
       [ 0.91287093,  0.6408001 ,  3.78661933, ..., -0.0309245 ,
        -0.18719149,  1.6469613 ],
       [ 0.91287093,  0.83866118,  3.78661933, ...,  1.77218273,
        -0.05790816,  1.25356992]], shape=(264, 11))

In [8]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [11]:
res_list=[]
for line in params_grid:
    gs=RandomizedSearchCV(estimator=line["model"],cv=5,scoring="accuracy",return_train_score=True,param_distributions=line["params"],error_score='raise')
    res=gs.fit(x_train,y_train)
    res_list.append({
                    "best_estimator":res.best_estimator_,
                    "best_params":res.best_params_,
                    "best_score":res.best_score_,
                    "train_scores": res.cv_results_['mean_train_score'], 
                    "test_scores": res.cv_results_['mean_test_score'],
                    "y_pred": gs.predict(x_test)    
    })        
    
    



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 310
[LightGBM] [Info] Number of data points in the train set: 168, number of used features: 11
[LightGBM] [Info] Start training from score -0.980829
[LightGBM] [Info] Start training from score -1.905088
[LightGBM] [Info] Start training from score -0.741937
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000027 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 310
[LightGBM] [Info] Number of data points in the train set: 169, number of used features: 11
[LightGBM] [Info] Start training from score -1.002764
[LightGBM] [Info] Start training from score -1.871802
[LightGBM] [Info] Start training from score -0.735450
[LightGBM] [Info] Auto-choosing col-wise



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000028 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 310
[LightGBM] [Info] Number of data points in the train set: 169, number of used features: 11
[LightGBM] [Info] Start training from score -1.002764
[LightGBM] [Info] Start training from score -1.871802
[LightGBM] [Info] Start training from score -0.735450
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 312
[LightGBM] [Info] Number of data points in the train set: 169, number of used features: 11
[LightGBM] [Info] Start training from score -1.002764
[LightGBM] [Info] Start training from score -1.871802
[LightGBM] [Info] Start training from score -0.735450
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000032 seconds.




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307
[LightGBM] [Info] Number of data points in the train set: 169, number of used features: 11
[LightGBM] [Info] Start training from score -1.002764
[LightGBM] [Info] Start training from score -1.871802
[LightGBM] [Info] Start training from score -0.735450
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 311
[LightGBM] [Info] Number of data points in the train set: 169, number of used features: 11
[LightGBM] [Info] Start training from score -0.986764
[LightGBM] [Info] Start training from score -1.911023
[LightGBM] [Info] Start training from score -0.735450
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000028 seconds.




In [14]:
max=res_list[0]["best_score"]
pos_max=0
for row in range(1,len(res_list)):
    if (max<res_list[row]["best_score"]) and (res_list[row]["best_score"]<98):
        max=res_list[row]["best_score"]
        pos_max=row


In [15]:
print("model:",res_list[pos_max]["best_estimator"])
print("params:",res_list[pos_max]["best_params"])
print("best_score:",max)
print("best_estimator:",res_list[pos_max]["best_estimator"])
print("train_scores:",res_list[pos_max]["train_scores"][0])
print("test_scores:",res_list[pos_max]["test_scores"][0])
print("y_pred:",res_list[pos_max]["y_pred"])

model: LGBMClassifier(learning_rate=1, n_estimators=40, num_leaves=10)
params: {'num_leaves': 10, 'n_estimators': 40, 'learning_rate': 1}
best_score: 0.9765227021040974
best_estimator: LGBMClassifier(learning_rate=1, n_estimators=40, num_leaves=10)
train_scores: 0.47867004790081713
test_scores: 0.4786267995570321
y_pred: [0 2 0 2 1 1 1 2 0 2 2 2 2 0 2 1 2 2 0 2 0 0 0 0 2 1 0 2 2 1 2 1 2 2 0 0 2
 0 0 2 0 2 2 0 1 2 2 2 0 2 2 0 0]


In [18]:
joblib.dump(res_list[pos_max]["best_estimator"],"model.pkl")

['model.pkl']