In [1]:
!conda install -c conda-forge lightgbm

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - lightgbm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    lightgbm-2.3.1             |   py37he1b5a44_0         1.0 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.0 MB

The following NEW packages will be INSTALLED:

  lightgbm           conda-forge/linux-64::lightgbm-2.3.1-py37he1b5a44_0


Proceed ([y]/n)? ^C

CondaSystemExit: 
Operation aborted.  Exiting.



In [17]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from lightgbm import LGBMClassifier

In [18]:
#import and split the data

In [19]:
diabetes = pd.read_csv("../input/diabetes/diabetes.csv")
df = diabetes.copy()
df = df.dropna()
y = df["Outcome"]
X = df.drop(['Outcome'], axis=1)
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.30, 
                                                    random_state=42)

In [5]:
#set and fit the model

In [6]:
lgbm_model=LGBMClassifier().fit(X_train,y_train)

In [7]:
#test error without tuning

In [8]:
y_pred=lgbm_model.predict(X_test)
accuracy_score(y_pred,y_test)

0.7272727272727273

In [9]:
#model tuning

In [10]:
#important parameters
#n_estimators
#subsample
#max_depth
#learning_rate
#min_child_samples  : minimum number of data needed in leaf node

In [11]:
lgbm_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_child_samples": [5,10,20]}

In [12]:
lgbm_cv= GridSearchCV(lgbm_model,lgbm_params, cv=10,n_jobs=-1, verbose=2)
lgbm_cv.fit(X_train, y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 222 tasks      | elapsed:   17.8s
[Parallel(n_jobs=-1)]: Done 493 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done 860 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1337 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 1976 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 2529 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 3244 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 4065 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done 5046 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 5760 out of 5760 | elapsed: 11.7min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31, objective=None,
                                      random_state=None, reg_alpha=0.0,
                                      reg_lambda=0.0, silent=True,
                                      subsample=1.0, subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.01, 0.02, 0.05],
                         'ma

In [20]:
lgbm_cv.best_params_

{'learning_rate': 0.01,
 'max_depth': 3,
 'min_child_samples': 20,
 'n_estimators': 500,
 'subsample': 0.6}

In [14]:
lgbm_tuned_model= LGBMClassifier( learning_rate= 0.01,
 max_depth= 3,
 min_child_samples=20,
 n_estimators=500,
 subsample=0.6). fit(X_train,y_train)

In [15]:
y_pred=lgbm_tuned_model.predict(X_test)
accuracy_score(y_pred,y_test)

0.7445887445887446

In [16]:
# We found 0.766 by Logistic Regression
#          0.775 by Naive Bayes 
#          0.731 by KNN
#          0.744 by Linear SVC
#          0.735 by Nonlinear SVC Steps
#          0.735  by ANN
#          0.753 by CART
#          0.735 by Random Forest
#          0.735 by GBM
#          0.757 by XG Boost
#And now,  0.744 by Light GBM