In [1]:
# Light GBM
# LightGBM is a gradient-boosting framework based on decision trees that is designed to improve training efficiency
# and reduce memory usage.
# It uses two novel techniques:

# Gradient-based One Side Sampling(GOSS) 
# Exclusive Feature Bundling (EFB)
# These techniques address the limitations of the histogram-based algorithm that is primarily used in all GBDT
# (Gradient Boosting Decision Tree) frameworks. Together, the two techniques, GOSS and EFB, form
# the defining characteristics of the LightGBM algorithm: they make the model work efficiently
# and give it a cutting edge over other GBDT frameworks.

In [2]:
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score, ShuffleSplit
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import BaggingRegressor

from warnings import filterwarnings

from sklearn.ensemble import RandomForestRegressor

# Load the dataset and build the feature matrix / target used by every model below.
# The CSV location can be overridden with the TEAM_BASKETBALL_CSV environment variable so the
# notebook is not tied to one machine; when the variable is unset the original path is used.
import os

csv_path = os.environ.get(
    "TEAM_BASKETBALL_CSV",
    "C:/Users/Derya/Downloads/Team_Basketball.csv",
)
data = pd.read_csv(csv_path)

# Work on a copy so `data` keeps the raw frame; rows with any missing value are dropped.
df = data.copy()
df = df.dropna()

# One-hot encode the three categorical columns.
dms = pd.get_dummies(df[["League", "Division", "NewLeague"]])

# Target and numeric predictors (categoricals are removed here and re-added as dummies below).
y = df["Salary"]
X_ = df.drop(["Salary", "League", "Division", "NewLeague"], axis=1).astype("float64")

# Keep a single indicator column per categorical variable.
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis=1)

# 75/25 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [4]:
from lightgbm import LGBMRegressor

In [5]:
# Baseline model: LightGBM regressor with library-default hyper-parameters,
# fitted on the training split.
lgbm = LGBMRegressor()
lgbm_model = lgbm.fit(X=X_train, y=y_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 831
[LightGBM] [Info] Number of data points in the train set: 197, number of used features: 19
[LightGBM] [Info] Start training from score 543.483442


found 0 physical cores < 1
  File "C:\Users\Derya\anaconda3\lib\site-packages\joblib\externals\loky\backend\context.py", line 245, in _count_physical_cores
    raise ValueError(


In [6]:
# Hold-out predictions from the baseline model, limited to the iteration count
# reported by the fitted model's `best_iteration_` attribute.
y_pred = lgbm_model.predict(X=X_test, num_iteration=lgbm_model.best_iteration_)

In [7]:
# Test-set RMSE of the baseline model (square root of the mean squared error).
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

363.8712087611089

In [8]:
# Hyper-parameter search space for the LightGBM grid search:
# 5 * 4 * 6 * 8 = 960 parameter combinations.
lgbm_grid = dict(
    colsample_bytree=[0.4, 0.5, 0.6, 0.9, 1],
    learning_rate=[0.01, 0.1, 0.5, 1],
    n_estimators=[20, 40, 100, 200, 500, 1000],
    max_depth=list(range(1, 9)),
)

In [9]:
# Exhaustive search over `lgbm_grid` with 10-fold cross-validation.
# n_jobs=-1 asks for all available processors; verbose=2 prints per-fit progress.
lgbm = LGBMRegressor()
lgbm_cv_model = GridSearchCV(
    estimator=lgbm,
    param_grid=lgbm_grid,
    cv=10,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the grid search on the training split (960 candidates x 10 folds = 9600 fits).
lgbm_cv_model.fit(X=X_train, y=y_train)

Fitting 10 folds for each of 960 candidates, totalling 9600 fits


In [None]:
# Final model: refit LightGBM with the hyper-parameters chosen by the grid search.
# When `lgbm_cv_model` has been fitted, its `best_params_` are used so this cell always
# reflects the actual search result. If the search was not run to completion in this
# kernel (no `best_params_` attribute yet), fall back to the previously recorded values.
fallback_params = {
    "learning_rate": 0.1,
    "max_depth": 7,
    "n_estimators": 40,
    "colsample_bytree": 0.6,
}
tuned_params = getattr(lgbm_cv_model, "best_params_", fallback_params)

lgbm_tuned = LGBMRegressor(**tuned_params)
lgbm_tuned = lgbm_tuned.fit(X_train, y_train)

In [None]:
# Hold-out predictions from the tuned model (overwrites the baseline `y_pred`).
y_pred = lgbm_tuned.predict(X=X_test)

In [None]:
# Test-set RMSE of the tuned model (square root of the mean squared error).
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))