# 決定係数，調整済み決定係数

* 説明変数が1つの単回帰分析（切片あり・最小二乗法）では，説明変数と目的変数の相関係数を二乗した値が決定係数 $R^2$ に一致する

In [6]:
from sklearn.metrics import r2_score

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import lightgbm as lgbm
from sklearn.datasets import fetch_california_housing
import pandas as pd

In [3]:
# Load the California housing dataset and wrap it in pandas objects:
# the feature matrix becomes a DataFrame with named columns, and the
# regression target becomes a Series named 'MedHouseVal'.
california = fetch_california_housing()
df = pd.DataFrame(
    data=california.data,
    columns=california.feature_names,
)
target = pd.Series(data=california.target, name='MedHouseVal')

In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.2,
    random_state=3655,
)

In [8]:
def adjusted_r2_score(r2, n_samples, n_features):
    """Return the degrees-of-freedom adjusted coefficient of determination.

    Computes ``1 - (1 - r2) * (n_samples - 1) / (n_samples - n_features - 1)``.

    Parameters
    ----------
    r2 : float
        Plain coefficient of determination.
    n_samples : int
        Number of samples the score was computed on.
    n_features : int
        Number of explanatory variables used by the model.

    Returns
    -------
    float
        The adjusted R-squared value.

    Raises
    ------
    ValueError
        If ``n_samples - n_features - 1`` is not positive. The formula would
        otherwise divide by zero or silently yield a meaningless value.
    """
    dof = n_samples - n_features - 1
    if dof <= 0:
        raise ValueError(
            'adjusted R-squared requires n_samples > n_features + 1 '
            f'(got n_samples={n_samples}, n_features={n_features})'
        )
    # Same evaluation order as the plain inline formula, so the result is
    # numerically identical to computing it directly.
    return 1 - (1 - r2) * (n_samples - 1) / dof


# Fit an ordinary linear regression and predict on the held-out test set
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Coefficient of determination
r2 = r2_score(y_test, y_pred)
print('R-squared: ', r2)

# Degrees-of-freedom adjusted coefficient of determination.
# Note: both counts are taken from the test split, because r2 above is
# evaluated on the test split.
n = X_test.shape[0]  # number of samples in the test data
p = X_test.shape[1]  # number of features in the test data

adjusted_r2 = adjusted_r2_score(r2, n, p)
print('Adjusted R-squared: ', adjusted_r2)

R-squared:  0.6144486452645868
Adjusted R-squared:  0.6136998201036536


In [9]:
# Train a LightGBM regressor on the training split and predict on the
# held-out test set.
params = dict(objective='regression', metric='rmse')
train_data = lgbm.Dataset(X_train, label=y_train)
model = lgbm.train(params, train_data)
y_pred = model.predict(X_test)

# Coefficient of determination
r2 = r2_score(y_test, y_pred)
print('R-squared: ', r2)

# Degrees-of-freedom adjusted coefficient of determination:
# n = number of test samples, p = number of test features.
n, p = X_test.shape

adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
print('Adjusted R-squared: ', adjusted_r2)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 8
[LightGBM] [Info] Start training from score 2.064059
R-squared:  0.8402831355080453
Adjusted R-squared:  0.839972930381574
