In [36]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from matplotlib import font_manager, rc

font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

# 월별

In [3]:
month_data = pd.read_csv('../pre_data/월별_통합_모델링용.csv', index_col=0)
month_data = month_data.drop(columns='date')
month_data

Unnamed: 0,닭고기가격,병아리(원/수),hot_day,import amount,feedprice,oil_avg,도축,소_소비자가_int,돼지_소비자가_int,미국기준금리,한국기준금리
0,3202,250,0,8754,533.6,107.066111,58873,59981,18465,0.25,3.25
2,3668,420,0,10098,529.9,112.537302,55879,56019,16634,0.75,3.25
3,4169,500,0,10836,527.9,117.745606,58750,50899,16317,0.25,3.25
4,3399,332,0,13412,526.3,113.723667,60940,53879,15882,0.25,3.25
5,3255,200,0,14519,526.2,104.184710,73505,55403,16798,0.25,3.25
...,...,...,...,...,...,...,...,...,...,...,...
117,3505,312,3,9360,492.0,69.247955,89892,99247,26078,0.25,0.75
118,3074,408,0,9009,503.0,72.975152,81026,103039,26346,0.25,0.75
119,3913,447,0,12005,508.0,82.193333,81228,110791,25977,0.25,0.75
120,2985,315,0,10025,520.0,79.858712,86819,107207,25217,0.25,1.00


In [4]:
# x: 닭고기가격을 제외한 모두
# y: 닭고기 가격

data_input = month_data.copy()
data_output = data_input.pop('닭고기가격')

In [5]:
train_input, test_input, train_target, test_target = train_test_split(data_input,data_output, test_size=0.2)

In [6]:
print(train_input.shape, train_target.shape)
print(test_input.shape, test_target.shape)

(96, 10) (96,)
(24, 10) (24,)


In [7]:
alpha_value = [0.01, 0.1, 1, 10, 100]

for alpha in alpha_value:
    ridge = Ridge(alpha=alpha)
    ridge.fit(train_input, train_target)

    scores = cross_val_score(ridge, train_input, train_target, cv=5)

    print(f"alpha : {alpha}")
    print(f"ridge train score : {ridge.score(train_input, train_target)}")
    print(f"ridge test score : {ridge.score(test_input, test_target)}")
    print('\n')

alpha : 0.01
ridge train score : 0.5826052624565632
ridge test score : 0.6422385390209688


alpha : 0.1
ridge train score : 0.5825921953472861
ridge test score : 0.6418779070551772


alpha : 1
ridge train score : 0.5819599627945523
ridge test score : 0.6387472520051757


alpha : 10
ridge train score : 0.5783339871130532
ridge test score : 0.6319977370268353


alpha : 100
ridge train score : 0.5733412132655091
ridge test score : 0.651038136920415




In [8]:
ridge.predict(test_input)

array([3828.96791408, 3617.08121085, 3665.46912357, 3331.52865838,
       3210.69382245, 2799.06516569, 3265.23269671, 3528.53937008,
       2981.58845725, 3163.22424275, 2702.6767639 , 3742.83139764,
       3617.93443542, 3165.77300713, 3449.08339365, 2610.20322397,
       2868.63960944, 2795.7473461 , 3168.50673245, 3964.22176689,
       3354.70044161, 3321.74296868, 3213.73827803, 2976.08086456])

In [9]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

ridge_reg = Ridge()
pipe = make_pipeline(RobustScaler(), ridge_reg)
scores = cross_validate(ridge_reg, data_input, data_output, cv = 5, scoring='neg_mean_squared_error',return_train_score=True)
print("MSLE: {0:.3f}".format(np.mean(-scores['test_score'])))

MSLE: 179202.434


In [10]:
print(format(scores))

{'fit_time': array([0.00198507, 0.00303698, 0.00200152, 0.00448489, 0.00099754]), 'score_time': array([0.00285745, 0.00099468, 0.00199986, 0.00228214, 0.00100183]), 'test_score': array([ -65319.16606824, -137809.88955025, -141799.83987633,
        -93758.07956036, -457325.19587644]), 'train_score': array([-81952.39408137, -69650.79233158, -62330.65059222, -76119.21910547,
       -72490.99784189])}


In [11]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

#릿지 하이퍼 파라미터 튜닝
pipeline = Pipeline([('scaler', RobustScaler()), ('ridge',Ridge())])
params={'ridge__alpha':[0.01, 0.1, 1, 10, 100]}
grid_model = GridSearchCV(pipeline, param_grid=params, scoring='neg_mean_squared_error', cv=5)
grid_model.fit(data_input, data_output)
print("MSLE: {0:.3f}".format( -1*grid_model.best_score_)) 
print('optimal hyperparameter: ', grid_model.best_params_)

MSLE: 125094.774
optimal hyperparameter:  {'ridge__alpha': 10}


In [12]:
from sklearn.preprocessing import MinMaxScaler

#릿지에 대해 하이퍼 파라미터 튜닝
pipeline = Pipeline([('scaler', MinMaxScaler()), ('ridge',Ridge())])
params={'ridge__alpha':[0.01, 0.1, 1, 10, 100]}
grid_model = GridSearchCV(pipeline, param_grid=params, scoring='neg_mean_squared_error', cv=5)
grid_model.fit(data_input, data_output)
print("MSLE: {0:.3f}".format( -1*grid_model.best_score_)) 
print('optimal hyperparameter: ', grid_model.best_params_)

MSLE: 118738.462
optimal hyperparameter:  {'ridge__alpha': 1}


# 일별

In [29]:
data_daily = pd.read_csv('../pre_data/일별_통합_모델링용.csv', index_col=0)
data_daily = data_daily.drop(columns=['date', '한국기준금리'])
data_daily

Unnamed: 0,price,병아리(원/수),oil_avg,소_소비자가_int,돼지_소비자가_int,미국기준금리
0,4047,400,76.703333,111396,25177,0.25
1,4047,400,77.850000,111396,25411,0.25
2,3895,400,77.566667,112019,26891,0.25
3,3895,400,77.130000,110865,26936,0.25
4,3743,400,76.530000,109225,26993,0.25
...,...,...,...,...,...,...
2991,3166,200,108.180000,58380,19570,0.75
2992,3118,200,108.260000,58870,19830,0.75
2993,3118,300,108.470000,61700,20300,0.75
2994,3118,300,107.000000,61030,20330,0.75


In [30]:
data_input = data_daily.copy()
data_output = data_input.pop('price')

train_input, test_input, train_target, test_target = train_test_split(data_input,data_output, test_size=0.2)

print(train_input.shape, train_target.shape)
print(test_input.shape, test_target.shape)

(2396, 5) (2396,)
(600, 5) (600,)


In [31]:
ridge = Ridge()
ridge.fit(train_input, train_target)
ridge.score(test_input, test_target)

0.3295711531664989

In [32]:
alpha_value = [0.01, 0.1, 1, 10, 100]

for alpha in alpha_value:
    ridge = Ridge(alpha=alpha)
    ridge.fit(train_input, train_target)

    scores = cross_val_score(ridge, train_input, train_target, cv=5)

    print(f"alpha : {alpha}")
    print(f"ridge train score : {ridge.score(train_input, train_target)}")
    print(f"ridge test score : {ridge.score(test_input, test_target)}")
    print('\n')

alpha : 0.01
ridge train score : 0.3805642112701991
ridge test score : 0.329574518848071


alpha : 0.1
ridge train score : 0.38056421106829863
ridge test score : 0.3295742142844782


alpha : 1
ridge train score : 0.3805641909102564
ridge test score : 0.3295711531664989


alpha : 10
ridge train score : 0.3805622066944856
ridge test score : 0.32953902272218194


alpha : 100
ridge train score : 0.38039169663480965
ridge test score : 0.3290912487651364




In [37]:
# 함수화 작업
def ridge_regression(data, target_col, plot=True):
    # 라이브러리 호출 

    # feature, target 분리
    feature = data_daily.drop(columns='price')
    target = data_daily['price']

    # 훈련, 검정셋 분리
    X_train, X_test, Y_train, Y_test = train_test_split(feature, target, test_size = 0.2, random_state = 2022)

    # 빈 리스트 셍성
    train_score = []
    test_score = []
    r_squared = []
    intercept = []
    mse_train = []
    mse_test = []
    # 하이퍼파라미터 적용
    alpha_list = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
    # 라소 회귀 요약 데이터 형성
    ridge_data = pd.DataFrame()

    for alpha in alpha_list:
        # 라소 회귀 시행
        ridge = Ridge(alpha= alpha, max_iter = 10000).fit(X_train, Y_train)
        # 결정계수 및 절편 리스트
        train_score.append(np.round(ridge.score(X_train, Y_train), 4))
        test_score.append(np.round(ridge.score(X_test, Y_test), 4))
        intercept.append(np.round(ridge.intercept_, 4))
        # 라소 회귀에서 도출된 각 feature의 계수
        ridge_data = pd.concat([ridge_data, pd.DataFrame(np.round(ridge.coef_, 4), ridge.feature_names_in_)], axis=1)
        # 훈련, 검정셋에서 y 값 예측 (기댓값 도출)
        Y_pred_train = ridge.predict(X_train)
        Y_pred = ridge.predict(X_test)
        # 훈련, 검정셋에서 MSE 도출 
        mse_train.append(np.round(mean_squared_error(Y_train,Y_pred_train),4))
        mse_test.append(np.round(mean_squared_error(Y_test,Y_pred),4))
        
        # 도식화 여부
        if plot == True:
            plt.subplot(2,1,1)
            plt.scatter(Y_train,Y_pred_train, label = '(실제값, 예측값)', alpha=.5)
            plt.plot(np.linspace(0,50,200), np.linspace(0,50,200), color = 'green', label = "45°(실제값 = 예측값)")
            plt.xlabel("실제 Price: $Y_i$")
            plt.ylabel("예측 Price : $\hat{Y}_i$")
            plt.title(f'ridge regression when a = {alpha}')
            plt.legend()
            plt.text(y=5,x=30, s=f'Train MSE 값 : {mean_squared_error(Y_train,Y_pred_train) : .4f}\nTrain Score 값 : {ridge.score(X_train,Y_train) : .4f}')
            plt.subplot(2,1,2)
            plt.scatter(Y_test,Y_pred, label = '(실제값, 예측값)', alpha=.5)
            plt.plot(np.linspace(0,50,200), np.linspace(0,50,200), color = 'green', label = "45°(실제값 = 예측값)")
            plt.xlabel("실제 Price: $Y_i$")
            plt.ylabel("예측 Price: $\hat{Y}_i$")
            plt.legend(loc='upper left')
            plt.text(y=5,x=30, s=f'Test MSE 값 : {mean_squared_error(Y_test,Y_pred) : .4f}\nTest Score 값 : {ridge.score(X_test,Y_test) : .4f}')
            plt.show()
        else:
            pass
    
    # 결정계수 출력
    plt.plot(np.log10(alpha_list), train_score, label ='$R^2$ of train')
    plt.plot(np.log10(alpha_list), test_score, label ='$R^2$ of test')
    plt.title('$R^2$ of ridge regreesion')
    plt.xlabel('log value of alpha')
    plt.ylabel('R-square')
    plt.legend()
    plt.show()
    
    # 라소 회귀 요약 데이터 형성
    ridge_data.columns = [f'alpha = {i}' for i in alpha_list]
    score = pd.DataFrame({'R^2 of Train' : train_score,
                        'R^2 of Test' : test_score,
                        'intercept' : intercept,
                        'mse_train' : mse_train,
                        'mse_test' : mse_test}).transpose()
    score.columns= [f'alpha = {i}' for i in alpha_list]
    ridge_summary = pd.concat([ridge_data, score], axis=0)

    return ridge_summary

# 데이터는 pd.DataFrame, concat을 사용하여 따로 data를 만들어 주었음
subx = pd.DataFrame(X, columns=data_daily.columns)
suby = pd.DataFrame(y, columns=data_daily['price'])

data = pd.concat([subx,suby], axis=1)

# 위의 함수 시행
ridge_regression(data,'target')

NameError: name 'X' is not defined