In [16]:
# data_3_lightGBM을 활용해 선형회귀, 릿지회귀, 라쏘회귀 수행
# 수행결과, 트리기반 모델에 비해 낮은 평가지표를 보임
# 모델 자체의 성능이 아니라 전처리된 데이터 셋이 해당 모델이 적합하지 않은것으로 판단
# 선형 회귀의 경우, 변수간의 독립성이 필요하지만, 트리기반 모델을 사용하면서 이를 고려치 않고 피쳐 생성
# 실제로, 변수 선택이 가능한 라쏘의 경우, 선형회귀나 릿지에 비해 상당히 평가지표가 개선된 것을 확인 가능
# PCA 분석으로 주성분을 결정해 동일 모델로 재수행한다면 더 좋은 결과를 예상

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse

In [2]:
data = pd.read_pickle('./data/data_3.pkl')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8029111 entries, 827196 to 8856306
Data columns (total 29 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   month_id                    int8   
 1   shop_id                     int8   
 2   item_id                     int16  
 3   cnt_month                   float32
 4   city_id                     int8   
 5   cat_id                      int8   
 6   main_cat                    int8   
 7   sub_cat                     int8   
 8   year                        int16  
 9   month                       int8   
 10  sales_month                 int8   
 11  cnt_month_1                 float32
 12  cnt_month_2                 float32
 13  cnt_month_3                 float32
 14  item_id_mean_cnt_1          float32
 15  item_id_mean_cnt_2          float32
 16  item_id_mean_cnt_3          float32
 17  sub_cat_mean_cnt_1          float32
 18  sub_cat_mean_cnt_2          float32
 19  sub_cat_mean_cnt

In [4]:
data[['cnt_month','cnt_month_1','cnt_month_2','cnt_month_3']]=data[['cnt_month','cnt_month_1','cnt_month_2','cnt_month_3']].clip(0,20)

In [5]:
# train/validation/test data split
X_train = data[data['month_id'] < 33].drop(['cnt_month'], axis=1)
y_train = data[data['month_id'] < 33]['cnt_month']

X_valid = data[data['month_id'] == 33].drop(['cnt_month'], axis=1)
y_valid = data[data['month_id'] == 33]['cnt_month']

X_test = data[data['month_id'] == 34].drop(['cnt_month'], axis=1)

In [9]:
# 선형회귀 모델을 초기화
lModel = LinearRegression()

# 모델을 학습시킨다.
y_train_log = np.log1p(y_train) #log +1 
y_valid_log = np.log1p(y_valid) #log +1 

lModel.fit(X_train, y_train_log)

# 예측하고 정확도를 평가한다.
preds = lModel.predict(X_valid)
print ("RMSE Value For Linear Regression: ",
       np.sqrt(mse(np.exp(y_valid_log),np.exp(preds))))

RMSE Value For Linear Regression:  1.3752245


In [12]:
ridge_m_ = Ridge(normalize=True)
ridge_params_ = { 'max_iter':[3000],'alpha':[0.01, 0.1, 1, 2, 3, 4, 10, 30,100,200,300,400,800,900,1000]}
grid_ridge_m = GridSearchCV( ridge_m_,
                          ridge_params_,
                          scoring ='neg_root_mean_squared_error',
                          cv=5)

y_train_log = np.log1p(y_train)
y_valid_log = np.log1p(y_valid) #log +1 

grid_ridge_m.fit( X_train, y_train_log )
preds = grid_ridge_m.predict(X_valid)
print (grid_ridge_m.best_params_)
print ("RMSE Value For Ridge Regression: ", np.sqrt(mse(np.exp(y_valid_log),np.exp(preds))))

df = pd.DataFrame(grid_ridge_m.cv_results_)
df.head()

{'alpha': 0.01, 'max_iter': 3000}
RMSE Value For Ridge Regression:  1.2330824


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_max_iter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.666357,0.11527,0.157903,0.016177,0.01,3000,"{'alpha': 0.01, 'max_iter': 3000}",-0.319907,-0.33372,-0.308792,-0.331414,-0.307177,-0.320202,0.011031,1
1,2.766894,0.186152,0.153905,0.007123,0.1,3000,"{'alpha': 0.1, 'max_iter': 3000}",-0.320821,-0.334402,-0.309202,-0.332085,-0.306891,-0.32068,0.011316,2
2,2.406317,0.194428,0.147909,0.005212,1.0,3000,"{'alpha': 1, 'max_iter': 3000}",-0.331717,-0.346895,-0.3188,-0.344515,-0.313837,-0.331153,0.01326,3
3,2.310176,0.089941,0.151307,0.002798,2.0,3000,"{'alpha': 2, 'max_iter': 3000}",-0.340535,-0.357129,-0.327796,-0.354344,-0.321016,-0.340164,0.014203,4
4,2.74191,0.24146,0.168497,0.019426,3.0,3000,"{'alpha': 3, 'max_iter': 3000}",-0.346602,-0.364047,-0.334121,-0.360993,-0.32627,-0.346407,0.0147,5


In [15]:
lasso_m_ = Lasso(normalize=True)

alpha  = 1/np.array([0.1, 1, 2, 3, 4, 10, 30,100,200,300,400,800,900,1000])
lasso_params_ = { 'max_iter':[3000],'alpha':alpha}

grid_lasso_m = GridSearchCV( lasso_m_,lasso_params_,scoring = 'neg_root_mean_squared_error',cv=5)
y_train_log = np.log1p(y_train)
y_valid_log = np.log1p(y_valid) #log +1 

grid_lasso_m.fit( X_train , y_train_log )
preds = grid_lasso_m.predict(X_valid)
print (grid_lasso_m.best_params_)
print ("RMSE Value For Lasso Regression: ", np.sqrt(mse(np.exp(y_valid_log),np.exp(preds))))

{'alpha': 10.0, 'max_iter': 3000}
RMSE Value For Lasso Regression:  1.084341
