In [62]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
# NOTE(review): load_boston was removed in scikit-learn 1.2 and is not used below.
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [46]:
# Load the e-commerce dataset; the path is relative to the notebook's working directory.
ecommerce = pd.read_csv('../pandas-numpy-practices/ecommerce.csv')

In [47]:
# Preview the first rows of the raw frame to see which columns are present.
ecommerce.head()

Unnamed: 0,Email,Address,Avatar,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,mstephenson@fernandez.com,"835 Frank Tunnel\nWrightmouth, MI 82180-9605",Violet,34.497268,12.655651,39.577668,4.082621,587.951054
1,hduke@hotmail.com,"4547 Archer Common\nDiazchester, CA 06566-8576",DarkGreen,31.926272,11.109461,37.268959,2.664034,392.204933
2,pallen@yahoo.com,"24645 Valerie Unions Suite 582\nCobbborough, D...",Bisque,33.000915,11.330278,37.110597,4.104543,487.547505
3,riverarebecca@gmail.com,"1414 David Throughway\nPort Jason, OH 22070-1220",SaddleBrown,34.305557,13.717514,36.721283,3.120179,581.852344
4,mstephens@davidson-herman.com,"14023 Rodriguez Passage\nPort Jacobville, PR 3...",MediumAquaMarine,33.330673,12.795189,37.536653,4.446308,599.406092


In [48]:
# Remove the non-numeric identifier/free-text columns (these also hold personal data)
# so only numeric columns remain for modelling.
# errors='ignore' keeps this cell idempotent: re-running it after the columns are
# already gone no longer raises a KeyError.
ecommerce = ecommerce.drop(columns=['Email', 'Address', 'Avatar'], errors='ignore')

In [49]:
# Preview the frame again after the column drop.
ecommerce.head()

Unnamed: 0,Avg. Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,34.497268,12.655651,39.577668,4.082621,587.951054
1,31.926272,11.109461,37.268959,2.664034,392.204933
2,33.000915,11.330278,37.110597,4.104543,487.547505
3,34.305557,13.717514,36.721283,3.120179,581.852344
4,33.330673,12.795189,37.536653,4.446308,599.406092


In [12]:
# Select the predictors explicitly by name instead of slicing "everything up to
# 'Length of Membership'": an open-ended label slice depends on column order and
# would silently pull in any extra leading column (e.g. a persisted index column).
feature_columns = [
    'Avg. Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
X_data = ecommerce[feature_columns]

# Regression target.
y_target = ecommerce['Yearly Amount Spent']

In [51]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.25,
    random_state=156,
)

In [54]:
# Sanity check: dimensions of the training features and target.
X_train.shape, y_train.shape

((375, 4), (375,))

In [56]:
# Sanity check: dimensions of the held-out features and target.
X_test.shape, y_test.shape

((125, 4), (125,))

In [90]:
# Baseline: ordinary least squares fitted on the training split.
lr = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score them with MSE and its square root (RMSE).
y_pred = lr.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

(mse, rmse)

(106.71220615117356, 10.330160025438792)

In [93]:
# Inspect the fitted coefficients, one per feature column.
lr.coef_

array([25.43111192, 38.56299836,  0.44094951, 61.86205426])

In [94]:
# Linear regression on standardized features.
# The `normalize` argument of LinearRegression was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the scaling is expressed as an explicit pipeline step instead.
# Scaling does not change the predictions of an unpenalized least-squares fit, so the
# metrics below are expected to match the unscaled baseline up to floating-point noise.
lr = make_pipeline(StandardScaler(), LinearRegression())
lr.fit(X_train, y_train)  # scaler statistics and model are learned from the training split only
y_pred = lr.predict(X_test)  # the fitted scaler is applied to the test split before predicting
mse = mean_squared_error(y_test, y_pred)  # model evaluation: MSE
rmse = np.sqrt(mse)  # model evaluation: RMSE

mse, rmse

(106.7122061511738, 10.330160025438802)

In [137]:
# Ridge regression with the penalty switched off (alpha=0.0), evaluated on the test split.
# NOTE(review): the scikit-learn Ridge documentation advises using LinearRegression
# rather than alpha=0 for numerical reasons; alpha is also hard-coded here — confirm
# it is meant to mirror the grid-search result.
ridge_best = Ridge(alpha=0.0).fit(X_train, y_train)

y_pred = ridge_best.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
(mse, rmse)

(106.71220615117292, 10.33016002543876)

In [115]:
# Preview 100 evenly spaced values from 0 to 0.01 (both endpoints included).
np.linspace(0, 0.01, 100)

array([0.        , 0.00010101, 0.00020202, 0.00030303, 0.00040404,
       0.00050505, 0.00060606, 0.00070707, 0.00080808, 0.00090909,
       0.0010101 , 0.00111111, 0.00121212, 0.00131313, 0.00141414,
       0.00151515, 0.00161616, 0.00171717, 0.00181818, 0.00191919,
       0.0020202 , 0.00212121, 0.00222222, 0.00232323, 0.00242424,
       0.00252525, 0.00262626, 0.00272727, 0.00282828, 0.00292929,
       0.0030303 , 0.00313131, 0.00323232, 0.00333333, 0.00343434,
       0.00353535, 0.00363636, 0.00373737, 0.00383838, 0.00393939,
       0.0040404 , 0.00414141, 0.00424242, 0.00434343, 0.00444444,
       0.00454545, 0.00464646, 0.00474747, 0.00484848, 0.00494949,
       0.00505051, 0.00515152, 0.00525253, 0.00535354, 0.00545455,
       0.00555556, 0.00565657, 0.00575758, 0.00585859, 0.0059596 ,
       0.00606061, 0.00616162, 0.00626263, 0.00636364, 0.00646465,
       0.00656566, 0.00666667, 0.00676768, 0.00686869, 0.0069697 ,
       0.00707071, 0.00717172, 0.00727273, 0.00737374, 0.00747

In [129]:
# Grid search over the Ridge penalty strength: 100 evenly spaced alpha values in
# [0, 0.01], compared by negative RMSE under 5-fold cross-validation.
# NOTE(review): the name `grid_dtree` suggests a decision tree but wraps a Ridge
# model; it is kept because later cells refer to it.
ridge_base = Ridge()
parameters = dict(alpha=np.linspace(0, 0.01, 100))
grid_dtree = GridSearchCV(
    estimator=ridge_base,
    param_grid=parameters,
    scoring="neg_root_mean_squared_error",
    cv=5,
    refit=True,
    return_train_score=True,
)

In [138]:
# Run the grid search on the training split only; the test split stays untouched.
grid_dtree.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=Ridge(),
             param_grid={'alpha': array([0.        , 0.00010101, 0.00020202, 0.00030303, 0.00040404,
       0.00050505, 0.00060606, 0.00070707, 0.00080808, 0.00090909,
       0.0010101 , 0.00111111, 0.00121212, 0.00131313, 0.00141414,
       0.00151515, 0.00161616, 0.00171717, 0.00181818, 0.00191919,
       0.0020202 , 0.00212121, 0.00222222, 0.00232323, 0.00242424,
       0.00252525, 0.00262626,...
       0.00707071, 0.00717172, 0.00727273, 0.00737374, 0.00747475,
       0.00757576, 0.00767677, 0.00777778, 0.00787879, 0.0079798 ,
       0.00808081, 0.00818182, 0.00828283, 0.00838384, 0.00848485,
       0.00858586, 0.00868687, 0.00878788, 0.00888889, 0.0089899 ,
       0.00909091, 0.00919192, 0.00929293, 0.00939394, 0.00949495,
       0.00959596, 0.00969697, 0.00979798, 0.00989899, 0.01      ])},
             return_train_score=True, scoring='neg_root_mean_squared_error')

In [139]:
# Parameter setting selected by the grid search.
grid_dtree.best_params_

{'alpha': 0.0}

In [140]:
# Cross-validated score of the selected setting; the scorer is
# "neg_root_mean_squared_error", so the value is the RMSE with its sign flipped.
grid_dtree.best_score_

-9.95109835552399

# cross_val_score
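The `scoring` parameter selects the metric that is computed on each fold. The available values are listed in the scikit-learn documentation:
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter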

In [64]:
# 5-fold cross-validated R^2 of a plain linear regression on the training split:
# show the per-fold scores, then their mean.
lr = LinearRegression()
scores = cross_val_score(estimator=lr, X=X_train, y=y_train, cv=5, scoring="r2")
print(scores)
print(scores.mean())

[0.98554349 0.98271562 0.97820541 0.98642628 0.98364344]
0.9833068484342375


In [73]:
# 5-fold cross-validated MSE. The scorer returns negated errors, so the sign is
# flipped before printing to report positive values.
lr = LinearRegression()
scores = cross_val_score(
    estimator=lr, X=X_train, y=y_train, cv=5, scoring="neg_mean_squared_error"
)
print(-scores)  # per-fold MSE as positive numbers
print(-scores.mean())  # mean MSE as a positive number

[ 97.90333916 101.73093382  86.33728373 102.00177897 107.81345764]
99.15735866590276


In [74]:
# 5-fold cross-validated RMSE. The scorer returns negated errors, so the sign is
# flipped before printing to report positive values.
lr = LinearRegression()
scores = cross_val_score(
    estimator=lr, X=X_train, y=y_train, cv=5, scoring="neg_root_mean_squared_error"
)
print(-scores)
print(-scores.mean())

[ 9.89461162 10.08617538  9.29178582 10.09959301 10.38332594]
9.951098355523987
