### Regularized Linear Regression Task

##### 다이아몬드 가격 예측

- price: 미국 달러로 표시된 가격 (＄326 ~ ＄18,823)
- carat: 다이아몬드의 무게(0.2 ~ 5.01)
- cut: 품질(공정, 좋음, 매우 좋음, 프리미엄, 이상적)
- color: 다이아몬드 색상, J(최악)부터 D(최우수)까지
- clarity: 다이아몬드가 얼마나 선명한지에 대한 측정값 (I1(최악), SI2, SI1, VS2, VS1, VVS2, VVS1, IF(최우수))
- x: 길이(mm) (0 ~ 10.74)
- y: 너비(mm)(0 ~ 58.9)
- z: 깊이(mm)(0 ~ 31.8)
- depth: 총 깊이 백분율 = z / 평균(x, y) = 2 * z / (x + y) (43–79)
- table: 가장 넓은 점에 대한 다이아몬드 상단 폭(43 ~ 95)

In [1]:
import pandas as pd
diamond_df = pd.read_csv('./datasets/diamond.csv')
diamond_df = diamond_df.drop(columns=diamond_df.columns[0], axis=1)
diamond_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74
53939,0.75,Ideal,D,SI2,62.2,55.0,2757,5.83,5.87,3.64
53940,0.71,Premium,E,SI1,60.5,55.0,2756,5.79,5.74,3.49
53941,0.71,Premium,F,SI1,59.8,62.0,2756,5.74,5.73,3.43


In [2]:
diamond_df = diamond_df.drop_duplicates()
diamond_df = diamond_df.reset_index(drop=True)
diamond_df.duplicated().sum()

0

In [3]:
from sklearn.preprocessing import LabelEncoder

encoders = []
columns = ['cut', 'color', 'clarity']

for column in columns:
    encoder = LabelEncoder()
    encoded_feature = encoder.fit_transform(diamond_df[column])
    diamond_df[column] = encoded_feature
    encoders.append(encoder)
    print(encoder.classes_)

['Fair' 'Good' 'Ideal' 'Premium' 'Very Good']
['D' 'E' 'F' 'G' 'H' 'I' 'J']
['I1' 'IF' 'SI1' 'SI2' 'VS1' 'VS2' 'VVS1' 'VVS2']


In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
diamond_df['scaled_carat'] = scaler.fit_transform(diamond_df[['carat']])
diamond_df[~pd.Series(diamond_df.scaled_carat).between(-1.96, 1.96)]

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,scaled_carat
9829,1.74,4,4,0,63.2,55.0,4677,7.62,7.59,4.80,1.990385
11576,1.95,3,4,0,60.3,59.0,5045,8.10,8.05,4.87,2.433997
11606,2.00,3,6,0,61.5,59.0,5051,8.11,8.06,4.97,2.539620
11750,1.83,0,6,0,70.0,58.0,5083,7.34,7.28,5.12,2.180504
12218,2.06,3,6,0,61.2,58.0,5203,8.10,8.07,4.95,2.666366
...,...,...,...,...,...,...,...,...,...,...,...
27677,2.29,3,5,2,61.8,59.0,18797,8.52,8.45,5.24,3.152228
27678,2.00,4,4,2,62.8,57.0,18803,7.95,8.00,5.01,2.539620
27679,2.07,2,3,3,62.5,55.0,18804,8.20,8.13,5.11,2.687491
27681,2.00,4,3,2,63.5,56.0,18818,7.90,7.97,5.04,2.539620


In [5]:
diamond_df = diamond_df[pd.Series(diamond_df.scaled_carat).between(-1.96, 1.96)]
diamond_df = diamond_df.drop(columns='scaled_carat')
diamond_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.20,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53789,0.72,2,0,2,60.8,57.0,2757,5.75,5.76,3.50
53790,0.72,1,0,2,63.1,55.0,2757,5.69,5.75,3.61
53791,0.70,4,0,2,62.8,60.0,2757,5.66,5.68,3.56
53792,0.86,3,4,3,61.0,58.0,2757,6.15,6.12,3.74


In [6]:
diamond_df.reset_index(drop=True, inplace=True)
diamond_df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.20,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
51299,0.72,2,0,2,60.8,57.0,2757,5.75,5.76,3.50
51300,0.72,1,0,2,63.1,55.0,2757,5.69,5.75,3.61
51301,0.70,4,0,2,62.8,60.0,2757,5.66,5.68,3.56
51302,0.86,3,4,3,61.0,58.0,2757,6.15,6.12,3.74


In [7]:
diamond_df['target'] = diamond_df.price
diamond_df = diamond_df.drop(columns='price')

In [8]:
diamond_df

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,target
0,0.23,2,1,3,61.5,55.0,3.95,3.98,2.43,326
1,0.21,3,1,2,59.8,61.0,3.89,3.84,2.31,326
2,0.23,1,1,4,56.9,65.0,4.05,4.07,2.31,327
3,0.29,3,5,5,62.4,58.0,4.20,4.23,2.63,334
4,0.31,1,6,3,63.3,58.0,4.34,4.35,2.75,335
...,...,...,...,...,...,...,...,...,...,...
51299,0.72,2,0,2,60.8,57.0,5.75,5.76,3.50,2757
51300,0.72,1,0,2,63.1,55.0,5.69,5.75,3.61,2757
51301,0.70,4,0,2,62.8,60.0,5.66,5.68,3.56,2757
51302,0.86,3,4,3,61.0,58.0,6.15,6.12,3.74,2757


In [11]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error

def get_evaluation(y_test, prediction):
    MAE =  mean_absolute_error(y_test, prediction)
    MSE = mean_squared_error(y_test, prediction)
    RMSE = np.sqrt(MSE)
    MSLE = mean_squared_log_error(y_test, prediction)
    RMSLE = np.sqrt(mean_squared_log_error(y_test, prediction))
    R2 = r2_score(y_test, prediction)

    print('MAE: {:.4f}, MSE: {:.2f}, RMSE: {:.4f}, MSLE: {:.4f}, RMSLE: {:.4f}, R2: {:.4f}'.format(MAE, MSE, RMSE, MSLE, RMSLE, R2))

In [12]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

features, targets = diamond_df.iloc[:, :-1], diamond_df.target

X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.3, random_state=0)

# 로그 변환
y_train = np.log1p(y_train)

linear_regression = LinearRegression()

linear_regression.fit(X_train, y_train)

# 기울기(가중치)
print(linear_regression.coef_)
# 절편(상수)
print(linear_regression.intercept_)

# 지수를 취하여 원래 값으로 복구
prediction = np.expm1(linear_regression.predict(X_test))
get_evaluation(y_test, prediction)

[-0.17432914  0.00434467 -0.06740156  0.06227756  0.02367073 -0.00905064
  0.92481447  0.08076623  0.0399015 ]
1.0231862540386407
MAE: 575.7753, MSE: 1175948.06, RMSE: 1084.4114, MSLE: 0.0529, RMSLE: 0.2301, R2: 0.8900


In [16]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures

features, targets = diamond_df.iloc[:, :-1], diamond_df.target

poly_features = PolynomialFeatures(degree=3).fit_transform(features)

X_train, X_test, y_train, y_test = train_test_split(poly_features, targets, test_size=0.3, random_state=0)

X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

# 로그 변환
y_train = np.log1p(y_train)

lasso = Lasso(max_iter=10000)
params = {'alpha': [0.001, 0.01, 1, 10]}

# GridSearchCV에서 사용 가능한 점수 지표(scoring)들
# 'explained_variance', 'roc_auc', 'roc_auc_ovr', 'precision_weighted', 
# 'roc_auc_ovr_weighted', 'jaccard_samples', 'rand_score', 'neg_mean_gamma_deviance', 
# 'neg_log_loss', 'jaccard_weighted', 'adjusted_rand_score', 'adjusted_mutual_info_score', 
# 'roc_auc_ovo_weighted', 'positive_likelihood_ratio', 'accuracy', 'neg_median_absolute_error', 
# 'roc_auc_ovo', 'completeness_score', 'f1', 'f1_samples', 'normalized_mutual_info_score', 'r2', 
# 'recall_samples', 'matthews_corrcoef', 'precision_macro', 'v_measure_score', 'fowlkes_mallows_score', 
# 'neg_mean_absolute_error', 'recall_macro', 'precision_samples', 'average_precision', 'jaccard', 
# 'jaccard_micro', 'jaccard_macro', 'neg_root_mean_squared_error', 'f1_weighted', 
# 'homogeneity_score', 'recall', 'precision', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 
# ▶'neg_mean_squared_log_error'◀, 
# 'neg_negative_likelihood_ratio', 'precision_micro', 'recall_micro', 'recall_weighted', 'balanced_accuracy', 
# 'max_error', 'mutual_info_score', 'top_k_accuracy', 'f1_macro', 'neg_brier_score', 'f1_micro', 'neg_mean_absolute_percentage_error'
grid_lasso = GridSearchCV(lasso, param_grid=params, cv=5, refit=True, scoring="r2")
grid_lasso.fit(X_train, y_train)

prediction = grid_lasso.predict(X_test)
get_evaluation(np.log1p(y_test), prediction)

MAE: 0.1473, MSE: 0.04, RMSE: 0.1945, MSLE: 0.0005, RMSLE: 0.0228, R2: 0.9589


In [14]:
# DataFrame으로 변환
scores_df = pd.DataFrame(grid_lasso.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score', 
           'split0_test_score', 'split1_test_score', 'split2_test_score']].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,{'alpha': 0.001},0.959898,1,0.960976,0.953768,0.962214
1,{'alpha': 0.01},0.947754,2,0.949558,0.932571,0.951448
2,{'alpha': 1},-8.3e-05,3,-0.000138,-8.1e-05,-1.7e-05
3,{'alpha': 10},-8.3e-05,3,-0.000138,-8.1e-05,-1.7e-05


In [22]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures

features, targets = diamond_df.iloc[:, :-1], diamond_df.target

poly_features = PolynomialFeatures(degree=3).fit_transform(features)

X_train, X_test, y_train, y_test = train_test_split(poly_features, targets, test_size=0.3, random_state=0)

X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

# 로그 변환
y_train = np.log1p(y_train)

ridge = Ridge(max_iter=500)
params = {'alpha': [0.001, 0.01, 1, 10]}

# scoring을 neg_mean_squared_log_error로 설정했을 때 음수 연산 오류가 발생하면, r2로 변경해준다.
# grid_ridge = GridSearchCV(ridge, param_grid=params, cv=5, refit=True, scoring="neg_mean_squared_log_error")
grid_ridge = GridSearchCV(ridge, param_grid=params, cv=5, refit=True, scoring="r2")
grid_ridge.fit(X_train, y_train)

prediction = grid_ridge.predict(X_test)
get_evaluation(np.log1p(y_test), prediction)

MAE: 0.1495, MSE: 0.04, RMSE: 0.1969, MSLE: 0.0005, RMSLE: 0.0229, R2: 0.9579


In [23]:
# DataFrame으로 변환
scores_df = pd.DataFrame(grid_lasso.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score', 
           'split0_test_score', 'split1_test_score', 'split2_test_score']].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,{'alpha': 0.001},0.959898,1,0.960976,0.953768,0.962214
1,{'alpha': 0.01},0.947754,2,0.949558,0.932571,0.951448
2,{'alpha': 1},-8.3e-05,3,-0.000138,-8.1e-05,-1.7e-05
3,{'alpha': 10},-8.3e-05,3,-0.000138,-8.1e-05,-1.7e-05
