In [6]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Step 1: data exploration, visualization, and preprocessing
# Load the data. header=None makes pandas treat every line of the file as a
# data row, so the column names are assigned manually below.
# NOTE(review): if the CSV starts with a header line, that line is read as a
# data row here and is later removed by the numeric coercion + dropna step
# (the recorded output's index starting at 1 suggests this) — confirm.
file_path = 'machine.data.csv'
df = pd.read_csv(file_path, header=None)

# Assign column names
df.columns = ['VendorName', 'ModelName', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP']

# Drop the columns that are not needed
df = df.drop(['VendorName', 'ModelName'], axis=1)

# Remove rows that contain non-numeric values: anything that cannot be parsed
# as a number becomes NaN, and every row holding a NaN is then discarded.
df = df.apply(pd.to_numeric, errors='coerce')
df = df.dropna()

# Data summary
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
print(df.describe())

# Separate the feature columns from the target column
X = df.loc[:, ['MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX']]
y = df.loc[:, 'PRP']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Pipeline: standardize the features, then fit a linear regression
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LinearRegression()),
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on both splits
y_train_pred, y_test_pred = (
    pipeline.predict(features) for features in (X_train, X_test)
)

# Evaluate: MSE, MAE and R2 for each split
mse_train, mae_train, r2_train = (
    metric(y_train, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
mse_test, mae_test, r2_test = (
    metric(y_test, y_test_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Report the results, one metric per line
print(
    f'Train MSE: {mse_train}',
    f'Train MAE: {mae_train}',
    f'Train R2: {r2_train}',
    f'Test MSE: {mse_test}',
    f'Test MAE: {mae_test}',
    f'Test R2: {r2_test}',
    sep='\n',
)

# Cross-validation
# Run 10-fold cross-validation of the pipeline on the full data set and
# collect R2, MSE and MAE from the same fits, so that every column of the
# summary table below holds the metric it is labelled with.
cv_results = cross_validate(
    pipeline, X, y, cv=10,
    scoring={
        'r2': 'r2',
        'mse': 'neg_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
    },
)
cross_val_scores = cv_results['test_r2']
# scikit-learn reports error metrics negated (so that higher is better);
# flip the sign to get the usual positive MSE / MAE.
cv_mse_mean = -cv_results['test_mse'].mean()
cv_mae_mean = -cv_results['test_mae'].mean()
print(f'Cross-validation R2 scores: {cross_val_scores}')
print(f'Mean Cross-validation R2 score: {cross_val_scores.mean()}')

# Store the performance metrics.
# The cross-validation row previously repeated the mean R2 in the MSE and MAE
# columns; it now carries the cross-validated MSE and MAE.
results = pd.DataFrame({
    'Model': ['Linear Regression'] * 3,
    'Data': ['Train', 'Test', 'Cross Validation'],
    'MSE': [mse_train, mse_test, cv_mse_mean],
    'MAE': [mae_train, mae_test, cv_mae_mean],
    'R^2': [r2_train, r2_test, cross_val_scores.mean()]
})

# No figure is created in this cell, so this call has nothing to draw here.
plt.show()


<class 'pandas.core.frame.DataFrame'>
Index: 209 entries, 1 to 209
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   MYCT    209 non-null    float64
 1   MMIN    209 non-null    float64
 2   MMAX    209 non-null    float64
 3   CACH    209 non-null    float64
 4   CHMIN   209 non-null    float64
 5   CHMAX   209 non-null    float64
 6   PRP     209 non-null    float64
dtypes: float64(7)
memory usage: 13.1 KB
None
              MYCT          MMIN          MMAX        CACH       CHMIN  \
count   209.000000    209.000000    209.000000  209.000000  209.000000   
mean    203.822967   2867.980861  11796.153110   25.205742    4.698565   
std     260.262926   3878.742758  11726.564377   40.628722    6.816274   
min      17.000000     64.000000     64.000000    0.000000    0.000000   
25%      50.000000    768.000000   4000.000000    0.000000    1.000000   
50%     110.000000   2000.000000   8000.000000    8.000000    2.000000   
75% 

In [7]:
# Step 3: model improvement
# Hyperparameter grid: each dict swaps a different estimator into the
# pipeline's 'model' step and lists the settings to try for it.
param_grid = [
    {'model': [LinearRegression()]},
    {'model': [Ridge()], 'model__alpha': [0.1, 1, 10]},
    {'model': [Lasso()], 'model__alpha': [0.1, 1, 10]},
    {
        # Seed the forest so the grid search (and therefore the selected
        # model and its scores) is reproducible across kernel restarts;
        # 42 matches the seed used for train_test_split.
        'model': [RandomForestRegressor(random_state=42)],
        'model__n_estimators': [100, 200],
        'model__max_features': ['sqrt', 'log2', None],
    },
]

# Grid search: 5-fold cross-validation on the training split, ranked by R2
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2')

# Fit every candidate
grid_search.fit(X_train, y_train)

# Keep the best-scoring pipeline
best_model = grid_search.best_estimator_

# Predict with the selected model on both splits
y_train_pred_best, y_test_pred_best = (
    best_model.predict(features) for features in (X_train, X_test)
)

# Evaluate: MSE, MAE and R2 for each split
mse_train_best, mae_train_best, r2_train_best = (
    metric(y_train, y_train_pred_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
mse_test_best, mae_test_best, r2_test_best = (
    metric(y_test, y_test_pred_best)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# Cross-validation
# Run 10-fold cross-validation of the selected model on the full data set and
# collect R2, MSE and MAE from the same fits, so that every column of the
# summary table below holds the metric it is labelled with.
# NOTE(review): an integer cv uses unshuffled K-fold for regressors; if the
# rows are ordered (e.g. grouped by vendor) fold scores can vary widely —
# consider KFold(shuffle=True, random_state=...) and confirm whether scoring
# on the full X, y (which includes the held-out test rows) is intended.
cv_results_best = cross_validate(
    best_model, X, y, cv=10,
    scoring={
        'r2': 'r2',
        'mse': 'neg_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
    },
)
cross_val_scores_best = cv_results_best['test_r2']
cross_val_mean_best = cross_val_scores_best.mean()
# scikit-learn reports error metrics negated (so that higher is better);
# flip the sign to get the usual positive MSE / MAE.
cv_mse_mean_best = -cv_results_best['test_mse'].mean()
cv_mae_mean_best = -cv_results_best['test_mae'].mean()

# Print the performance figures
print(f'Best Model Train MSE: {mse_train_best}')
print(f'Best Model Train MAE: {mae_train_best}')
print(f'Best Model Train R2: {r2_train_best}')
print(f'Best Model Test MSE: {mse_test_best}')
print(f'Best Model Test MAE: {mae_test_best}')
print(f'Best Model Test R2: {r2_test_best}')
print(f'Best Model Cross-Validation R2: {cross_val_mean_best}')

# Store the performance metrics.
# The cross-validation row previously repeated the mean R2 in the MSE and MAE
# columns; it now carries the cross-validated MSE and MAE.
results_best = pd.DataFrame({
    'Model': ['Best_estimator'] * 3,
    'Data': ['Train', 'Test', 'Cross Validation'],
    'MSE': [mse_train_best, mse_test_best, cv_mse_mean_best],
    'MAE': [mae_train_best, mae_test_best, cv_mae_mean_best],
    'R^2': [r2_train_best, r2_test_best, cross_val_mean_best]
})

# No figure is created in this cell, so this call has nothing to draw here.
plt.show()

Best Model Train MSE: 629.3791201403061
Best Model Train MAE: 11.1217001235624
Best Model Train R2: 0.9672615295974538
Best Model Test MSE: 6654.03594989972
Best Model Test MAE: 33.18414323507181
Best Model Test R2: 0.8692899769884158
Best Model Cross-Validation R2: 0.56509896375104
