In [1]:
# ----------------------------------------------------------------------------------------------------
# 라이브러리 목록

# 기본 라이브러리 
import math
import numpy as np
import pandas as pd

# sklearn 라이브러리 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.neighbors import KNeighborsRegressor

# pandas 설정
pd.set_option('display.max_rows', None)  # 모든 행 출력
pd.set_option('display.max_columns', None)  # 모든 열 출력

# 개인 라이브러리  
from preparation_for_analysis.show_window import DataVisualizer
from preparation_for_analysis.encoding import DataFrameEncoder

# preparation_for_analysis 설정
visualizer = DataVisualizer(line="=", length=100, start="#")
# ----------------------------------------------------------------------------------------------------

In [2]:
# ----------------------------------------------------------------------------------------------------
# learning 불러오기 
learning = pd.read_csv(
    "C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\c_learning\\1_learning.csv",
    sep=",",
    header=0
)

visualizer.show_df_info(title="learning.info", df=learning)
# ----------------------------------------------------------------------------------------------------


# Title: learning.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6747 entries, 0 to 6746
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            6747 non-null   object 
 1   Model         6747 non-null   object 
 2   Manufacturer  6747 non-null   object 
 3   Model_year    6747 non-null   int64  
 4   Drivetrain    6747 non-null   object 
 5   Warranty      6747 non-null   int64  
 6   Accident      6747 non-null   object 
 7   Condition     6747 non-null   object 
 8   Battery       4311 non-null   float64
 9   Mileage       6747 non-null   int64  
 10  Price         6747 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 579.9+ KB







In [3]:
# ----------------------------------------------------------------------------------------------------
# validation 불러오기 
validation = pd.read_csv(
    "C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\d_validation\\1_validation.csv",
    sep=",",
    header=0
)

visualizer.show_df_info(title="learning.info", df=validation)
# ---------------------------------------------------------------------------------------------------


# Title: learning.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            750 non-null    object 
 1   Model         750 non-null    object 
 2   Manufacturer  750 non-null    object 
 3   Model_year    750 non-null    int64  
 4   Drivetrain    750 non-null    object 
 5   Warranty      750 non-null    int64  
 6   Accident      750 non-null    object 
 7   Condition     750 non-null    object 
 8   Battery       475 non-null    float64
 9   Mileage       750 non-null    int64  
 10  Price         750 non-null    float64
dtypes: float64(2), int64(3), object(6)
memory usage: 64.6+ KB







In [4]:
# ----------------------------------------------------------------------------------------------------
# new_learning 생성 
new_learning = learning.copy()
# ----------------------------------------------------------------------------------------------------

In [5]:
# ----------------------------------------------------------------------------------------------------
# "Model_year"컬럼 값 범주형으로 수정 
new_learning["Model_year"] = new_learning["Model_year"].apply(lambda x: "y"+str(x))
# ----------------------------------------------------------------------------------------------------

In [6]:
# ----------------------------------------------------------------------------------------------------
# "Condition"컬럼 수정 
def condition_modification(x):
    if 0 <= x < 10000:
        return "Brand New"
    elif 10000 <= x < 50000:
        return "Nearly New"
    elif 50000 <= x < 200000:
        return "Pre-Owned"
    else:
        return "Unknown"  # 0 또는 범위를 벗어난 값 처리
    
new_learning["Condition"] = new_learning["Mileage"].apply(lambda x: condition_modification(x))
# ----------------------------------------------------------------------------------------------------

In [7]:
# ----------------------------------------------------------------------------------------------------
# "Missing"컬럼 생성 
def missing_detection(x):
    if pd.isnull(x):
        return "missing"
    else:
        return "no_missing"
    
new_learning["Missing"] = new_learning["Battery"].apply(lambda x: missing_detection(x))
# ----------------------------------------------------------------------------------------------------

In [8]:
# ----------------------------------------------------------------------------------------------------
# battery_reference 생성

# battery_df 생성 
battery_df = new_learning.loc[
    pd.notnull(new_learning["Battery"]), 
    ["Model","Model_year","Warranty","Condition","Battery"]
].value_counts().reset_index().copy()
battery_df.drop(columns=["count"], inplace=True)
battery_df["Missing"] = "no_missing"
battery_df.sort_values(by=["Model","Model_year","Warranty","Condition"],
                       ascending=[True,True,True,False],
                       ignore_index=True,
                       inplace=True)

# battery_missing_df 생성 
battery_missing_df = new_learning.loc[
    pd.isnull(new_learning["Battery"]), 
    ["Model","Model_year","Warranty","Condition"]
].value_counts().reset_index().copy()
battery_missing_df.drop(columns=["count"], inplace=True)
battery_missing_df["Battery"] = np.nan
battery_missing_df["Missing"] = "missing"
battery_missing_df.sort_values(by=["Model","Model_year","Warranty","Condition"],
                               ascending=[True,True,True,False],
                               ignore_index=True,
                               inplace=True)

# battery_reference 생성 
battery_reference = pd.concat([battery_df, battery_missing_df], axis=0, ignore_index=True)
battery_reference.sort_values(by=["Model","Model_year","Warranty","Condition","Battery"],
                              ascending=[True,True,True,False,True],
                              ignore_index=True,
                              na_position="first",
                              inplace=True)


battery_reference["Battery"] = battery_reference["Battery"].fillna(method="bfill")

# battery_reference 정보 확인 
visualizer.show_df_info(title="battery_reference.info", df=battery_reference)
# ----------------------------------------------------------------------------------------------------


# Title: battery_reference.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 334 entries, 0 to 333
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Model       334 non-null    object 
 1   Model_year  334 non-null    object 
 2   Warranty    334 non-null    int64  
 3   Condition   334 non-null    object 
 4   Battery     334 non-null    float64
 5   Missing     334 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 15.8+ KB







In [9]:
# ----------------------------------------------------------------------------------------------------
# Battery 컬럼 수정 
# new_learning과 battery_reference 조인 

new_learning = pd.merge(
    new_learning, 
    battery_reference.loc[battery_reference["Missing"]=="missing",:], 
    on=["Model","Model_year","Warranty","Condition","Missing"], 
    how="left") 

new_learning["Battery_x"].fillna(new_learning["Battery_y"], inplace=True)
new_learning.drop(columns=["Battery_y"], inplace=True)
new_learning.rename(columns={"Battery_x": "Battery"}, inplace=True)
# ----------------------------------------------------------------------------------------------------

In [10]:
# ----------------------------------------------------------------------------------------------------
# "Warranty"컬럼 값 범주형으로 수정 
new_learning["Warranty"] = new_learning["Warranty"].apply(lambda x: "w"+str(x))
# ----------------------------------------------------------------------------------------------------

In [11]:
# ----------------------------------------------------------------------------------------------------
# 컬럼 순서 재정렬 
new_learning = new_learning[["Id","Model","Manufacturer","Model_year","Drivetrain",
                             "Warranty","Accident","Condition","Battery","Mileage",
                             "Price"]]

# new_learning 정보 확인
visualizer.show_df_info(title="new_learning.info", df=new_learning)
# ----------------------------------------------------------------------------------------------------


# Title: new_learning.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6747 entries, 0 to 6746
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            6747 non-null   object 
 1   Model         6747 non-null   object 
 2   Manufacturer  6747 non-null   object 
 3   Model_year    6747 non-null   object 
 4   Drivetrain    6747 non-null   object 
 5   Warranty      6747 non-null   object 
 6   Accident      6747 non-null   object 
 7   Condition     6747 non-null   object 
 8   Battery       6747 non-null   float64
 9   Mileage       6747 non-null   int64  
 10  Price         6747 non-null   float64
dtypes: float64(2), int64(1), object(8)
memory usage: 579.9+ KB







In [12]:
# ----------------------------------------------------------------------------------------------------
# new_validation 생성 
new_validation = validation.copy()
# ----------------------------------------------------------------------------------------------------

In [13]:
# ----------------------------------------------------------------------------------------------------
# "Model_year"컬럼 값 범주형으로 수정 
new_validation["Model_year"] = new_validation["Model_year"].apply(lambda x: "y"+str(x))
# ----------------------------------------------------------------------------------------------------

In [14]:
# ----------------------------------------------------------------------------------------------------
# "Condition"컬럼 수정 
new_validation["Condition"] = new_validation["Mileage"].apply(lambda x: condition_modification(x))
# ----------------------------------------------------------------------------------------------------

In [15]:
# ----------------------------------------------------------------------------------------------------
# "Missing"컬럼 생성 
new_validation["Missing"] = new_validation["Battery"].apply(lambda x: missing_detection(x))
# ----------------------------------------------------------------------------------------------------

In [16]:
# ----------------------------------------------------------------------------------------------------
# Battery 컬럼 수정 
# new_validation과 battery_reference 조인 

new_validation = pd.merge(
    new_validation, 
    battery_reference.loc[battery_reference["Missing"]=="missing",:], 
    on=["Model","Model_year","Warranty","Condition","Missing"], 
    how="left") 

new_validation["Battery_x"].fillna(new_validation["Battery_y"], inplace=True)
new_validation.drop(columns=["Battery_y"], inplace=True)
new_validation.rename(columns={"Battery_x": "Battery"}, inplace=True)
# ----------------------------------------------------------------------------------------------------

In [17]:
# ----------------------------------------------------------------------------------------------------
# "Warranty"컬럼 값 범주형으로 수정 
new_validation["Warranty"] = new_validation["Warranty"].apply(lambda x: "w"+str(x))
# ----------------------------------------------------------------------------------------------------

In [18]:
# ----------------------------------------------------------------------------------------------------
# 컬럼 순서 재정렬 
new_validation = new_validation[["Id","Model","Manufacturer","Model_year","Drivetrain",
                                 "Warranty","Accident","Condition","Battery","Mileage", 
                                 "Price"]]

new_validation["Battery"] = new_validation["Battery"].fillna(46.15)

# new_valdiation 정보 확인
visualizer.show_df_info(title="new_validation.info", df=new_validation)
# ----------------------------------------------------------------------------------------------------


# Title: new_validation.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Id            750 non-null    object 
 1   Model         750 non-null    object 
 2   Manufacturer  750 non-null    object 
 3   Model_year    750 non-null    object 
 4   Drivetrain    750 non-null    object 
 5   Warranty      750 non-null    object 
 6   Accident      750 non-null    object 
 7   Condition     750 non-null    object 
 8   Battery       750 non-null    float64
 9   Mileage       750 non-null    int64  
 10  Price         750 non-null    float64
dtypes: float64(2), int64(1), object(8)
memory usage: 64.6+ KB







In [19]:
# ----------------------------------------------------------------------------------------------------
# ("Price"컬럼 log(y+1)변환)
# new_learning_encoding 구성하기

# onehot 인코더 호출
new_learning_encoder = DataFrameEncoder(df=new_learning,
                                        columns=["Model","Manufacturer","Model_year","Drivetrain",
                                                 "Warranty","Accident","Condition"],
                                        ascending_order=[True,True,True,True,
                                                         True,True,True],
                                        sort_by_number=[False,False,True,False,
                                                        True,False,False])

# onehot 인코딩 변환 / learning_encoding 생성 
new_learning_encoding = new_learning_encoder.fit_transform(encoding_type="onehot")

# MinMax Normalization 변수 
battery_max = new_learning["Battery"].max()
battery_min = new_learning["Battery"].min()
mileage_max = new_learning["Mileage"].max()
mileage_min = new_learning["Mileage"].min()

# MinMax Normalization 
new_learning_encoding["Battery"] = new_learning["Battery"].apply(lambda x : (x-battery_min)/(battery_max-battery_min))
new_learning_encoding["Mileage"] = new_learning["Mileage"].apply(lambda x : (x-mileage_min)/(mileage_max-mileage_min))

# log(y+1) 변환 
new_learning_encoding["Price"] = new_learning["Price"].apply(lambda x : np.log1p(x))

# learning_encoding "float32"로 전환  
new_learning_encoding = new_learning_encoding.astype("float32")

# learning_encoding 정보 확인
visualizer.show_df_info(title="new_learning_encoding.info",df=new_learning_encoding)
# ----------------------------------------------------------------------------------------------------


# Title: new_learning_encoding.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6747 entries, 0 to 6746
Data columns (total 53 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   EV6         6747 non-null   float32
 1   ID4         6747 non-null   float32
 2   ION5        6747 non-null   float32
 3   ION6        6747 non-null   float32
 4   IONIQ       6747 non-null   float32
 5   KNE         6747 non-null   float32
 6   M3          6747 non-null   float32
 7   MS          6747 non-null   float32
 8   MX          6747 non-null   float32
 9   MY          6747 non-null   float32
 10  Niro        6747 non-null   float32
 11  Q4eT        6747 non-null   float32
 12  RSeTGT      6747 non-null   float32
 13  Soul        6747 non-null   float32
 14  Tay         6747 non-null   float32
 15  TayCT       6747 non-null   float32
 16  TayGTS      6747 non-null   float32
 17  eT          6747 non-null   float32
 18  i3 

In [20]:
# ----------------------------------------------------------------------------------------------------
# ("Price"컬럼 log(y+1)변환)
# new_validation_encoding 구성하기

# onehot 인코더 호출
new_validation_encoder = DataFrameEncoder(df=new_validation,
                                          columns=["Model","Manufacturer","Model_year","Drivetrain",
                                                   "Warranty","Accident","Condition"],
                                          ascending_order=[True,True,True,True,
                                                           True,True,True],
                                          sort_by_number=[False,False,True,False,
                                                          True,False,False])

# onehot 인코딩 변환 / validation_encoding 생성 
new_validation_encoding = new_validation_encoder.fit_transform(encoding_type="onehot")

# MinMax Normalization 변수 
battery_max = new_learning["Battery"].max()
battery_min = new_learning["Battery"].min()
mileage_max = new_learning["Mileage"].max()
mileage_min = new_learning["Mileage"].min()

# MinMax Normalization 
new_validation_encoding["Battery"] = new_validation["Battery"].apply(lambda x : (x-battery_min)/(battery_max-battery_min))
new_validation_encoding["Mileage"] = new_validation["Mileage"].apply(lambda x : (x-mileage_min)/(mileage_max-mileage_min))

# log(y+1) 변환 
new_validation_encoding["Price"] = new_validation["Price"].apply(lambda x : np.log1p(x))

# validation_encoding "float32"로 전환  
new_validation_encoding = new_validation_encoding.astype("float32")

# new_validation_encoding 정보 확인
visualizer.show_df_info(title="new_validation_encoding.info",df=new_validation_encoding)
# ----------------------------------------------------------------------------------------------------


# Title: new_validation_encoding.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 53 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   EV6         750 non-null    float32
 1   ID4         750 non-null    float32
 2   ION5        750 non-null    float32
 3   ION6        750 non-null    float32
 4   IONIQ       750 non-null    float32
 5   KNE         750 non-null    float32
 6   M3          750 non-null    float32
 7   MS          750 non-null    float32
 8   MX          750 non-null    float32
 9   MY          750 non-null    float32
 10  Niro        750 non-null    float32
 11  Q4eT        750 non-null    float32
 12  RSeTGT      750 non-null    float32
 13  Soul        750 non-null    float32
 14  Tay         750 non-null    float32
 15  TayCT       750 non-null    float32
 16  TayGTS      750 non-null    float32
 17  eT          750 non-null    float32
 18  i3 

In [21]:
# ----------------------------------------------------------------------------------------------------
# 훈련 데이터 준비 
x_train = new_learning_encoding.iloc[:, :-1].values  # numpy array 변환
y_train = new_learning_encoding.iloc[:, -1].values
x_test = new_validation_encoding.iloc[:, :-1].values
y_test = new_validation_encoding.iloc[:, -1].values

# KFold 설정 (5-fold)
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

# 하이퍼파라미터 탐색 공간 정의
param_grid = {
    'n_neighbors': [1, 2, 3, 4, 5],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# KNN 모델 생성
knn = KNeighborsRegressor()

# GridSearchCV 실행
grid_search = GridSearchCV(knn, param_grid, cv=kf, scoring='neg_root_mean_squared_error', return_train_score=True)
grid_search.fit(x_train, y_train)

# 최적의 하이퍼파라미터 찾기
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# 각 fold의 train/validation 인덱스 저장
fold_data = list(kf.split(x_train, y_train))

# 각 Fold에 대해 훈련 후 훈련/검증 데이터 RMSE 계산
fold_rmse_train = []  # 각 fold의 훈련 RMSE 값을 저장할 리스트
fold_rmse_val = []  # 각 fold의 검증 RMSE 값을 저장할 리스트

for fold_idx in range(kf.get_n_splits()):
    # Fold의 훈련 및 검증 데이터 가져오기
    train_idx, valid_idx = fold_data[fold_idx]
    X_train_fold, y_train_fold = x_train[train_idx], y_train[train_idx]
    X_valid_fold, y_valid_fold = x_train[valid_idx], y_train[valid_idx]  # 올바르게 수정됨
    
    # 최적 하이퍼파라미터로 모델 훈련
    fold_knn = KNeighborsRegressor(**best_params)
    fold_knn.fit(X_train_fold, y_train_fold)
    
    # 훈련 데이터 예측
    train_preds = fold_knn.predict(X_train_fold)
    train_rmse = np.sqrt(((y_train_fold - train_preds)**2).mean())
    
    # 검증 데이터 예측
    valid_preds = fold_knn.predict(X_valid_fold)
    valid_rmse = np.sqrt(((y_valid_fold - valid_preds)**2).mean())
    
    # 결과 출력
    print(f"Fold {fold_idx + 1}:")
    print(f"  Training RMSE: {train_rmse:.4f}")
    print(f"  Validation RMSE: {valid_rmse:.4f}")
    print("---------------------------------------------------")
    
    # 각 Fold의 훈련 및 검증 데이터 RMSE 값을 리스트에 저장
    fold_rmse_train.append(train_rmse)
    fold_rmse_val.append(valid_rmse)

# 선택 기준을 설정하는 부분 (훈련 RMSE 또는 검증 RMSE)
print("Select which RMSE to use for the best fold:")
print("1: Training RMSE")
print("2: Validation RMSE")
selected_criterion = int(input("Enter 1 or 2: "))

# 최적 fold의 인덱스 찾기 (훈련 RMSE 또는 검증 RMSE가 가장 낮은 Fold 선택)
if selected_criterion == 1:
    best_fold_idx = np.argmin(fold_rmse_train)
    print(f"Best fold based on Training RMSE: Fold {best_fold_idx + 1}")
elif selected_criterion == 2:
    best_fold_idx = np.argmin(fold_rmse_val)
    print(f"Best fold based on Validation RMSE: Fold {best_fold_idx + 1}")
else:
    raise ValueError("Invalid input! Please select 1 or 2.")

# 최적 fold의 학습 데이터셋 가져오기
best_train_idx, _ = fold_data[best_fold_idx]
X_best_train, y_best_train = x_train[best_train_idx], y_train[best_train_idx]

# 최적 fold 데이터로 모델 다시 훈련
best_knn = KNeighborsRegressor(**best_params)
best_knn.fit(X_best_train, y_best_train)

# 테스트 데이터로 최적 모델 평가
final_test_preds = best_knn.predict(x_test)
final_test_rmse = np.sqrt(((y_test - final_test_preds)**2).mean())

print(f"Test RMSE with best model: {final_test_rmse:.4f}")
# ----------------------------------------------------------------------------------------------------

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
Fold 1:
  Training RMSE: 0.0001
  Validation RMSE: 0.0500
---------------------------------------------------
Fold 2:
  Training RMSE: 0.0008
  Validation RMSE: 0.0477
---------------------------------------------------
Fold 3:
  Training RMSE: 0.0008
  Validation RMSE: 0.0517
---------------------------------------------------
Fold 4:
  Training RMSE: 0.0001
  Validation RMSE: 0.0447
---------------------------------------------------
Fold 5:
  Training RMSE: 0.0008
  Validation RMSE: 0.0481
---------------------------------------------------
Select which RMSE to use for the best fold:
1: Training RMSE
2: Validation RMSE
Best fold based on Validation RMSE: Fold 4
Test RMSE with best model: 0.0359


In [86]:
# ----------------------------------------------------------------------------------------------------
# new_learning_encoding 예측 rmse 값 
pred_knn = best_knn.predict(x_train)
rmse = np.sqrt(((y_train - pred_knn)**2).mean())
print(f"RMSE: {rmse:.4f}")
# ----------------------------------------------------------------------------------------------------

RMSE: 0.0200


In [87]:
# ----------------------------------------------------------------------------------------------------
# new_validation_encoding 예측 rmse 값 
pred_knn = best_knn.predict(x_test)
rmse = np.sqrt(((y_test - pred_knn)**2).mean())
print(f"RMSE: {rmse:.4f}")
# ----------------------------------------------------------------------------------------------------

RMSE: 0.0359


In [243]:
# ----------------------------------------------------------------------------------------------------
# new_learning 재정의 
new_learning["knn"] = best_knn.predict(x_train)
new_learning["knn"] = new_learning["knn"].apply(lambda x: np.expm1(x))
new_learning["knn"] = new_learning["knn"].apply(lambda x: math.floor(x * 100) / 100)
new_learning = new_learning[
    ["Model","Manufacturer","Model_year","Drivetrain","Warranty",
     "Accident","Condition","Battery","Mileage","knn","Price"]
]
visualizer.show_df_info(title="new_learning.info", df=new_learning)

# new_validation 재정의 
new_validation["knn"] = best_knn.predict(x_test)
new_validation["knn"] = new_validation["knn"].apply(lambda x: np.expm1(x))
new_validation["knn"] = new_validation["knn"].apply(lambda x: math.floor(x * 100) / 100)
new_validation = new_validation[
    ["Model","Manufacturer","Model_year","Drivetrain","Warranty",
     "Accident","Condition","Battery","Mileage","knn","Price"]
]
visualizer.show_df_info(title="new_validation.info", df=new_validation)
# ----------------------------------------------------------------------------------------------------


# Title: new_learning.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6747 entries, 0 to 6746
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Model         6747 non-null   object 
 1   Manufacturer  6747 non-null   object 
 2   Model_year    6747 non-null   object 
 3   Drivetrain    6747 non-null   object 
 4   Warranty      6747 non-null   object 
 5   Accident      6747 non-null   object 
 6   Condition     6747 non-null   object 
 7   Battery       6747 non-null   float64
 8   Mileage       6747 non-null   int64  
 9   knn           6747 non-null   float64
 10  Price         6747 non-null   float64
dtypes: float64(3), int64(1), object(7)
memory usage: 579.9+ KB






# Title: new_validation.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---

In [244]:
# ----------------------------------------------------------------------------------------------------
# new_validation 예측 rmse 값 
rmse = np.sqrt(((new_validation["Price"] - new_validation["knn"])**2).mean())
print(f"RMSE: {rmse:.4f}")
# ----------------------------------------------------------------------------------------------------

RMSE: 1.2695


In [245]:
# ----------------------------------------------------------------------------------------------------
# knn 예측값 사후 보정

# 학습 데이터 사후 보정 
new_learning["knn_2"] = new_learning["knn"]

new_learning.loc[
    (new_learning["Model"] == "Tay") &
    (new_learning["Model_year"] == "y0") &
    (new_learning["Warranty"] == "w2") &
    (new_learning["Condition"] == "Brand New") &
    (new_learning["knn_2"] > 102) & 
    (new_learning["knn_2"] < 105), "knn_2"] = 95

new_learning.loc[
    (new_learning["Model"] == "Tay") &
    (new_learning["Model_year"] == "y0") &
    (new_learning["Warranty"] == "w2") &
    (new_learning["Condition"] == "Brand New") &
    (new_learning["knn_2"] > 95) & 
    (new_learning["knn_2"] < 100), "knn_2"] = 103

new_learning.loc[
    (new_learning["Model"] == "TayCT") &
    (new_learning["Model_year"] == "y0") &
    (new_learning["Warranty"] == "w2") &
    (new_learning["Condition"] == "Pre-Owned") &
    (new_learning["knn_2"] > 120) & 
    (new_learning["knn_2"] < 126), "knn_2"] = 128

new_learning.loc[
    (new_learning["Model"] == "TayCT") &
    (new_learning["Model_year"] == "y0") &
    (new_learning["Warranty"] == "w2") &
    (new_learning["Condition"] == "Pre-Owned") &
    (new_learning["knn_2"] > 127) & 
    (new_learning["knn_2"] < 130), "knn_2"] = 125

new_learning.loc[
    (new_learning["Model"] == "TayCT") &
    (new_learning["Model_year"] == "y0") &
    (new_learning["Warranty"] == "w2") &
    (new_learning["Condition"] == "Brand New") &
    (new_learning["knn_2"] > 119) & 
    (new_learning["knn_2"] < 125), "knn_2"] = 130

new_learning.loc[
    (new_learning["Model"] == "TayCT") &
    (new_learning["Model_year"] == "y0") &
    (new_learning["Warranty"] == "w2") &
    (new_learning["Condition"] == "Brand New") &
    (new_learning["knn_2"] > 126) & 
    (new_learning["knn_2"] < 131), "knn_2"] = 121
# ----------------------------------------------------------------------------------------------------

In [246]:
# ----------------------------------------------------------------------------------------------------
# 학습 데이터 rmse 값 
rmse = np.sqrt(((new_learning["Price"] - new_learning["knn_2"])**2).mean())
print(f"RMSE: {rmse:.3f}")
# ----------------------------------------------------------------------------------------------------

RMSE: 1.553


In [247]:
# ----------------------------------------------------------------------------------------------------
# knn 예측값 사후 보정

# 검증 데이터 사후 보정 
new_validation["knn_2"] = new_validation["knn"]

new_validation.loc[
    (new_validation["Model"] == "Tay") &
    (new_validation["Model_year"] == "y0") &
    (new_validation["Warranty"] == "w2") &
    (new_validation["Condition"] == "Brand New") &
    (new_validation["knn_2"] > 102) & 
    (new_validation["knn_2"] < 105), "knn_2"] = 95

new_validation.loc[
    (new_validation["Model"] == "Tay") &
    (new_validation["Model_year"] == "y0") &
    (new_validation["Warranty"] == "w2") &
    (new_validation["Condition"] == "Brand New") &
    (new_validation["knn_2"] > 95) & 
    (new_validation["knn_2"] < 100), "knn_2"] = 103

new_validation.loc[
    (new_validation["Model"] == "TayCT") &
    (new_validation["Model_year"] == "y0") &
    (new_validation["Warranty"] == "w2") &
    (new_validation["Condition"] == "Pre-Owned") &
    (new_validation["knn_2"] > 120) & 
    (new_validation["knn_2"] < 126), "knn_2"] = 128

new_validation.loc[
    (new_validation["Model"] == "TayCT") &
    (new_validation["Model_year"] == "y0") &
    (new_validation["Warranty"] == "w2") &
    (new_validation["Condition"] == "Pre-Owned") &
    (new_validation["knn_2"] > 127) & 
    (new_validation["knn_2"] < 130), "knn_2"] = 125

new_validation.loc[
    (new_validation["Model"] == "TayCT") &
    (new_validation["Model_year"] == "y0") &
    (new_validation["Warranty"] == "w2") &
    (new_validation["Condition"] == "Brand New") &
    (new_validation["knn_2"] > 119) & 
    (new_validation["knn_2"] < 125), "knn_2"] = 130

new_validation.loc[
    (new_validation["Model"] == "TayCT") &
    (new_validation["Model_year"] == "y0") &
    (new_validation["Warranty"] == "w2") &
    (new_validation["Condition"] == "Brand New") &
    (new_validation["knn_2"] > 126) & 
    (new_validation["knn_2"] < 131), "knn_2"] = 121
# ----------------------------------------------------------------------------------------------------

In [248]:
# ----------------------------------------------------------------------------------------------------
# 검증 데이터 rmse 값 
rmse = np.sqrt(((new_validation["Price"] - new_validation["knn_2"])**2).mean())
print(f"RMSE: {rmse:.3f}")
# ----------------------------------------------------------------------------------------------------

RMSE: 1.058


In [249]:
# ----------------------------------------------------------------------------------------------------
# new_learning 재정의 
new_learning = new_learning[
    ["Model","Manufacturer","Model_year","Drivetrain","Warranty",
     "Accident","Condition","Battery","Mileage","knn","knn_2","Price"]
]
visualizer.show_df_info(title="new_learning.info", df=new_learning)

# new_validation 재정의 
new_validation = new_validation[
    ["Model","Manufacturer","Model_year","Drivetrain","Warranty",
     "Accident","Condition","Battery","Mileage","knn","knn_2","Price"]
]
visualizer.show_df_info(title="new_validation.info", df=new_validation)
# ----------------------------------------------------------------------------------------------------


# Title: new_learning.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6747 entries, 0 to 6746
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Model         6747 non-null   object 
 1   Manufacturer  6747 non-null   object 
 2   Model_year    6747 non-null   object 
 3   Drivetrain    6747 non-null   object 
 4   Warranty      6747 non-null   object 
 5   Accident      6747 non-null   object 
 6   Condition     6747 non-null   object 
 7   Battery       6747 non-null   float64
 8   Mileage       6747 non-null   int64  
 9   knn           6747 non-null   float64
 10  knn_2         6747 non-null   float64
 11  Price         6747 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 632.7+ KB






# Title: new_validation.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 12 columns):
 # 

In [250]:
# ----------------------------------------------------------------------------------------------------
# new_learning으로 저장
new_learning.to_csv(path_or_buf="C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\c_learning\\2_new_learning.csv",
                    index=False)
# ----------------------------------------------------------------------------------------------------

In [251]:
# ----------------------------------------------------------------------------------------------------
# best_learning 생성 
best_learning = new_learning.iloc[best_train_idx,:]
# ----------------------------------------------------------------------------------------------------

In [252]:
# ----------------------------------------------------------------------------------------------------
# best_learning으로 저장
best_learning.to_csv(path_or_buf="C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\c_learning\\4_best_learning.csv",
                     index=False)
# ----------------------------------------------------------------------------------------------------

In [253]:
# ----------------------------------------------------------------------------------------------------
# new_validation으로 저장
new_validation.to_csv(path_or_buf="C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\d_validation\\2_new_validation.csv",
                    index=False)
# ----------------------------------------------------------------------------------------------------