In [1]:
# ----------------------------------------------------------------------------------------------------
# 라이브러리 목록

# 기본 라이브러리 
import numpy as np
import pandas as pd

# 개인 라이브러리 
from preparation_for_analysis.show_window import DataVisualizer
from preparation_for_analysis.encoding import DataFrameEncoder

# pandas display options: remove the row and column limits so frames are shown in full
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Shared DataVisualizer instance; later cells call visualizer.show_df_info(...) on each frame
visualizer = DataVisualizer(
    line="=",
    length=100,
    start="#",
)
# ----------------------------------------------------------------------------------------------------

In [2]:
# ----------------------------------------------------------------------------------------------------
# Load best_learning from CSV (comma-separated, first row used as the header)
best_learning_csv_path = "C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\c_learning\\4_best_learning.csv"
best_learning = pd.read_csv(best_learning_csv_path, sep=",", header=0)

# Show the info summary of the loaded frame
visualizer.show_df_info(df=best_learning, title="best_learning.info")
# ----------------------------------------------------------------------------------------------------


# Title: best_learning.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5398 entries, 0 to 5397
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Model         5398 non-null   object 
 1   Manufacturer  5398 non-null   object 
 2   Model_year    5398 non-null   object 
 3   Drivetrain    5398 non-null   object 
 4   Warranty      5398 non-null   object 
 5   Accident      5398 non-null   object 
 6   Condition     5398 non-null   object 
 7   Battery       5398 non-null   float64
 8   Mileage       5398 non-null   int64  
 9   knn           5398 non-null   float64
 10  knn_2         5398 non-null   float64
 11  Price         5398 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 506.2+ KB







In [3]:
# ----------------------------------------------------------------------------------------------------
# Load new_validation from CSV (comma-separated, first row used as the header)
new_validation_csv_path = "C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\d_validation\\2_new_validation.csv"
new_validation = pd.read_csv(new_validation_csv_path, sep=",", header=0)

# Show the info summary of the loaded frame
visualizer.show_df_info(df=new_validation, title="new_validation.info")
# ---------------------------------------------------------------------------------------------------


# Title: new_validation.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Model         750 non-null    object 
 1   Manufacturer  750 non-null    object 
 2   Model_year    750 non-null    object 
 3   Drivetrain    750 non-null    object 
 4   Warranty      750 non-null    object 
 5   Accident      750 non-null    object 
 6   Condition     750 non-null    object 
 7   Battery       750 non-null    float64
 8   Mileage       750 non-null    int64  
 9   knn           750 non-null    float64
 10  knn_2         750 non-null    float64
 11  Price         750 non-null    float64
dtypes: float64(4), int64(1), object(7)
memory usage: 70.4+ KB







In [4]:
# ----------------------------------------------------------------------------------------------------
# Build new_validation_encoding:
#   1. one-hot encode the categorical columns of new_validation,
#   2. min-max scale the numeric features using bounds taken from best_learning,
#   3. apply log(y + 1) to the "Price" column,
#   4. cast the whole frame to float32.

# Configure the one-hot encoder for the seven categorical columns.
# Each flag list has one entry per item in `columns`; presumably they are matched
# by position -- confirm against DataFrameEncoder.
new_validation_encoder = DataFrameEncoder(
    df=new_validation,
    columns=[
        "Model", "Manufacturer", "Model_year", "Drivetrain",
        "Warranty", "Accident", "Condition"
    ],
    ascending_order=[True, True, True,
                     True, True, True, True],
    sort_by_number=[False, False, True,
                    False, True, False, False]
)

# NOTE(review): the encoder is built from new_validation only. If a category is
# present in best_learning but absent here (or vice versa), the one-hot columns may
# not line up with the learning-set encoding -- verify the column sets match.
new_validation_encoding = new_validation_encoder.fit_transform(encoding_type="onehot")

# Min-max bounds come from the learning set (best_learning), not from the
# validation set, so scaled validation values may fall outside [0, 1].
battery_max = best_learning["Battery"].max()
battery_min = best_learning["Battery"].min()
mileage_max = best_learning["Mileage"].max()
mileage_min = best_learning["Mileage"].min()
knn_max = best_learning["knn"].max()
knn_min = best_learning["knn"].min()
knn_2_max = best_learning["knn_2"].max()
knn_2_min = best_learning["knn_2"].min()

# Min-max normalization, written as vectorized Series arithmetic instead of a
# per-row Python lambda; the computed values are the same.
new_validation_encoding["Battery"] = (new_validation["Battery"] - battery_min) / (battery_max - battery_min)
new_validation_encoding["Mileage"] = (new_validation["Mileage"] - mileage_min) / (mileage_max - mileage_min)
new_validation_encoding["knn"] = (new_validation["knn"] - knn_min) / (knn_max - knn_min)
new_validation_encoding["knn_2"] = (new_validation["knn_2"] - knn_2_min) / (knn_2_max - knn_2_min)

# log(y + 1) transform of the target column; np.log1p applied to the Series keeps its index
new_validation_encoding["Price"] = np.log1p(new_validation["Price"])

# Cast every column to float32
new_validation_encoding = new_validation_encoding.astype("float32")

# Show the info summary of the encoded frame
visualizer.show_df_info(title="new_validation_encoding.info", df=new_validation_encoding)
# ----------------------------------------------------------------------------------------------------


# Title: new_validation_encoding.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 55 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   EV6         750 non-null    float32
 1   ID4         750 non-null    float32
 2   ION5        750 non-null    float32
 3   ION6        750 non-null    float32
 4   IONIQ       750 non-null    float32
 5   KNE         750 non-null    float32
 6   M3          750 non-null    float32
 7   MS          750 non-null    float32
 8   MX          750 non-null    float32
 9   MY          750 non-null    float32
 10  Niro        750 non-null    float32
 11  Q4eT        750 non-null    float32
 12  RSeTGT      750 non-null    float32
 13  Soul        750 non-null    float32
 14  Tay         750 non-null    float32
 15  TayCT       750 non-null    float32
 16  TayGTS      750 non-null    float32
 17  eT          750 non-null    float32
 18  i3 

In [5]:
# ----------------------------------------------------------------------------------------------------
# Save the encoded validation frame as 4_best_validation_encoding.csv, without the index column
best_validation_encoding_csv_path = "C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\d_validation\\4_best_validation_encoding.csv"
new_validation_encoding.to_csv(best_validation_encoding_csv_path, index=False)
# ----------------------------------------------------------------------------------------------------