In [1]:
# ----------------------------------------------------------------------------------------------------
# 라이브러리 목록

# 기본 라이브러리 
import numpy as np
import pandas as pd

# sklearn 라이브러리
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsRegressor

# 개인 라이브러리 
from preparation_for_analysis.show_window import DataVisualizer
from preparation_for_analysis.encoding import DataFrameEncoder

# pandas display settings: remove both truncation limits so that printed
# frames show every row and every column.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

# preparation_for_analysis setup: one shared DataVisualizer instance, used by
# the cells below (via show_df_info) to print DataFrame summaries.
# NOTE(review): line / length / start presumably configure the separator
# banner printed around each report - confirm against DataVisualizer.
visualizer = DataVisualizer(line="=", length=100, start="#")
# ----------------------------------------------------------------------------------------------------

In [4]:
# ----------------------------------------------------------------------------------------------------
# Load best_learning from CSV and print its summary.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable project data directory.
best_learning_csv_path = "C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\c_learning\\4_best_learning.csv"

# Comma-separated file whose first row holds the column names.
csv_read_options = {"sep": ",", "header": 0}
best_learning = pd.read_csv(best_learning_csv_path, **csv_read_options)

visualizer.show_df_info(df=best_learning, title="best_learning.info")
# ----------------------------------------------------------------------------------------------------


# Title: best_learning.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5398 entries, 0 to 5397
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Model         5398 non-null   object 
 1   Manufacturer  5398 non-null   object 
 2   Model_year    5398 non-null   object 
 3   Drivetrain    5398 non-null   object 
 4   Warranty      5398 non-null   object 
 5   Accident      5398 non-null   object 
 6   Condition     5398 non-null   object 
 7   Battery       5398 non-null   float64
 8   Mileage       5398 non-null   int64  
 9   knn           5398 non-null   float64
 10  knn_2         5398 non-null   float64
 11  Price         5398 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 506.2+ KB







In [5]:
# ----------------------------------------------------------------------------------------------------
# Build best_learning_encoding, the encoded counterpart of best_learning.
# (The "Price" column is log(y+1)-transformed further down in this cell.)

# One row per categorical column:
#   (column name, ascending_order flag, sort_by_number flag)
categorical_specs = [
    ("Model",        True, False),
    ("Manufacturer", True, False),
    ("Model_year",   True, True),
    ("Drivetrain",   True, False),
    ("Warranty",     True, True),
    ("Accident",     True, False),
    ("Condition",    True, False),
]

# Split the table back into the three parallel lists the encoder expects.
encoded_columns, ascending_flags, numeric_sort_flags = (
    list(field) for field in zip(*categorical_specs)
)

# Set up the one-hot encoder over the categorical columns.
best_learning_encoder = DataFrameEncoder(
    df=best_learning,
    columns=encoded_columns,
    ascending_order=ascending_flags,
    sort_by_number=numeric_sort_flags,
)

# Run the one-hot encoding; the result becomes best_learning_encoding.
best_learning_encoding = best_learning_encoder.fit_transform(encoding_type="onehot")

# Min-max normalisation bounds, read from the un-encoded frame.
# They are kept as individual module-level names in case other cells of the
# notebook reference them.
battery_max = best_learning["Battery"].max()
battery_min = best_learning["Battery"].min()
mileage_max = best_learning["Mileage"].max()
mileage_min = best_learning["Mileage"].min()
knn_max = best_learning["knn"].max()
knn_min = best_learning["knn"].min()
knn_2_max = best_learning["knn_2"].max()
knn_2_min = best_learning["knn_2"].min()


def min_max_scale(values, lower, upper):
    """Linearly rescale ``values`` so that ``lower`` maps to 0 and ``upper`` to 1.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to rescale.
    lower, upper : scalar
        Bounds used for the rescaling (here: the column's min and max).

    Returns
    -------
    pandas.Series
        ``(values - lower) / (upper - lower)`` computed on the whole column
        at once. When ``upper == lower`` the float offsets ``values - lower``
        are returned instead, so a constant column becomes zeros rather than
        NaN from a 0/0 division.
    """
    offset = values - lower
    span = upper - lower
    if span == 0:
        # Zero range: dividing would turn every entry into NaN (0/0).
        return offset.astype("float64")
    return offset / span


# Min-max normalisation of the numeric feature columns, written into the
# encoded frame. One vectorised helper replaces four copy-pasted lambdas.
minmax_bounds = {
    "Battery": (battery_min, battery_max),
    "Mileage": (mileage_min, mileage_max),
    "knn": (knn_min, knn_max),
    "knn_2": (knn_2_min, knn_2_max),
}
for feature_name, (lower_bound, upper_bound) in minmax_bounds.items():
    best_learning_encoding[feature_name] = min_max_scale(
        best_learning[feature_name], lower_bound, upper_bound
    )

# log(y+1) transform of the target column, applied to the whole Series.
best_learning_encoding["Price"] = np.log1p(best_learning["Price"])

# Cast every column of best_learning_encoding to float32.
best_learning_encoding = best_learning_encoding.astype("float32")

# Print the summary of the encoded frame.
visualizer.show_df_info(title="best_learning_encoding.info", df=best_learning_encoding)
# ----------------------------------------------------------------------------------------------------


# Title: best_learning_encoding.info


# DataFrame Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5398 entries, 0 to 5397
Data columns (total 55 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   EV6         5398 non-null   float32
 1   ID4         5398 non-null   float32
 2   ION5        5398 non-null   float32
 3   ION6        5398 non-null   float32
 4   IONIQ       5398 non-null   float32
 5   KNE         5398 non-null   float32
 6   M3          5398 non-null   float32
 7   MS          5398 non-null   float32
 8   MX          5398 non-null   float32
 9   MY          5398 non-null   float32
 10  Niro        5398 non-null   float32
 11  Q4eT        5398 non-null   float32
 12  RSeTGT      5398 non-null   float32
 13  Soul        5398 non-null   float32
 14  Tay         5398 non-null   float32
 15  TayCT       5398 non-null   float32
 16  TayGTS      5398 non-null   float32
 17  eT          5398 non-null   float32
 18  i3

In [6]:
# ----------------------------------------------------------------------------------------------------
# Save best_learning_encoding as CSV, without writing the row index.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable project data directory.
encoding_csv_path = "C:\\Users\\ssalt\\Documents\\ev_price_predict_project\\data\\train\\A_df\\c_learning\\5_best_learning_encoding.csv"
best_learning_encoding.to_csv(encoding_csv_path, index=False)
# ----------------------------------------------------------------------------------------------------