In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
# Show every column when a wide DataFrame is displayed (None disables truncation).
pd.set_option("display.max_columns", None)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,r2_score,mean_absolute_error,mean_squared_error
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import xgboost
import joblib
%matplotlib inline

In [3]:
# Read the used-car dataset from a CSV located relative to the working directory.
df = pd.read_csv('usedcar_total_0608.csv')

In [4]:
# Summarize column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89890 entries, 0 to 89889
Data columns (total 32 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   car_url        89890 non-null  int64  
 1   car_area       89890 non-null  object 
 2   car_no         89890 non-null  object 
 3   car_brand      89890 non-null  object 
 4   nation         89890 non-null  object 
 5   car_model      89890 non-null  object 
 6   car_name       89890 non-null  object 
 7   name_detailed  89890 non-null  object 
 8   new_price      89890 non-null  int64  
 9   price          89890 non-null  int64  
 10  year           89890 non-null  int64  
 11  use            89890 non-null  int64  
 12  depreciation   89890 non-null  int64  
 13  mileage        89890 non-null  int64  
 14  fuel           89890 non-null  object 
 15  forecast_min   89890 non-null  int64  
 16  forecast_max   89890 non-null  int64  
 17  car_type       89890 non-null  object 
 18  car_cc

### 유튜브데이터와 연결성 확보를 위해서 'X'에서 'car_name'을 'car_model'로 통일시켜서 다시 진행했습니다

In [5]:
# Feature frame: four numeric columns (use, mileage, year, change) plus ten
# categorical columns that get one-hot encoded in a later cell.
X = df[[
    'use', 'nation', 'car_brand', 'car_model', 'mileage', 'year', 'car_type',
    'fuel', 'trans', 'loss', 'flood', 'usage', 'change', 'insurance',
]]
# Target: kept as a single-column DataFrame (double brackets), not a Series.
Y = df[['depreciation']]

In [6]:
# Display the selected feature frame.
X

Unnamed: 0,use,nation,car_brand,car_model,mileage,year,car_type,fuel,trans,loss,flood,usage,change,insurance
0,103,국산,한국GM,올란도,131493,2013,RV,LPG,오토,없음,없음,없음,1,있음
1,52,국산,한국GM,트랙스,60000,2018,SUV,디젤,오토,없음,없음,있음,2,없음
2,57,국산,한국GM,스파크,133319,2017,경차,가솔린,오토,없음,없음,없음,1,없음
3,48,국산,한국GM,크루즈,90000,2017,준중형,가솔린,오토,없음,없음,있음,1,있음
4,131,국산,한국GM,알페온,159474,2011,대형,가솔린,오토,없음,없음,없음,1,있음
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89885,188,국산,기아,카니발,196000,2006,RV,디젤,오토,없음,없음,없음,7,있음
89886,141,국산,기아,K5,83844,2011,중형,LPG,오토,없음,없음,없음,3,있음
89887,143,국산,기아,봉고,215000,2010,트럭,디젤,수동,없음,없음,없음,2,없음
89888,134,국산,기아,모하비,83386,2011,SUV,디젤,오토,없음,없음,없음,2,없음


In [7]:
# One-hot encode the categorical features; columns not listed here
# (use, mileage, year, change) are passed through unchanged.
encoded_X = pd.get_dummies(
    X,
    columns=[
        'nation', 'car_brand', 'car_model', 'car_type', 'fuel',
        'trans', 'loss', 'flood', 'usage', 'insurance',
    ],
)

In [8]:
# Load a previously saved scaler object and apply it to the encoded features.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load files that come from a trusted source.
# NOTE(review): presumably this scaler was fitted on a frame with exactly the same
# one-hot column layout as encoded_X -- confirm before relying on the result.
scaler = joblib.load('s_scale_0608.pkl')
scaled_X= scaler.transform(encoded_X)

참고: joblib/pickle 파일은 불러올 때 임의의 코드를 실행할 수 있으므로 신뢰할 수 있는 출처의 파일만 로드해야 합니다. 자세한 내용: https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [9]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X,
    Y,
    test_size=0.25,
    random_state=5,
)

In [10]:
# Gradient-boosted regressor: 200 trees with a learning rate of 0.5.
xgb_reg = xgboost.XGBRegressor(n_estimators=200, learning_rate=0.5)

In [None]:
# Fit on the training split; the single-column target frame is flattened to 1-D first.
xgb_reg.fit(X_train, y_train.to_numpy().ravel())

In [None]:
# When swapping in a different estimator, change the model name used below.

# Predict the target for the held-out test features.
xgb_reg_predict = xgb_reg.predict(X_test)

In [None]:
# When swapping in a different estimator, change the model name used below
# (and in any acc.append bookkeeping).

# A regressor's .score() returns the coefficient of determination (R^2), not a
# classification accuracy, so the output is labelled accordingly.
print(f'train R2 : {xgb_reg.score(X_train, y_train)}')
print(f'test R2 : {xgb_reg.score(X_test, y_test)}')
print(f'MAE : {mean_absolute_error(y_test, xgb_reg_predict)}')
print(f'MSE : {mean_squared_error(y_test, xgb_reg_predict)}')

In [None]:
# Compare the distribution of actual vs. predicted depreciation on the test split.
# sns.distplot is deprecated; with hist=False it drew only a KDE curve, so
# kdeplot is the direct replacement.
fig, ax1 = plt.subplots(figsize=(8, 8))

# y_test is a one-column DataFrame; flatten it so it is plotted as one vector.
sns.kdeplot(np.ravel(y_test), color="r", label="Actual Value", ax=ax1)
sns.kdeplot(np.ravel(xgb_reg_predict), color="b", label="Predict Values", ax=ax1)

ax1.set_title('Actual vs Predict Values for Depreciation ')
ax1.set_xlabel('Usedcar Depreciation  rate')
ax1.set_ylabel('Proportion of Cars')
# Draw the legend explicitly so the two curve labels are visible.
ax1.legend()

plt.show()
plt.close(fig)

## 모델저장하고 불러오기

In [None]:
# Persist the fitted regressor to a file in the working directory.
joblib.dump(xgb_reg, 'xgb_reg_0608.pkl')

In [None]:
# Label-based slice up to and including label 0: shows the first encoded row.
encoded_X.loc[:0]

In [None]:
# One-row template that carries the full one-hot column layout.
# Copy the slice so later in-place edits operate on an independent frame and
# never alias (or warn about) encoded_X.
searh_car = encoded_X.loc[:0].copy()
searh_car

In [None]:
# Save the one-row template (without the index column) to CSV.
searh_car.to_csv('search_vector_0608.csv', index = False)

## 서비스 구현을 위한 입력변수 받아서 예측하기

In [None]:
# Label-based slice up to and including label 1: shows the first two encoded rows.
encoded_X.loc[:1]

In [None]:
# Rebuild the one-row template for the interactive prediction.
# Copy the slice: the following cells overwrite its values, and writing into an
# un-copied slice of encoded_X raises SettingWithCopyWarning (or silently does
# nothing when pandas copy-on-write is enabled).
searh_car = encoded_X.loc[:0].copy()
searh_car

In [None]:
# Inspect the template's columns and dtypes before its values are reset.
searh_car.info()

In [None]:
# Reset every value in the template row to zero.
# Own the data first so the writes below cannot target a slice of encoded_X.
searh_car = searh_car.copy()
for i, column in enumerate(searh_car.columns):
    # Chained indexing (frame[col][0] = v) writes to a temporary object and is
    # unreliable; .loc assigns directly into the frame.
    if i <= 3:
        # Positions 0-3: presumably the pass-through numeric columns
        # (use, mileage, year, change) that get_dummies places first -- verify.
        searh_car.loc[0, column] = 0
    else:
        # Remaining columns are the one-hot indicators.
        searh_car.loc[0, column] = np.uint8(0)

In [None]:
# Inspect the template again after the reset.
searh_car.info()

## 비교를 위한 하나의 예시 가져오기

In [None]:
# Show the scaled feature vector and the target at position 66 of the test split,
# one per line.
print(X_test[66], y_test.iloc[66], sep='\n')

In [None]:
# Show the raw record with row label 40604 as a one-row DataFrame.
# NOTE(review): presumably 40604 is the original row label of test sample 66
# (i.e. y_test.index[66]) -- confirm, since the value is hard-coded.
df.loc[40604:40604]

## 변수 입력받기

In [None]:
# --- Numeric answers -------------------------------------------------------
# The new-car price is collected so the predicted depreciation rate can later be
# turned into a residual price.
new_car = int(input('신차가격을 숫자만 입력해주세요(단위 : 만원)(ex 7000) :'))
use = int(input('사용개월 수를 숫자만 작성해주세요(ex 1년 >> 12) : '))
mileage = int(input('주행거리를 숫자만 입력해주세요(단위 : km)(ex 50000) : '))
year = int(input('차량의 연식을 4자리 형태의 숫자만 입력해주세요(ex 2018) : '))
change = int(input('소유주 변경횟수를 숫자만 입력해주세요(ex 3) : '))

# --- Categorical answers ---------------------------------------------------
# Each prompt lists the values present in the dataset; brand choices are narrowed
# by the chosen nation, and model choices by the chosen brand.
nation = input(f"{df['nation'].unique()} 에서 선택해주세요 : ")
car_brand = input(f"{df.loc[df['nation'] == nation, 'car_brand'].unique()} 에서 선택해주세요 : ")
car_model = input(f"{df.loc[df['car_brand'] == car_brand, 'car_model'].unique()} 에서 선택해주세요 : ")
car_type = input(f"{df['car_type'].unique()} 에서 선택해주세요 : ")
fuel = input(f"{df['fuel'].unique()} 에서 선택해주세요 : ")
trans = input(f"{df['trans'].unique()} 에서 선택해주세요 : ")
loss = input(f"전손이력을 {df['loss'].unique()} 에서 선택해주세요 : ")
flood = input(f"침수이력을 {df['flood'].unique()} 에서 선택해주세요 : ")
usage = input(f"용도이력을 {df['usage'].unique()} 에서 선택해주세요 : ")
insurance = input(f"보험사고정보를 {df['insurance'].unique()} 에서 선택해주세요 : ")

In [None]:
# Write the user's answers into the one-row template.
searh_car['use'] = use
searh_car['mileage'] = mileage
searh_car['year'] = year
searh_car['change'] = change

# Switch on the one-hot indicator matching each categorical answer.
# Each answer must map to a column that already exists: assigning to a missing
# column would silently append a brand-new feature and break the column layout
# the scaler and model were built on, so unknown values are rejected instead.
for prefix, choice in [
    ('nation', nation),
    ('car_brand', car_brand),
    ('car_model', car_model),
    ('car_type', car_type),
    ('fuel', fuel),
    ('trans', trans),
    ('loss', loss),
    ('flood', flood),
    ('usage', usage),
    ('insurance', insurance),
]:
    dummy_col = f'{prefix}_{choice}'
    if dummy_col not in searh_car.columns:
        raise ValueError(
            f'Unknown {prefix} value {choice!r}: no column {dummy_col!r} in the encoded features'
        )
    searh_car[dummy_col] = np.uint8(1)

In [None]:
# Sanity check: print every column whose value in the template row is non-zero,
# then the number of such columns.
# ('change' is the number of owner changes.)
# The original iterated over `sample`, which is never defined in this notebook;
# the frame being inspected is searh_car itself.
selected = 0
for col in searh_car.columns:
    # .loc avoids chained indexing when reading the single row.
    if searh_car.loc[0, col] != 0:
        selected += 1
        print(col)
print(selected)

In [None]:
# Scale the filled-in template with the scaler loaded earlier in this notebook.
# (The original referenced `s_scale`, which is never defined here; the loaded
# scaler is bound to `scaler`.)
scaled_searh_car = scaler.transform(searh_car)

In [None]:
# Check that the hand-built scaled vector matches test sample 66 element by element
# (True only if every element is equal).
(scaled_searh_car == X_test[66]).all()

In [None]:
# `model` is read by the prediction step but is never assigned on a fresh kernel;
# bind it to the regressor fitted earlier in this notebook, then display it.
model = xgb_reg
model

In [None]:
# Predict the depreciation rate (%) for the user-described car with the regressor
# fitted in this notebook. (The original called `model`, which is never assigned
# on a fresh kernel.)
model_predict = xgb_reg.predict(scaled_searh_car)
model_dep = round(model_predict[0].astype('float64'), 1)
# NOTE(review): the original also assigned `model_dep - 4` to a misspelled,
# never-read name (`moel_dep`). It had no effect and was removed; if a -4 point
# adjustment was intended, apply it to model_dep explicitly.
print(f'Model Depreciation Predict : {model_dep}% 감가')

# Reference value from the dataset row with label 40604.
print(f"Actual Depreciation : {df['depreciation'][40604]}% 감가")

In [None]:
# Turn the predicted depreciation rate into a price band (same unit as new_car):
# the lower bound rounds the rate up to a whole percent, the upper bound rounds
# it down.
low_price = math.trunc(new_car * (100 - int(math.ceil(model_dep))) / 100)
high_price = math.ceil(new_car * (100 - int(math.trunc(model_dep))) / 100)
model_price = [low_price, high_price]

print(f'XGBoost Usedcar Price Predict : {model_price[0]} ~ {model_price[1]} 만원입니다.')
# Reference values from the dataset row with label 40604.
print(f"KB차차차 Usedcar Price Predict : {df.loc[40604, 'forecast_min']} ~ {df.loc[40604, 'forecast_max']} 만원입니다.")
print(f"Actual Usedcar Price : {df.loc[40604, 'price']} 만원입니다.")