# 딜러 팀

In [1]:
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn

In [3]:
train = pd.read_csv('./hyundai_train.csv')
test = pd.read_csv('./hyundai_test.csv')

train.head()
test.head()

Unnamed: 0.1,Unnamed: 0,model,year,price,transmission,km,fuelType,tax,kpl,engineSize
0,1331,I30,2019,24552572,Manual,2833,Petrol,197925,16,1.0
1,2071,Ioniq,2018,24566220,Automatic,25360,Hybrid,184275,27,1.6
2,1489,Tucson,2017,15009960,Manual,58513,Diesel,40950,21,1.7
3,1396,Tucson,2016,16077226,Semi-Auto,107038,Diesel,170625,20,1.7
4,3873,I10,2018,10904672,Manual,50200,Petrol,204750,21,1.0


Unnamed: 0.1,Unnamed: 0,model,year,price,transmission,km,fuelType,tax,kpl,engineSize
0,3339,I10,2020,19079764,Semi-Auto,1920,Petrol,197925,18,1.2
1,1152,Tucson,2018,22901176,Semi-Auto,53648,Petrol,197925,14,1.6
2,68,I10,2014,7506345,Manual,32203,Petrol,27300,21,1.0
3,2914,Tucson,2018,19100236,Manual,40472,Petrol,197925,15,1.6
4,1667,Santa Fe,2020,50483582,Semi-Auto,1576,Diesel,204750,13,2.2


여기서 불필요한 열 제거
맨 앞 열은 train test셋을 나누기 이전의 index들 이므로 제거
tax또한 가격측정에 불필요하다고 생각되어 제거하였다.

In [4]:
train.drop(columns=['tax'], inplace=True)
test.drop(columns=['tax'], inplace=True)
train.drop(columns=['Unnamed: 0'], inplace=True)
test.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
train.head()
test.head()

Unnamed: 0,model,year,price,transmission,km,fuelType,kpl,engineSize
0,I30,2019,24552572,Manual,2833,Petrol,16,1.0
1,Ioniq,2018,24566220,Automatic,25360,Hybrid,27,1.6
2,Tucson,2017,15009960,Manual,58513,Diesel,21,1.7
3,Tucson,2016,16077226,Semi-Auto,107038,Diesel,20,1.7
4,I10,2018,10904672,Manual,50200,Petrol,21,1.0


Unnamed: 0,model,year,price,transmission,km,fuelType,kpl,engineSize
0,I10,2020,19079764,Semi-Auto,1920,Petrol,18,1.2
1,Tucson,2018,22901176,Semi-Auto,53648,Petrol,14,1.6
2,I10,2014,7506345,Manual,32203,Petrol,21,1.0
3,Tucson,2018,19100236,Manual,40472,Petrol,15,1.6
4,Santa Fe,2020,50483582,Semi-Auto,1576,Diesel,13,2.2


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3888 entries, 0 to 3887
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         3888 non-null   object 
 1   year          3888 non-null   int64  
 2   price         3888 non-null   int64  
 3   transmission  3888 non-null   object 
 4   km            3888 non-null   int64  
 5   fuelType      3888 non-null   object 
 6   kpl           3888 non-null   int64  
 7   engineSize    3888 non-null   float64
dtypes: float64(1), int64(4), object(3)
memory usage: 243.1+ KB


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 972 entries, 0 to 971
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         972 non-null    object 
 1   year          972 non-null    int64  
 2   price         972 non-null    int64  
 3   transmission  972 non-null    object 
 4   km            972 non-null    int64  
 5   fuelType      972 non-null    object 
 6   kpl           972 non-null    int64  
 7   engineSize    972 non-null    float64
dtypes: float64(1), int64(4), object(3)
memory usage: 60.9+ KB


각 데이터셋들을 확인해본 결과 모델, transmission, fuelType등이 object형식이므로 이를 바꿔줘야한다.

In [8]:
train['model'].unique()
train['transmission'].unique()
train['fuelType'].unique()

array([' I30', ' Ioniq', ' Tucson', ' I10', ' IX20', ' Kona', ' IX35',
       ' Accent', ' I20', ' Santa Fe', ' I800', ' I40', ' Terracan',
       ' Getz', ' Veloster', ' Amica'], dtype=object)

array(['Manual', 'Automatic', 'Semi-Auto', 'Other'], dtype=object)

array(['Petrol', 'Hybrid', 'Diesel', 'Other'], dtype=object)

In [9]:
train['price'].max()
test['price'].max()

60043936

125560680

각 열을 확인해본 결과 특정 값들로만 이루어져 있으므로 mapping을 통해 문자열이 아닌 숫자들로 변환을 해준다.

또한 price의 값을 예측하고싶은데 예측 범위가 너무 커 nn 모델의 output노드의 갯수가 1억개를 넘게 된다. 그러므로 천만원 단위로 나누는 과정을 수행하여 가격예측을 천만원단위로 수행한다.

In [10]:
model_mapping = {" I30": 1, " Ioniq": 2, " Tucson": 3, " I10": 4, " IX20": 5, " Kona": 6, " IX35": 7, 
                 " Accent": 8, " I20": 9, " Santa Fe": 10, " I800": 11, " I40": 12, " Terracan": 13, 
                 " Getz": 14, " Veloster": 15, " Amica": 16}
trans_mapping = {"Manual": 1, "Automatic": 2, "Semi-Auto": 3, "Other": 4}
fuelT_mapping = {"Petrol": 1, "Hybrid": 2, "Diesel": 3, "Other": 4}

train_test_data= [train, test] 

for dataset in train_test_data:
    dataset['model'] = dataset['model'].map(model_mapping)
    dataset['transmission'] = dataset['transmission'].map(trans_mapping)
    dataset['fuelType'] = dataset['fuelType'].map(fuelT_mapping)
    dataset['price'] = dataset['price']//10000000

In [11]:
train['price'].max()
test['price'].max()

6

12

이제 train데이터 셋과 test 데이터 셋에서 target을 분류해준다.

In [12]:
train_label = train['price']
train_label.head()
test_label = test['price']
test_label.head()

0    2
1    2
2    1
3    1
4    1
Name: price, dtype: int64

0    1
1    2
2    0
3    1
4    5
Name: price, dtype: int64

train 에는 price가 없어야 하므로 price 열 제거

In [13]:
train.drop(columns=['price'], inplace=True)
train
test.drop(columns=['price'], inplace=True)
test

Unnamed: 0,model,year,transmission,km,fuelType,kpl,engineSize
0,1,2019,1,2833,1,16,1.0
1,2,2018,2,25360,2,27,1.6
2,3,2017,1,58513,3,21,1.7
3,3,2016,3,107038,3,20,1.7
4,4,2018,1,50200,1,21,1.0
...,...,...,...,...,...,...,...
3883,3,2017,1,45598,3,21,1.7
3884,3,2016,1,57747,1,15,1.6
3885,3,2016,1,46747,3,16,2.0
3886,3,2017,1,43956,3,21,1.7


Unnamed: 0,model,year,transmission,km,fuelType,kpl,engineSize
0,4,2020,3,1920,1,18,1.2
1,3,2018,3,53648,1,14,1.6
2,4,2014,1,32203,1,21,1.0
3,3,2018,1,40472,1,15,1.6
4,10,2020,3,1576,3,13,2.2
...,...,...,...,...,...,...,...
967,3,2017,1,31040,3,21,1.7
968,9,2016,1,48264,1,19,1.2
969,3,2018,1,29476,1,12,1.6
970,1,2018,1,26451,1,20,1.0


input으로는 총 7가지의 값들이 들어오니 input size 7
price의 최대값이 12 이므로 총 13가지의 경우의 수가 나온다. 그러므로 num_classes 즉 아웃풋의 결과는 13

In [14]:
input_size = 7
hidden_size = 100
num_classes = 13
num_epochs = 100
learning_rate = 0.001

In [15]:
class NeuralNet(nn.Module): 
    
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    # 모델의 Forward Path를 정의
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
    
        return out

In [16]:
X_train = torch.FloatTensor(train.values)
y_train = torch.LongTensor(train_label.values)
X_test = torch.FloatTensor(test.values)
y_test = torch.LongTensor(test_label.values)

In [17]:
X_train
y_train

tensor([[1.0000e+00, 2.0190e+03, 1.0000e+00,  ..., 1.0000e+00, 1.6000e+01,
         1.0000e+00],
        [2.0000e+00, 2.0180e+03, 2.0000e+00,  ..., 2.0000e+00, 2.7000e+01,
         1.6000e+00],
        [3.0000e+00, 2.0170e+03, 1.0000e+00,  ..., 3.0000e+00, 2.1000e+01,
         1.7000e+00],
        ...,
        [3.0000e+00, 2.0160e+03, 1.0000e+00,  ..., 3.0000e+00, 1.6000e+01,
         2.0000e+00],
        [3.0000e+00, 2.0170e+03, 1.0000e+00,  ..., 3.0000e+00, 2.1000e+01,
         1.7000e+00],
        [4.0000e+00, 2.0190e+03, 1.0000e+00,  ..., 1.0000e+00, 2.0000e+01,
         1.2000e+00]])

tensor([2, 2, 1,  ..., 2, 1, 1])

In [18]:
model = NeuralNet(input_size, hidden_size, num_classes)

loss_function = nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

loss_list = []

for epoch in range(num_epochs):

    pred = model(X_train)
    loss = loss_function(pred, y_train)
    
    loss_list.append(loss.item())

    optimizer.zero_grad()
    loss.backward()

    optimizer.step()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch : [{epoch + 1}/{num_epochs}], Loss : {loss.item():.4f}")

Epoch : [10/100], Loss : 679.4996
Epoch : [20/100], Loss : 349.3359
Epoch : [30/100], Loss : 116.6691
Epoch : [40/100], Loss : 94.7312
Epoch : [50/100], Loss : 102.1063
Epoch : [60/100], Loss : 51.3146
Epoch : [70/100], Loss : 73.7595
Epoch : [80/100], Loss : 38.7705
Epoch : [90/100], Loss : 24.6193
Epoch : [100/100], Loss : 27.8777


In [19]:
with torch.no_grad():
    correct = 0
    total = 0

    for features, labels in zip(X_test, y_test):
        outputs = model(features)
        total += 1
        correct += (torch.argmax(outputs) == labels).sum().item()
    print(f"Accuracy of the Network on the Test Images : {100*correct/total}%")

Accuracy of the Network on the Test Images : 51.02880658436214%


여러가지 hidden node의 갯수와 learning_rate를 확인해 본 결과 learning_rate가 여기서 더 작아지면 30퍼센트정도로 정확도가 많이 낮아지고 hideen node의 갯수의 변화에 따라 정확도는 크게 달라지는것을 확인 못하였다.

우리가 여러번의 수행결과 지금의 결과가 최고수치였다.