In [25]:
import numpy as np
import pandas as pd

In [26]:
import torch
import torch.optim as optim
from sklearn import preprocessing
# for softmax
import torch.nn.functional as F  
# For reproducibility
torch.manual_seed(1)

<torch._C.Generator at 0x293bfb5d7d0>

In [27]:
train_df = pd.read_csv("./재배환경별작물종류예측하기_data/train.csv")
test_df = pd.read_csv("./재배환경별작물종류예측하기_data/test.csv")
submit_df = pd.read_csv("./재배환경별작물종류예측하기_data/sample.csv")
print(train_df.head())
print(test_df.head())
print(submit_df.head())

    N    P    K  temperature   humidity        ph    rainfall        label
0  22   36   16    30.581395  50.771481  8.184229   64.585596    mothbeans
1  18   27   41    22.365094  92.308824  7.175344  104.821633  pomegranate
2  61   68   50    35.214628  91.497251  6.793245  243.074507       papaya
3  26   72   22    28.767949  37.577921  4.674942   91.720849   pigeonpeas
4   2  140  197    22.697801  92.822234  5.534567  105.050823        apple
     N    P    K  temperature   humidity        ph    rainfall
0  101   17   47    29.494014  94.729813  6.185053   26.308209
1   98    8   51    26.179346  86.522581  6.259336   49.430510
2   59   62   49    43.360515  93.351916  6.941497  114.778071
3   44   60   55    34.280461  90.555616  6.825371   98.540477
4   30  137  200    22.914300  90.704756  5.603413  118.604465
   id   label
0   0  coffee
1   1  coffee
2   2  coffee
3   3  coffee
4   4  coffee


In [28]:
x_train = torch.FloatTensor(np.array([train_df['N'],train_df['P'],train_df['K'],train_df['temperature'],train_df['humidity'],train_df['ph'],train_df['rainfall']]).T)
## preprocessing LabelEncoding "label"
le = preprocessing.LabelEncoder()
le.fit(train_df['label'])
train_df['label'] = le.transform(train_df['label'])
y_train = torch.LongTensor(train_df['label'])
print(x_train.shape)
print(y_train.shape)

torch.Size([1650, 7])
torch.Size([1650])


## 모델 학습

In [29]:
# 모델 초기화
nb_class = len(le.classes_)
nb_data = len(y_train)

W = torch.zeros((7, nb_class), requires_grad=True)
b = torch.zeros(1, requires_grad=True)
# optimizer 설정
optimizer = optim.SGD([W, b], lr=1e-2)
nb_epochs = 10000
for epoch in range(nb_epochs + 1):
    # Cost 계산 (1)
    hypothesis = F.softmax(x_train.matmul(W) + b, dim=1) # or .mm or @
    
    # cost 표현번 1번 예시
    y_one_hot = torch.zeros(nb_data, nb_class)
    # unsqueeze -> 리스트의 각 원소를 각 리스트로 변환
    y_one_hot.scatter_(1, y_train.unsqueeze(1), 1)
    cost = (y_one_hot * -torch.log(F.softmax(hypothesis, dim=1))).sum(dim=1).mean()
    # cost 표현법 2번 예시
    # cross_entropy를 사용하면 scatter 함수를 이용한 one_hot_encoding을 안해도 됨
    # cost = F.cross_entropy((x_train.matmul(W) + b), y_train)
    # cost로 H(x) 개선
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    # 100번마다 로그 출력
    if epoch % 1000 == 0:
        print('Epoch {:4d}/{} Cost: {:.6f}'.format(
            epoch, nb_epochs, cost.item()
        ))

Epoch    0/10000 Cost: 3.091043
Epoch 1000/10000 Cost: 2.630044
Epoch 2000/10000 Cost: 2.623802
Epoch 3000/10000 Cost: 2.621339
Epoch 4000/10000 Cost: 2.619949
Epoch 5000/10000 Cost: 2.591134
Epoch 6000/10000 Cost: 2.584429
Epoch 7000/10000 Cost: 2.582519
Epoch 8000/10000 Cost: 2.581298
Epoch 9000/10000 Cost: 2.580391
Epoch 10000/10000 Cost: 2.579711


In [35]:
# 학습된 W,b를 통한 클래스 예측
hypothesis = F.softmax(x_train.matmul(W) + b, dim=1) # or .mm or @
#argmax -> 리스트 중 제일 큰 값의 인덱스로 변환 
predict = torch.argmax(hypothesis, dim=1)
print(hypothesis)
print(predict)
print(y_train)
# 정확도 계산 
correct_prediction = predict.float() == y_train
print(correct_prediction)
accuracy = correct_prediction.sum().item() / len(correct_prediction)
print('The model has an accuracy of {:2.2f}% for the training set.'.format(accuracy * 100))

tensor([[4.1520e-08, 5.3696e-07, 1.8948e-03,  ..., 7.2870e-08, 3.9373e-06,
         5.5002e-08],
        [8.3358e-11, 3.0264e-05, 2.5306e-10,  ..., 2.6663e-10, 1.1374e-03,
         1.0734e-10],
        [1.5957e-20, 1.5769e-07, 6.2575e-17,  ..., 1.0238e-19, 8.1013e-01,
         2.0010e-20],
        ...,
        [1.3776e-10, 3.0775e-06, 2.6454e-08,  ..., 2.6785e-10, 2.3566e-04,
         1.0367e-10],
        [1.3423e-30, 8.6118e-04, 0.0000e+00,  ..., 7.3579e-32, 8.6850e-35,
         6.5244e-32],
        [3.6234e-11, 3.5434e-09, 1.0286e-02,  ..., 3.3224e-11, 4.8483e-13,
         3.6235e-11]], grad_fn=<SoftmaxBackward>)
tensor([13, 16, 20,  ..., 12,  7, 10])
tensor([13, 19, 17,  ..., 12,  7, 10])
tensor([ True, False, False,  ...,  True,  True,  True])
The model has an accuracy of 58.91% for the training set.


In [36]:
x_test = torch.FloatTensor(np.array([test_df['N'],test_df['P'],test_df['K'],test_df['temperature'],test_df['humidity'],test_df['ph'],test_df['rainfall']]).T)

In [39]:
hypothesis = F.softmax(x_test.matmul(W) + b, dim=1) # or .mm or @
#argmax -> 리스트 중 제일 큰 값의 인덱스로 변환 
predict = torch.argmax(hypothesis, dim=1)

In [40]:
submit_df['label'] = le.inverse_transform(predict)
submit_df.head()

Unnamed: 0,id,label
0,0,muskmelon
1,1,muskmelon
2,2,mango
3,3,mango
4,4,chickpea


In [42]:
submit_df.to_csv("./재배환경별작물종류예측하기_data/sample.csv",index=False)