In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2022-ai-w4p2/sample.csv
/kaggle/input/2022-ai-w4p2/train.csv
/kaggle/input/2022-ai-w4p2/test.csv


In [2]:
import random
import torch
import torch.optim as optim
import torch.nn as nn

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
train = pd.read_csv('../input/2022-ai-w4p2/train.csv')
train

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,22,36,16,30.581395,50.771481,8.184229,64.585596,mothbeans
1,18,27,41,22.365094,92.308824,7.175344,104.821633,pomegranate
2,61,68,50,35.214628,91.497251,6.793245,243.074507,papaya
3,26,72,22,28.767949,37.577921,4.674942,91.720849,pigeonpeas
4,2,140,197,22.697801,92.822234,5.534567,105.050823,apple
...,...,...,...,...,...,...,...,...
1645,10,5,5,21.213070,91.353492,7.817846,112.983436,orange
1646,108,94,47,27.359116,84.546250,6.387431,90.812505,banana
1647,11,36,31,27.920633,51.779659,6.475449,100.258567,mango
1648,11,124,204,13.429886,80.066340,6.361141,71.400430,grapes


In [4]:
x_train = train.iloc[:,:-1]
y_train = train.iloc[:,-1]

In [5]:
print(x_train.shape)
print(y_train.shape)

(1650, 7)
(1650,)


# label 개수 파악하기

In [6]:
y_train.value_counts()

lentil         85
orange         81
grapes         81
watermelon     79
cotton         79
muskmelon      78
banana         77
maize          77
blackgram      76
kidneybeans    76
rice           75
coffee         74
mothbeans      74
mungbean       74
coconut        73
mango          73
jute           73
apple          71
pomegranate    70
chickpea       69
papaya         69
pigeonpeas     66
Name: label, dtype: int64

In [7]:
np.shape(y_train.unique())

(22,)

# 데이터 정규화

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_train

array([[-7.74026344e-01, -5.22604462e-01, -6.28724908e-01, ...,
        -9.51256909e-01,  2.19982609e+00, -6.86012983e-01],
       [-8.81256962e-01, -7.93767155e-01, -1.37942559e-01, ...,
         9.39856699e-01,  9.02028907e-01,  4.18192366e-02],
       [ 2.71472181e-01,  4.41529556e-01,  3.87390868e-02, ...,
         9.02907360e-01,  4.10508789e-01,  2.54268422e+00],
       ...,
       [-1.06891054e+00, -5.22604462e-01, -3.34255499e-01, ...,
        -9.05356554e-01,  1.70580458e-03, -4.07223532e-02],
       [-1.06891054e+00,  2.12876409e+00,  3.06195836e+00, ...,
         3.82480441e-01, -1.45336714e-01, -5.62739020e-01],
       [-5.05949799e-01,  7.42821437e-01, -5.10937144e-01, ...,
        -4.23855858e-01,  6.85524273e-01, -8.88186049e-01]])

# X: tensor로 바꾸기

In [9]:
x_train = torch.FloatTensor(x_train)
x_train

tensor([[-7.7403e-01, -5.2260e-01, -6.2872e-01,  ..., -9.5126e-01,
          2.1998e+00, -6.8601e-01],
        [-8.8126e-01, -7.9377e-01, -1.3794e-01,  ...,  9.3986e-01,
          9.0203e-01,  4.1819e-02],
        [ 2.7147e-01,  4.4153e-01,  3.8739e-02,  ...,  9.0291e-01,
          4.1051e-01,  2.5427e+00],
        ...,
        [-1.0689e+00, -5.2260e-01, -3.3426e-01,  ..., -9.0536e-01,
          1.7058e-03, -4.0722e-02],
        [-1.0689e+00,  2.1288e+00,  3.0620e+00,  ...,  3.8248e-01,
         -1.4534e-01, -5.6274e-01],
        [-5.0595e-01,  7.4282e-01, -5.1094e-01,  ..., -4.2386e-01,
          6.8552e-01, -8.8819e-01]])

# y 문자 -> 숫자로 바꿔주기

In [10]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_train_encoded

array([13, 19, 17, ..., 12,  7, 10])

# 원핫인코딩

In [11]:
y_train_encoded = torch.LongTensor(y_train_encoded)
y_train_encoded

tensor([13, 19, 17,  ..., 12,  7, 10])

In [12]:
nb_class = 22
nb_data = len(y_train_encoded)

y_one_hot = torch.zeros(nb_data,nb_class)
y_one_hot.scatter_(1, y_train_encoded.unsqueeze(1), 1)

print(y_one_hot)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


# 모델 학습하기

In [13]:
#for softmax
import torch.nn.functional as F

#모델 초기화
#feature = 7, class = 22
nb_class = 22
nb_data = len(y_train)

W = torch.zeros([7,nb_class],requires_grad=True)
b = torch.zeros(1,requires_grad=True)

#optimizer 설정
optimizer = optim.Adam([W,b],lr=1e-3)

nb_epochs=30000
for epoch in range(nb_epochs+1):
    
    #H(x)계산
    hypothesis = F.softmax(x_train.matmul(W)+b,dim=1)   
    
    # cross_entropy를 사용하면 scatter함수를 이용한 one_hot_encoding 안해도 됨
    cost = F.cross_entropy((x_train.matmul(W)+b),y_train_encoded)
    
    #cost로 H(x) 개선
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()
    
    #100번마다 로그 출력
    if epoch % 10000 == 0:
        print('Epoch {:4d}/{} Cost: {:.6f}'.format(
            epoch, nb_epochs, cost.item()
        ))

Epoch    0/30000 Cost: 3.091042
Epoch 10000/30000 Cost: 0.126731
Epoch 20000/30000 Cost: 0.059008
Epoch 30000/30000 Cost: 0.046098


In [14]:
# 학습된 W,b를 통한 클래스 예측
hypothesis = F.softmax(x_train.matmul(W)+b,dim=1)
predict = torch.argmax(hypothesis,dim=1) #index 반환
correct_prediction = (predict == y_train_encoded).sum() / len(y_train_encoded)

print(hypothesis)
print(predict)
print('accuracy: {:.6f}%'.format(correct_prediction*100))

tensor([[2.0164e-34, 4.1096e-41, 1.1676e-12,  ..., 1.7416e-20, 2.5790e-35,
         6.7233e-37],
        [3.6631e-14, 6.3870e-35, 6.1162e-20,  ..., 9.9996e-01, 2.4162e-24,
         6.4960e-19],
        [7.3791e-09, 9.5852e-18, 3.8133e-24,  ..., 1.9338e-43, 6.2361e-09,
         0.0000e+00],
        ...,
        [1.4717e-27, 1.0331e-34, 4.6723e-16,  ..., 1.5985e-16, 2.0818e-32,
         1.0302e-30],
        [2.7183e-05, 1.5337e-30, 0.0000e+00,  ..., 1.3664e-18, 4.5486e-37,
         6.9087e-24],
        [4.2144e-19, 9.2806e-24, 3.0201e-03,  ..., 1.0398e-21, 6.4953e-31,
         6.5933e-29]], grad_fn=<SoftmaxBackward0>)
tensor([13, 19, 17,  ..., 12,  7, 10])
accuracy: 98.121216%


# 데이터 평가

In [15]:
test = pd.read_csv('../input/2022-ai-w4p2/test.csv')
test

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
0,101,17,47,29.494014,94.729813,6.185053,26.308209
1,98,8,51,26.179346,86.522581,6.259336,49.430510
2,59,62,49,43.360515,93.351916,6.941497,114.778071
3,44,60,55,34.280461,90.555616,6.825371,98.540477
4,30,137,200,22.914300,90.704756,5.603413,118.604465
...,...,...,...,...,...,...,...
545,100,32,26,25.234661,57.531615,6.043486,124.226174
546,50,46,52,31.182984,90.216469,6.734006,54.018724
547,30,65,82,20.714244,15.278241,7.103798,76.778887
548,93,26,27,24.592457,56.468296,7.288212,137.704405


## 정규화

In [16]:
x_test = scaler.transform(test)
x_test

array([[ 1.34377836, -1.09505904, -0.0201548 , ...,  1.05007959,
        -0.37185066, -1.37841507],
       [ 1.2633554 , -1.36622173,  0.05837038, ...,  0.67642042,
        -0.27629569, -0.9601543 ],
       [ 0.21785687,  0.26075443,  0.01910779, ...,  0.98734665,
         0.60121475,  0.22192187],
       ...,
       [-0.55956511,  0.35114199,  0.66694049, ..., -2.56719443,
         0.80999403, -0.46544778],
       [ 1.12931713, -0.82389634, -0.41278067, ..., -0.69189211,
         1.04721835,  0.63663777],
       [ 0.32508749, -0.19118339, -0.25573032, ...,  0.7917565 ,
         1.23317088,  1.65276317]])

In [17]:
x_test = torch.FloatTensor(x_test)
x_test

tensor([[ 1.3438, -1.0951, -0.0202,  ...,  1.0501, -0.3719, -1.3784],
        [ 1.2634, -1.3662,  0.0584,  ...,  0.6764, -0.2763, -0.9602],
        [ 0.2179,  0.2608,  0.0191,  ...,  0.9873,  0.6012,  0.2219],
        ...,
        [-0.5596,  0.3511,  0.6669,  ..., -2.5672,  0.8100, -0.4654],
        [ 1.1293, -0.8239, -0.4128,  ..., -0.6919,  1.0472,  0.6366],
        [ 0.3251, -0.1912, -0.2557,  ...,  0.7918,  1.2332,  1.6528]])

In [18]:
hypothesis = F.softmax(x_test.matmul(W)+b,dim=1)
hypothesis[:3]

tensor([[2.8515e-31, 1.2399e-12, 8.9860e-37, 2.5223e-44, 5.5708e-28, 5.6156e-17,
         4.0715e-10, 8.3862e-30, 1.1915e-14, 0.0000e+00, 3.2545e-26, 1.1052e-22,
         3.1783e-31, 4.7685e-25, 1.3466e-18, 1.0000e+00, 1.7519e-26, 1.2544e-21,
         0.0000e+00, 4.2999e-15, 3.4802e-20, 5.1325e-08],
        [1.0239e-35, 2.5879e-17, 1.5416e-39, 7.7898e-37, 7.0414e-25, 1.4203e-11,
         5.9057e-09, 1.1963e-34, 3.4540e-13, 2.8026e-45, 1.9371e-30, 9.4231e-18,
         2.9145e-24, 1.2627e-22, 9.5651e-23, 9.5960e-05, 1.1526e-24, 1.2564e-27,
         0.0000e+00, 1.5082e-13, 4.4199e-17, 9.9990e-01],
        [1.4940e-12, 8.9270e-16, 2.9808e-25, 0.0000e+00, 2.5920e-27, 2.7545e-35,
         6.1964e-36, 3.4438e-21, 7.0252e-23, 0.0000e+00, 2.4609e-34, 0.0000e+00,
         4.3221e-37, 1.3374e-37, 1.0827e-24, 4.8588e-14, 5.9947e-33, 1.0000e+00,
         1.5390e-32, 4.4742e-36, 6.1050e-26, 0.0000e+00]],
       grad_fn=<SliceBackward0>)

In [19]:
# 학습된 W,b를 통한 클래스 예측
predict = torch.argmax(hypothesis,dim=1) #index 반환

print(predict[:3])

tensor([15, 21, 17])


In [20]:
hypothesis = encoder.inverse_transform(predict.detach().numpy())
hypothesis[:3]

array(['muskmelon', 'watermelon', 'papaya'], dtype=object)

# 제출용 파일 만들기

In [21]:
prediction = pd.DataFrame(hypothesis,columns=['label'])
prediction

Unnamed: 0,label
0,muskmelon
1,watermelon
2,papaya
3,papaya
4,apple
...,...
545,coffee
546,papaya
547,chickpea
548,coffee


In [22]:
submit = pd.read_csv('../input/2022-ai-w4p2/sample.csv')
submit

Unnamed: 0,id,label
0,0,coffee
1,1,coffee
2,2,coffee
3,3,coffee
4,4,coffee
...,...,...
545,545,coffee
546,546,coffee
547,547,coffee
548,548,coffee


In [23]:
submit.update(prediction)
submit

Unnamed: 0,id,label
0,0,muskmelon
1,1,watermelon
2,2,papaya
3,3,papaya
4,4,apple
...,...,...
545,545,coffee
546,546,papaya
547,547,chickpea
548,548,coffee


In [24]:
submit.to_csv('submit.csv',index=False)
submit

Unnamed: 0,id,label
0,0,muskmelon
1,1,watermelon
2,2,papaya
3,3,papaya
4,4,apple
...,...,...
545,545,coffee
546,546,papaya
547,547,chickpea
548,548,coffee
