In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/2022-ai-w3p1/train.csv
/kaggle/input/2022-ai-w3p1/test.csv
/kaggle/input/2022-ai-w3p1/submit_sample.csv


# X와 y csv 파일 설명

X (입력 변수): (1) 배추 가격이 기록된 날짜(yyyymmdd) (2) 평균기온 (3) 최저기온 (4) 최고기온 (5) 강수량

y (예측 대상): 해당 날짜의 배추 평균 가격


In [2]:
import random
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F


# Seed every random number generator used in this notebook with the same value
# so that repeated runs start from the same random state.
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

# The CUDA generators are seeded only when a GPU is actually present.
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Ask cuDNN for deterministic kernels and switch off its benchmark-based auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## 데이터셋 만들기

pandas의 주요 자료구조: DataFrame(2차원 표), Series(1차원 열)

In [3]:
# Read the training CSV into a DataFrame and display it.
train_csv_path = '../input/2022-ai-w3p1/train.csv'
train = pd.read_csv(train_csv_path)
train

Unnamed: 0,year,avgTemp,minTemp,maxTemp,rainFall,avgPrice
0,20100101,-4.9,-11.0,0.9,0.0,2123
1,20100102,-3.1,-5.5,5.5,0.8,2123
2,20100103,-2.9,-6.9,1.4,0.0,2123
3,20100104,-1.8,-5.1,2.2,5.9,2020
4,20100105,-5.2,-8.7,-1.8,0.7,2060
...,...,...,...,...,...,...
2552,20161227,1.2,-3.2,4.5,2.1,4226
2553,20161228,-1.4,-6.5,4.8,0.0,4200
2554,20161229,-1.3,-4.5,2.1,0.2,4224
2555,20161230,-1.4,-7.0,4.6,0.0,4219


In [4]:
# Feature matrix: every column of the training frame except the last one,
# converted to a float32 tensor.
# NOTE(review): the first feature column holds dates written as yyyymmdd integers
# (e.g. 20100101). Those values are above 2**24, so float32 cannot represent each of
# them exactly, and their magnitude dwarfs the weather columns - consider re-encoding
# or scaling this column (the test features would need the same treatment).
feature_columns = train.iloc[:, :-1]
X_train = torch.FloatTensor(feature_columns.to_numpy())
X_train

tensor([[ 2.0100e+07, -4.9000e+00, -1.1000e+01,  9.0000e-01,  0.0000e+00],
        [ 2.0100e+07, -3.1000e+00, -5.5000e+00,  5.5000e+00,  8.0000e-01],
        [ 2.0100e+07, -2.9000e+00, -6.9000e+00,  1.4000e+00,  0.0000e+00],
        ...,
        [ 2.0161e+07, -1.3000e+00, -4.5000e+00,  2.1000e+00,  2.0000e-01],
        [ 2.0161e+07, -1.4000e+00, -7.0000e+00,  4.6000e+00,  0.0000e+00],
        [ 2.0161e+07,  1.7000e+00, -2.9000e+00,  7.1000e+00,  0.0000e+00]])

In [5]:
# Target vector: the last column of the training frame as a float32 tensor.
y_train = torch.FloatTensor(train.iloc[:, -1].to_numpy())
y_train

tensor([2123., 2123., 2123.,  ..., 4224., 4219., 4219.])

In [6]:
# Show the feature and target tensor shapes, one per line.
print(X_train.shape, y_train.shape, sep='\n')

torch.Size([2557, 5])
torch.Size([2557])


In [7]:
# Turn the targets into an (N, 1) column so they line up element-for-element with the
# (N, 1) output of X_train.matmul(W) + b instead of broadcasting against it.
# reshape(-1, 1) is used instead of unsqueeze(1) so that re-running this cell keeps the
# shape at (N, 1); unsqueeze would add one more axis on every execution.
y_train = y_train.reshape(-1, 1)
print(y_train.shape)

torch.Size([2557, 1])


## 모델 학습하기

In [8]:
# Model parameters: one weight per input feature and a single bias, both starting
# at zero and tracked by autograd.
W = torch.zeros([5, 1], requires_grad=True)
b = torch.zeros(1, requires_grad=True)

# Adam optimiser over the two parameter tensors.
# NOTE(review): in the recorded run the cost stays around 1343 and the weather weights
# end close to 1.0 after 100k steps. This looks like the learning rate being too small
# for unscaled inputs and targets in the thousands - TODO confirm, and consider scaling
# the features and/or raising lr.
optimizer = optim.Adam([W, b], lr=1e-5)

nb_epochs = 100000
for epoch in range(nb_epochs + 1):
    # Forward pass: H(x) = XW + b
    hypothesis = X_train @ W + b

    # Cost: root-mean-squared error between predictions and targets
    residual = hypothesis - y_train
    cost = residual.pow(2).mean().sqrt()

    # Backward pass, then one optimiser step to improve H(x)
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()

    # Only report every 10,000th epoch.
    if epoch % 10000 != 0:
        continue
    print('Epoch {:4d}/{} hypothesis: {} Cost:{:.6f}'.format(
        epoch, nb_epochs, hypothesis.squeeze().detach(), cost.item()
    ))

Epoch    0/100000 hypothesis: tensor([0., 0., 0.,  ..., 0., 0., 0.]) Cost:3466.026855
Epoch 10000/100000 hypothesis: tensor([3152.3665, 3153.5632, 3152.9966,  ..., 3163.0520, 3163.0132,
        3163.9302]) Cost:1345.670532
Epoch 20000/100000 hypothesis: tensor([3177.8691, 3180.3328, 3179.1584,  ..., 3189.7844, 3189.7158,
        3191.6008]) Cost:1344.849121
Epoch 30000/100000 hypothesis: tensor([3192.5723, 3196.3030, 3194.5200,  ..., 3205.6848, 3205.5854,
        3208.4385]) Cost:1344.558594
Epoch 40000/100000 hypothesis: tensor([3177.2129, 3182.2109, 3179.8196,  ..., 3191.4309, 3191.3018,
        3195.1230]) Cost:1344.009644
Epoch 50000/100000 hypothesis: tensor([3160.6101, 3166.8755, 3163.8755,  ..., 3175.9299, 3175.7710,
        3180.5603]) Cost:1343.536377
Epoch 60000/100000 hypothesis: tensor([3154.8225, 3162.3525, 3158.7451,  ..., 3171.2749, 3171.0854,
        3176.8413]) Cost:1343.115234
Epoch 70000/100000 hypothesis: tensor([3149.0396, 3157.8340, 3153.6191,  ..., 3166.6240, 316

In [9]:
# Show the learned weights followed by the learned bias.
print(W, b, sep='\n')

tensor([[1.5740e-04],
        [9.9126e-01],
        [9.9370e-01],
        [9.8874e-01],
        [9.8792e-01]], requires_grad=True)
tensor([0.0184], requires_grad=True)


## 모델 테스트하기

In [10]:
# Read the test CSV and convert all of its columns to a float32 tensor.
test_frame = pd.read_csv('../input/2022-ai-w3p1/test.csv')
X_test = torch.FloatTensor(test_frame.to_numpy())
X_test

tensor([[ 2.0170e+07,  4.0000e+00, -1.4000e+00,  9.5000e+00,  1.0000e-01],
        [ 2.0170e+07, -1.7000e+00, -5.4000e+00,  1.4000e+00,  3.3000e+00],
        [ 2.0170e+07,  1.4000e+00, -2.0000e+00,  5.0000e+00,  0.0000e+00],
        [ 2.0170e+07,  3.6000e+00, -2.5000e+00,  1.1100e+01,  1.0000e-01],
        [ 2.0170e+07,  5.0000e+00, -1.6000e+00,  1.1300e+01,  0.0000e+00],
        [ 2.0170e+07,  7.3000e+00,  7.0000e-01,  1.3600e+01,  7.0000e-01],
        [ 2.0170e+07,  1.2300e+01,  7.2000e+00,  1.7700e+01,  0.0000e+00],
        [ 2.0170e+07,  1.9800e+01,  1.1900e+01,  2.7700e+01,  0.0000e+00],
        [ 2.0171e+07,  1.7500e+01,  1.2300e+01,  2.4500e+01,  2.9000e+00],
        [ 2.0171e+07,  2.0600e+01,  1.1800e+01,  2.8300e+01,  0.0000e+00],
        [ 2.0171e+07,  1.8400e+01,  1.5900e+01,  2.1200e+01,  5.5000e+00],
        [ 2.0171e+07,  2.4700e+01,  1.8600e+01,  3.1500e+01,  0.0000e+00],
        [ 2.0171e+07,  2.6100e+01,  2.4100e+01,  2.8500e+01,  2.6700e+01],
        [ 2.0171e+07,  2.

In [11]:
# Apply the linear model to the test features. Gradients are not needed for
# inference, so the computation runs with autograd disabled.
with torch.no_grad():
    predictions = X_test @ W + b

# Drop the size-1 axis and wrap the predictions in a one-column frame named 'Expected'.
y_test = pd.DataFrame(predictions.numpy().squeeze(), columns=['Expected'])
y_test

Unnamed: 0,Expected
0,3186.884766
1,3172.414062
2,3179.181152
3,3186.996094
4,3189.389648
5,3196.923584
6,3211.7146
7,3233.709961
8,3231.54126
9,3235.011963


In [12]:
# Submission ids: consecutive integers starting at 0, one per prediction row.
# The count comes from y_test rather than a hard-coded 24, so the Id column always has
# exactly as many rows as there are predictions and the later column-wise concat
# cannot produce NaN-padded rows.
Id = pd.DataFrame(np.arange(len(y_test)), columns=['Id'])
Id

Unnamed: 0,Id
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [13]:
# Put the Id and Expected columns side by side to form the submission table.
result = pd.concat((Id, y_test), axis='columns')
result

Unnamed: 0,Id,Expected
0,0,3186.884766
1,1,3172.414062
2,2,3179.181152
3,3,3186.996094
4,4,3189.389648
5,5,3196.923584
6,6,3211.7146
7,7,3233.709961
8,8,3231.54126
9,9,3235.011963


In [14]:
# Save the submission table as CSV without the DataFrame index, then display it.
result.to_csv(path_or_buf='submit.csv', index=False)
result

Unnamed: 0,Id,Expected
0,0,3186.884766
1,1,3172.414062
2,2,3179.181152
3,3,3186.996094
4,4,3189.389648
5,5,3196.923584
6,6,3211.7146
7,7,3233.709961
8,8,3231.54126
9,9,3235.011963
