In [1]:
import numpy as np
import pandas as pd

In [2]:
import torch
import sklearn
import random

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [4]:
torch.manual_seed(1)
random.seed(1)
if device == "cuda":
    torch.cuda.manual_seed(1)

In [5]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self,x,y):
        self.x = torch.Tensor(x).to(device)
        self.y = torch.Tensor(y).to(device)
    def __getitem__(self,index):
        return self.x[index],self.y[index]
    def __len__(self):
        return len(self.x)

In [6]:
x_train = pd.read_csv("./data2/x_train.csv",engine='python',encoding='CP949')
y_train = pd.read_csv("./data2/y_train.csv",engine='python',encoding='CP949')
x_test = pd.read_csv("./data2/x_test.csv",engine='python',encoding='CP949')
submission = pd.read_csv("./data2/sample_submission.csv")

In [7]:
print(x_train)
print(y_train)
print(x_test)
print(submission)

       시도명  읍면동명         거주인구        근무인구        방문인구       총 유동인구   평균 속도  \
0        0    26    76018.965    5009.811   36887.341   117916.117  40.467   
1        1    24  1232416.968   76043.955  451558.268  1760019.191  38.126   
2        1    23   134260.946    8412.761   78177.981   220851.688  51.107   
3        1    25   289327.429   39102.424  287029.303   615459.156  33.067   
4        1     7  1116617.660   96560.651  524905.301  1738083.612  40.392   
...    ...   ...          ...         ...         ...          ...     ...   
21481    0     6    91977.598   14802.819   79885.446   186665.863  41.676   
21482    0     9    63673.610    3725.663   65813.385   133212.658  48.737   
21483    1     3   406035.642   46335.550  258079.137   710450.329  49.314   
21484    1    25   315737.894   25379.224  283079.548   624196.666  35.312   
21485    1    27  1092141.759  185878.376  518636.283  1796656.419  33.438   

       평균 소요 시간   평균 기온    일강수량  평균 풍속   월   일     년  
0       

In [8]:
x_train = np.array(x_train.drop(['월','일','년'],axis=1))
# y_train = np.array(y_train) # shape (21486,1)
y_train = np.array(y_train['교통량']) # shape (21486)
x_test = np.array(x_test.drop(['월','일','년'],axis=1))

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [9]:
# x_train = torch.Tensor(x_train).to(device)
x_test = torch.Tensor(x_test).to(device)
# y_train = torch.Tensor(y_train).to(device)
train_dataset = MyDataset(x_train,y_train)
print(x_train.shape,y_train.shape)

(21486, 11) (21486,)


In [10]:
layer1 = torch.nn.Linear(11,11,bias=True).to(device)
layer2 = torch.nn.Linear(11,8,bias=True).to(device)
layer3 = torch.nn.Linear(8,8,bias=True).to(device)
layer4 = torch.nn.Linear(8,11,bias=True).to(device)
layer5 = torch.nn.Linear(11,1,bias=True).to(device)
relu = torch.nn.ReLU().to(device)
dropout = torch.nn.Dropout(0.5).to(device)

In [11]:
torch.nn.init.xavier_normal_(layer1.weight)
torch.nn.init.xavier_normal_(layer2.weight)
torch.nn.init.xavier_normal_(layer3.weight)
torch.nn.init.xavier_normal_(layer4.weight)
torch.nn.init.xavier_normal_(layer5.weight)


Parameter containing:
tensor([[-0.5832, -0.4439,  0.2270,  0.4153, -0.1420, -0.5565, -0.5376,  0.3844,
          0.2358, -0.1633, -0.1858]], device='cuda:0', requires_grad=True)

In [12]:
model = torch.nn.Sequential(layer1,relu,dropout,
                            layer2,relu,dropout,
                            layer3,relu,dropout,
                            layer4,relu,dropout,
                            layer5).to(device)

In [13]:
def r2_loss(output, target):
    target_mean = torch.mean(target)
    ss_tot = torch.sum((target - target_mean) ** 2)
    ss_res = torch.sum((target - output) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2

In [14]:
# layer 5, epochs 2000, lr 1e-2 -> score 0.474...
# layer 5, epochs 2000, lr 1e-1 -> score 0.385... overfitting
# layer 7, epochs 2000, lr 1e-2 -> cost 105335.1015625 score 0.11277 
# batch size = 100 layer 7, epochs 1000, lr 1e-2 -> 하나의 예측값
# batch size = 100 layer 7, epochs 1000, lr 1e-2 -> 하나의 예측값 
epochs= 100
lr = 1e-2
loss = torch.nn.MSELoss().to(device)

optim = torch.optim.Adam(model.parameters(),lr=lr)
batch_size = 100
data_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                         batch_size = batch_size,
                                         shuffle=True,
                                         drop_last =True)

In [15]:
model.train()
for epoch in range(epochs+1):
    avg_cost = 0
    for x_train,y_train in data_loader:
        optim.zero_grad()
    
        h = model(x_train) 
        cost = loss(h,y_train.unsqueeze(1))
        cost.backward()
        optim.step()
        avg_cost += cost.item()
    if epoch % 10 == 0:
        print(epoch, avg_cost/len(data_loader))

0 318297.4786799065
10 229692.11320823597
20 226315.3324693341
30 226722.68136682242
40 225148.92253212616
50 219045.04326080607
60 220794.96579293226
70 216510.9640771028
80 218048.35404497664
90 225301.32049503503
100 218963.37317464955


In [16]:
with torch.no_grad():
    model.eval()
    predict = model(x_train)
    
print("[R2_Loss] =",r2_loss(predict,y_train.unsqueeze(1)))
l1Loss = torch.nn.L1Loss()
print("[L1_Loss] =",l1Loss(predict,y_train.unsqueeze(1)))

[R2_Loss] = tensor(0.3869, device='cuda:0')
[L1_Loss] = tensor(347.2535, device='cuda:0')


In [17]:
with torch.no_grad():
    model.eval()
    predict = model(x_test)
submission['predict'] = predict.cpu().detach().numpy()
print(submission)

          id     predict
0          0  202.361298
1          1  301.128357
2          2  262.122986
3          3  269.576385
4          4  554.105835
...      ...         ...
10578  10578  429.194885
10579  10579  202.361298
10580  10580  400.750397
10581  10581  202.870361
10582  10582  418.180725

[10583 rows x 2 columns]


In [18]:
submission.to_csv("submission.csv",index=False)

In [19]:
# 저번 과제와 같이 한가지 예측값으로 통일되는 경향이 생김 -> batch size?
# batch size를 적용해도 계속 같은 문제 -> lr 조정
# lr를 적용해도 계속 같은 문제 -> drop late 조정
# drop column (연,월,일)