In [4]:
import pandas as pd

In [23]:
data = pd.read_csv('../data/input_table/pre_dataset.csv')
data = data.sample(frac=1, random_state=42, ignore_index=True)

In [24]:
data.station.nunique()

100

In [26]:
data['t+6'].mean()

0.18105744520030234

In [7]:

data.head()

Unnamed: 0,t-11,t-10,t-9,t-8,t-7,t-6,t-5,t-4,t-3,t-2,t-1,t0,station,sin_time,cos_time,dow,weekend,t+6
0,0,0,0,0,0,1,1,1,1,1,1,0,16,-0.173648,-0.984808,4,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,28,-0.258819,0.965926,2,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,86,0.991445,0.130526,5,1,0
3,1,1,1,1,0,0,0,0,0,0,0,0,72,-0.953717,-0.300706,2,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,9,0.300706,0.953717,6,1,0


In [8]:
data.shape

(1323000, 18)

In [9]:
TRAIN_NUM = 1_000_000
train_data = data.iloc[:TRAIN_NUM]
valid_data = data.iloc[TRAIN_NUM:]
len(train_data), len(valid_data)

(1000000, 323000)

In [10]:
# feature definition
general_features = [col for col in data.columns if col not in ['station', 't+6']]
embedding_features = ['station']
target_features = ['t+6']

In [11]:
train_data[general_features].values.shape

(1000000, 16)

## Dataset 정의

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class EvcBaseDataset(Dataset):
    def __init__(self, xs, ys):
        assert len(xs) == len(ys)

        self.xs = torch.tensor(xs).float()
        self.ys = torch.tensor(ys).float()

    def __len__(self):
        return len(self.xs)

    def __getitem__(self, i):
        x, y = self.xs[i], self.ys[i]
        return x, y


class EvcEmbeddingDataset(EvcBaseDataset):
    def __init__(self, xs, es, ys):
        assert len(xs) == len(ys)

        self.xs = torch.tensor(xs).float()
        self.es = torch.tensor(es)
        self.ys = torch.tensor(ys).float()
        
    def __len__(self):
        return len(self.xs)

    def __getitem__(self, i):
        x, e, y = self.xs[i], self.es[i], self.ys[i]
        return x, e, y

In [13]:
trainset_base = EvcBaseDataset(xs=train_data[general_features].values,
                               ys=train_data[target_features].values)
validset_base = EvcBaseDataset(xs=valid_data[general_features].values,
                               ys=valid_data[target_features].values)               


trainset_emb = EvcEmbeddingDataset(xs=train_data[general_features].values,
                                   es=train_data[embedding_features].values.flatten(),
                                   ys=train_data[target_features].values)
validset_emb = EvcEmbeddingDataset(xs=valid_data[general_features].values,
                                   es=valid_data[embedding_features].values.flatten(),
                                   ys=valid_data[target_features].values)           

In [14]:
train_loader_base = DataLoader(trainset_base, batch_size=256)
valid_loader_base = DataLoader(validset_base, batch_size=1024)

train_loader_emb = DataLoader(trainset_emb, batch_size=256)
valid_loader_emb = DataLoader(validset_emb, batch_size=1024)

## Model 정의

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BaseMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16, 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return torch.sigmoid(x)


class BaseEmbeddingMLP(nn.Module):
    def __init__(self, station_size, n_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=station_size, embedding_dim=n_dim)
        self.fc1 = nn.Linear(16+n_dim, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x, e):
        e = self.embedding(e)
        x = torch.cat((x, e), dim=1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        return torch.sigmoid(x)

## train

In [18]:
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

def train(model, train_dataloader, optim, epoch):
    model.train()
    criterion = nn.BCELoss()
    for b_i, (X, y) in enumerate(train_dataloader):
        optim.zero_grad()
        pred_prob = model(X)
        loss = criterion(pred_prob, y)
        loss.backward()
        optim.step()
        if b_i % 1000 == 0:
            print('epoch: {} [{}/{} ({:.0f}%)]\t training loss: {:.6f}'.format(
                epoch, b_i * len(X), len(train_dataloader.dataset),
                100 * b_i / len(train_dataloader), loss.item()
            ))

def test(model, test_dataloader):
    model.eval()
    criterion = nn.BCELoss(reduction='sum')
    loss = 0
    # success = 0
    with torch.no_grad():
        pred_prob_total = torch.Tensor()
        y_total = torch.Tensor()

        for X, y in test_dataloader:
            prob_pred = model(X)
            pred_prob_total = torch.cat([pred_prob_total, prob_pred.flatten()], dim=0)
            y_total = torch.cat([y_total, y.flatten()], dim=0)

            loss += criterion(prob_pred, y).item()
    loss /= len(test_dataloader.dataset)

    y_pred = torch.round(pred_prob_total)
    accuracy = accuracy_score(y_total, y_pred)
    precision = precision_score(y_total, y_pred)
    recall = recall_score(y_total, y_pred)
    f1 = f1_score(y_total, y_pred)

    auc_score = roc_auc_score(y_total, pred_prob_total)
    print('\nTest dataset:  Loss: {:.4f}, Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1: {:.2f}, AUC: {:.2f}'.format(
        loss, accuracy, precision, recall, f1, auc_score))


def train_embnet(model, train_dataloader, optim, epoch):
    model.train()
    criterion = nn.BCELoss()
    for b_i, (X, E, y) in enumerate(train_dataloader):
        optim.zero_grad()
        pred_prob = model(X,E)
        loss = criterion(pred_prob, y)
        loss.backward()
        optim.step()
        if b_i % 1000 == 0:
            print('epoch: {} [{}/{} ({:.0f}%)]\t training loss: {:.6f}'.format(
                epoch, b_i * len(X), len(train_dataloader.dataset),
                100 * b_i / len(train_dataloader), loss.item()
            ))

def test_embnet(model, test_dataloader):
    model.eval()
    criterion = nn.BCELoss(reduction='sum')
    loss = 0
    # success = 0
    with torch.no_grad():
        pred_prob_total = torch.Tensor()
        y_total = torch.Tensor()

        for X, E, y in test_dataloader:
            prob_pred = model(X, E)
            pred_prob_total = torch.cat([pred_prob_total, prob_pred.flatten()], dim=0)
            y_total = torch.cat([y_total, y.flatten()], dim=0)

            loss += criterion(prob_pred, y).item()
    loss /= len(test_dataloader.dataset)

    y_pred = torch.round(pred_prob_total)
    accuracy = accuracy_score(y_total, y_pred)
    precision = precision_score(y_total, y_pred)
    recall = recall_score(y_total, y_pred)
    f1 = f1_score(y_total, y_pred)

    auc_score = roc_auc_score(y_total, pred_prob_total)
    print('\nTest dataset:  Loss: {:.4f}, Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1: {:.2f}, AUC: {:.2f}'.format(
        loss, accuracy, precision, recall, f1, auc_score))

In [21]:
model = BaseMLP()
optim = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(1,21):
    train(model, train_loader_base, optim, epoch)
    test(model, valid_loader_base)
    print()



  _warn_prf(average, modifier, msg_start, len(result))



Test dataset:  Loss: 0.4343, Accuracy: 0.82, Precision: 0.00, Recall: 0.00, F1: 0.00, AUC: 0.70



  _warn_prf(average, modifier, msg_start, len(result))



Test dataset:  Loss: 0.4341, Accuracy: 0.82, Precision: 0.00, Recall: 0.00, F1: 0.00, AUC: 0.70



  _warn_prf(average, modifier, msg_start, len(result))



Test dataset:  Loss: 0.4340, Accuracy: 0.82, Precision: 0.00, Recall: 0.00, F1: 0.00, AUC: 0.70



  _warn_prf(average, modifier, msg_start, len(result))



Test dataset:  Loss: 0.4339, Accuracy: 0.82, Precision: 0.00, Recall: 0.00, F1: 0.00, AUC: 0.70



  _warn_prf(average, modifier, msg_start, len(result))



Test dataset:  Loss: 0.4339, Accuracy: 0.82, Precision: 0.00, Recall: 0.00, F1: 0.00, AUC: 0.70



  _warn_prf(average, modifier, msg_start, len(result))



Test dataset:  Loss: 0.4338, Accuracy: 0.82, Precision: 0.00, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss: 0.4337, Accuracy: 0.82, Precision: 0.25, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss: 0.4337, Accuracy: 0.82, Precision: 0.44, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss: 0.4337, Accuracy: 0.82, Precision: 0.45, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss: 0.4337, Accuracy: 0.82, Precision: 0.45, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss: 0.4337, Accuracy: 0.82, Precision: 0.52, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss: 0.4337, Accuracy: 0.82, Precision: 0.41, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss: 0.4337, Accuracy: 0.82, Precision: 0.41, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss: 0.4336, Accuracy: 0.82, Precision: 0.47, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss: 0.4336, Accuracy: 0.82, Precision: 0.40, Recall: 0.00, F1: 0.00, AUC: 0.70


Test dataset:  Loss

In [22]:
model_emb = BaseEmbeddingMLP(100, 16)
optim = torch.optim.Adam(model_emb.parameters(), lr=1e-3)

for epoch in range(1, 21):
    train_embnet(model_emb, train_loader_emb, optim, epoch)
    test_embnet(model_emb, valid_loader_emb)
    print()


Test dataset:  Loss: 0.4180, Accuracy: 0.82, Precision: 0.58, Recall: 0.03, F1: 0.06, AUC: 0.73


Test dataset:  Loss: 0.4146, Accuracy: 0.82, Precision: 0.57, Recall: 0.04, F1: 0.07, AUC: 0.74


Test dataset:  Loss: 0.4130, Accuracy: 0.82, Precision: 0.58, Recall: 0.04, F1: 0.08, AUC: 0.74


Test dataset:  Loss: 0.4119, Accuracy: 0.82, Precision: 0.58, Recall: 0.05, F1: 0.09, AUC: 0.74


Test dataset:  Loss: 0.4108, Accuracy: 0.82, Precision: 0.58, Recall: 0.05, F1: 0.09, AUC: 0.75


Test dataset:  Loss: 0.4098, Accuracy: 0.82, Precision: 0.59, Recall: 0.05, F1: 0.09, AUC: 0.75


Test dataset:  Loss: 0.4090, Accuracy: 0.82, Precision: 0.60, Recall: 0.05, F1: 0.09, AUC: 0.75


Test dataset:  Loss: 0.4085, Accuracy: 0.82, Precision: 0.60, Recall: 0.05, F1: 0.10, AUC: 0.75


Test dataset:  Loss: 0.4081, Accuracy: 0.82, Precision: 0.61, Recall: 0.05, F1: 0.10, AUC: 0.75


Test dataset:  Loss: 0.4076, Accuracy: 0.82, Precision: 0.60, Recall: 0.06, F1: 0.10, AUC: 0.75


Test dataset:  Loss