In [1]:
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from glob import glob


# Evaluated once when this cell runs; later cells read this flag to decide
# whether the model and batches are placed on the GPU.
use_gpu = torch.cuda.is_available()

In [2]:
class DSet(Dataset):
    """Histogram dataset built from per-sample lists of span values.

    Every key in ``samples`` is looked up in a JSON file that maps
    key -> list of numeric spans. The spans are log10-scaled and counted into
    ``step`` equal-width bins covering ``[0, log_max)``. The label is 1.0 when
    the key contains the substring ``'rumor'`` and 0.0 otherwise.

    Values whose bin would fall outside ``[0, step - 1]`` (including zero
    spans, whose log is -inf) are counted in the nearest edge bin instead of
    wrapping around to the other end of the histogram or raising.

    Args:
        samples: sequence of keys present in the JSON file.
        step: number of histogram bins per sample.
        path: location of the JSON file with the raw spans.
        log_max: log10 value that maps to the upper edge of the histogram.
    """

    def __init__(self, samples, step=100, path='data/prop_span.json', log_max=8.1):
        self.data = np.zeros((len(samples), step))
        self.target = np.zeros(len(samples), dtype=np.float32)
        # Context manager so the file handle is closed deterministically.
        with open(path) as fh:
            raw_data = json.load(fh)
        for i, sample in enumerate(samples):
            spans = np.asarray(raw_data[sample], dtype=np.float64)
            with np.errstate(divide='ignore'):  # log10(0) -> -inf, clipped below
                scaled = np.log10(spans) * step / log_max
            # astype truncates toward zero exactly like int() did for in-range values.
            bins = np.clip(scaled, 0, step - 1).astype(np.int64)
            # add.at accumulates repeated indices (plain fancy-index += would not).
            np.add.at(self.data[i], bins, 1)
            if 'rumor' in sample:
                self.target[i] = 1

    def __len__(self):
        """Number of samples."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(float32 histogram tensor of length step, float32 label)``."""
        return torch.from_numpy(self.data[idx]).float(), self.target[idx]

In [43]:
class CNN(torch.nn.Module):
    """Two-layer 1-D convolutional binary classifier.

    Pipeline: Conv1d(1->8, k=3, pad=1) -> max-pool(2) -> ReLU ->
    Conv1d(8->16, k=3, pad=1) -> max-pool(2) -> ReLU -> dropout ->
    Linear(->64) -> ReLU -> Linear(64->1) -> sigmoid.

    Args:
        input_size: number of features per sample. The two poolings shrink the
            length to ``input_size // 4``, which sizes the first linear layer.
    """

    def __init__(self, input_size):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 8, 3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, 3, padding=1)
        self.fc1 = nn.Linear(input_size // 4 * 16, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a batch of flat feature vectors to probabilities of shape (batch, 1)."""
        # Treat each sample as a single-channel sequence: (batch, 1, length).
        x = x.view(x.size(0), 1, -1)
        x = F.relu(F.max_pool1d(self.conv1(x), 2))
        x = F.relu(F.max_pool1d(self.conv2(x), 2))
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x.view(x.size(0), -1), training=self.training)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        # torch.sigmoid replaces the deprecated F.sigmoid.
        return torch.sigmoid(x)

In [44]:
class RNN(torch.nn.Module):
    """GRU-based binary classifier over a flat feature vector.

    The flat input is reshaped into a sequence of ``input_size``-wide steps,
    fed through a (by default bidirectional) GRU, and the features of the last
    time step are projected to one sigmoid probability.

    Args:
        input_size: width of each time step; the number of features per sample
            must be a multiple of it.
        hidden_size: GRU hidden width per direction.
        bidirectional: whether the GRU runs in both directions.
    """

    def __init__(self, input_size, hidden_size=64, bidirectional=True):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_directions = 2 if bidirectional else 1
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True, bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size * self.n_directions, 1)

    def forward(self, x):
        """Return probabilities of shape (batch, 1)."""
        x = x.view(x.size(0), -1, self.input_size)
        h0 = self._init_hidden_state(x.size(0))
        x, hn = self.rnn(x, h0)
        # NOTE(review): with bidirectional=True the backward half of x[:, -1, :]
        # has only processed the final step; consider using hn instead. Left
        # unchanged here so recorded results stay comparable.
        x = self.fc(x[:, -1, :])
        return torch.sigmoid(x)

    def _init_hidden_state(self, batch_size):
        """Zero initial hidden state of shape (n_directions, batch_size, hidden_size).

        Allocated with the dtype and device of the module's own parameters, so
        it always matches wherever the model currently lives (the previous
        version followed a global GPU flag and could mismatch a CPU model).
        """
        reference = next(self.parameters())
        return reference.new_zeros(self.n_directions, batch_size, self.hidden_size)

In [51]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``; update weights only when ``optimizer`` is given.

    Returns ``(mean loss per sample, accuracy in percent)`` as Python floats.
    """
    total_loss, n_correct = 0.0, 0
    for data, target in loader:
        data = data.to(device)
        # BCELoss needs targets shaped like the (batch, 1) model output.
        target = target.view(target.size(0), 1).to(device)
        output = model(data)
        loss = criterion(output, target)
        if optimizer is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # .item() yields plain numbers (loss.data[0] fails on 0-dim tensors).
        # The batch loss is a mean, so weight it by batch size before summing.
        total_loss += loss.item() * data.size(0)
        # Strictly greater than 0.5 counts as the positive class.
        pred = (output.detach() > 0.5).float()
        n_correct += pred.eq(target).sum().item()
    n_samples = len(loader.dataset)
    return total_loss / n_samples, n_correct / n_samples * 100


def train(model, n_epoch=20, *, loaders=None, device=None):
    """Train a sigmoid-output binary classifier and print per-epoch metrics.

    Uses ``nn.BCELoss`` and ``optim.RMSprop`` with default settings. After every
    training epoch the model is evaluated on the validation loader with
    autograd disabled. The model is left in eval mode on return.

    Args:
        model: module mapping a batch to probabilities of shape (batch, 1).
        n_epoch: number of epochs.
        loaders: optional ``(train_loader, validation_loader)`` pair. When
            omitted, the notebook globals ``train_loader`` and ``test_loader``
            are used.
        device: optional device (string or ``torch.device``). When omitted,
            CUDA is used if the notebook global ``use_gpu`` is true.

    Returns:
        None; progress is reported via ``print``.
    """
    if loaders is None:
        loaders = (train_loader, test_loader)
    tr_loader, val_loader = loaders
    if device is None:
        device = 'cuda' if use_gpu else 'cpu'
    device = torch.device(device)

    model.to(device)
    criterion = nn.BCELoss()
    optimizer = optim.RMSprop(model.parameters())

    for epoch in range(n_epoch):
        print(f'Epoch {(epoch + 1):02d}')

        model.train()
        tr_loss, tr_acc = _run_epoch(model, tr_loader, criterion, device, optimizer)
        print(f'tr_loss {tr_loss:.6f} | tr_acc {tr_acc:.2f}%')

        model.eval()
        # no_grad replaces the removed Variable(..., volatile=True) mechanism.
        with torch.no_grad():
            val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)
        print(f'val_loss {val_loss:.6f} | val_acc {val_acc:.2f}%')

In [8]:
# glob returns paths in arbitrary, filesystem-dependent order; sort them so the
# fixed random_state below really produces the same split on every machine.
samples = sorted(glob('rumor/*.json')) + sorted(glob('truth/*.json'))
train_data, test_data = train_test_split(samples, test_size=0.2, random_state=42)
# Worker process and pinned host memory are only requested when running on GPU.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_gpu else {}
# NOTE(review): neither loader shuffles, so training batches are identical in
# every epoch; consider shuffle=True for train_loader.
train_loader = DataLoader(DSet(train_data), batch_size=128, **kwargs)
test_loader = DataLoader(DSet(test_data), batch_size=128, **kwargs)

In [48]:
# train() accepts (model, n_epoch); the stale third positional argument raised
# TypeError on a fresh kernel. Ten epochs matches the log recorded below.
train(CNN(100), 10)

Epoch 01
tr_loss 1.126192 | tr_acc 58.91%
val_loss 0.534214 | val_acc 78.46%
Epoch 02
tr_loss 0.565489 | tr_acc 73.12%
val_loss 0.541827 | val_acc 72.78%
Epoch 03
tr_loss 0.518462 | tr_acc 75.90%
val_loss 0.539904 | val_acc 74.92%
Epoch 04
tr_loss 0.512484 | tr_acc 76.33%
val_loss 0.498259 | val_acc 77.06%
Epoch 05
tr_loss 0.495490 | tr_acc 78.10%
val_loss 0.483247 | val_acc 78.67%
Epoch 06
tr_loss 0.474987 | tr_acc 78.69%
val_loss 0.527342 | val_acc 79.21%
Epoch 07
tr_loss 0.484572 | tr_acc 78.05%
val_loss 0.464461 | val_acc 80.49%
Epoch 08
tr_loss 0.468512 | tr_acc 80.11%
val_loss 0.461572 | val_acc 81.78%
Epoch 09
tr_loss 0.459016 | tr_acc 79.98%
val_loss 0.464334 | val_acc 81.14%
Epoch 10
tr_loss 0.466292 | tr_acc 79.42%
val_loss 0.502500 | val_acc 78.56%


In [12]:
# Scratch instance (input_size=10, remaining arguments at their defaults),
# created only to be inspected in the next cell.
rnn = RNN(10)

In [18]:
# Scratch check: displays the class name of the instance created above.
rnn.__class__.__name__

'RNN'

In [58]:
class RCN(torch.nn.Module):
    """Binary classifier combining a bidirectional GRU branch with a 1-D CNN branch.

    The same flat input is viewed two ways: as a sequence of ``step_size``-wide
    steps for the GRU, and as one single-channel signal for two conv/pool
    stages. The GRU features of the last time step and the flattened CNN
    features are concatenated, passed through dropout, and projected to one
    sigmoid probability.

    Args:
        input_size: number of features per sample (sizes the CNN branch output,
            ``input_size // 4 * 16``).
        step_size: width of each GRU time step; ``input_size`` must be a
            multiple of it.
        hidden_size: GRU hidden width per direction.
    """

    def __init__(self, input_size, step_size, hidden_size=64):
        super(RCN, self).__init__()
        self.hidden_size = hidden_size
        self.step_size = step_size
        # The former dropout=0.5 argument was dropped: on a single-layer GRU it
        # has no effect and only triggers a UserWarning. Dropout is applied to
        # the concatenated features in forward().
        self.rnn = nn.GRU(step_size, hidden_size, batch_first=True, bidirectional=True)
        self.conv1 = nn.Conv1d(1, 8, 3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, 3, padding=1)
        self.fc_dim = input_size // 4 * 16 + hidden_size * 2
        self.fc = nn.Linear(self.fc_dim, 1)

    def forward(self, x):
        """Return probabilities of shape (batch, 1)."""
        batch = x.size(0)
        rx = x.view(batch, -1, self.step_size)  # (batch, steps, step_size)
        cx = x.view(batch, 1, -1)               # (batch, 1, input_size)
        h0 = self._init_hidden_state(batch)
        rx, hn = self.rnn(rx, h0)
        cx = F.relu(F.max_pool1d(self.conv1(cx), 2))
        cx = F.relu(F.max_pool1d(self.conv2(cx), 2))
        # NOTE(review): the backward half of rx[:, -1, :] has only processed the
        # final step; consider using hn instead. Left unchanged so recorded
        # results stay comparable.
        rcx = torch.cat((rx[:, -1, :].view(batch, -1), cx.view(batch, -1)), dim=1)
        out = self.fc(F.dropout(rcx, training=self.training))
        return torch.sigmoid(out)

    def _init_hidden_state(self, batch_size):
        """Zero initial hidden state of shape (2, batch_size, hidden_size).

        Allocated with the dtype and device of the module's own parameters, so
        it always matches wherever the model currently lives.
        """
        reference = next(self.parameters())
        return reference.new_zeros(2, batch_size, self.hidden_size)

In [59]:
# Train the combined GRU + CNN model with train()'s default epoch count.
train(RCN(input_size=100, step_size=10))

Epoch 01
tr_loss 1.565200 | tr_acc 66.87%
val_loss 0.732278 | val_acc 57.66%
Epoch 02
tr_loss 0.508685 | tr_acc 77.03%
val_loss 0.535610 | val_acc 74.06%
Epoch 03
tr_loss 0.455223 | tr_acc 79.84%
val_loss 0.530129 | val_acc 76.31%
Epoch 04
tr_loss 0.433536 | tr_acc 81.53%
val_loss 0.458880 | val_acc 81.46%
Epoch 05
tr_loss 0.418026 | tr_acc 83.14%
val_loss 0.451677 | val_acc 80.81%
Epoch 06
tr_loss 0.407422 | tr_acc 82.87%
val_loss 0.446012 | val_acc 82.10%
Epoch 07
tr_loss 0.386277 | tr_acc 83.60%
val_loss 0.461455 | val_acc 81.89%
Epoch 08
tr_loss 0.377711 | tr_acc 84.43%
val_loss 0.462122 | val_acc 82.21%
Epoch 09
tr_loss 0.352311 | tr_acc 85.31%
val_loss 0.461295 | val_acc 81.56%
Epoch 10
tr_loss 0.338566 | tr_acc 85.90%
val_loss 0.484886 | val_acc 82.10%
Epoch 11
tr_loss 0.320860 | tr_acc 86.60%
val_loss 0.484532 | val_acc 81.67%
Epoch 12
tr_loss 0.302745 | tr_acc 87.40%
val_loss 0.509570 | val_acc 80.28%
Epoch 13
tr_loss 0.295603 | tr_acc 87.72%
val_loss 0.529323 | val_acc 79.85%