In [1]:
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from glob import glob


# Evaluated once when this cell runs; later cells consult this flag to decide
# whether to move models and batches onto CUDA.
use_gpu = torch.cuda.is_available()

In [2]:
class DSet(Dataset):
    """Fixed-length log-scale histogram features with a binary label.

    For every key in ``samples`` the matching list of numbers is read from the
    JSON file at ``span_path`` (presumably propagation time spans -- confirm
    against the data producer). Each value ``v`` is mapped to the bin
    ``int(log10(v) * step / max_log10)`` and the per-bin counts form the
    feature vector. The label is 1.0 when the substring ``'rumor'`` occurs in
    the sample key and 0.0 otherwise.

    Args:
        samples: sequence of keys present in the JSON mapping.
        step: number of histogram bins (length of each feature vector).
        span_path: path of the JSON file mapping sample key -> list of values.
        max_log10: log10 value that maps to the upper end of the histogram.

    Attributes:
        data: ``(len(samples), step)`` float64 array of bin counts.
        target: ``(len(samples),)`` float32 array of labels.
    """

    def __init__(self, samples, step=100, span_path='data/prop_span.json', max_log10=8.1):
        self.data = np.zeros((len(samples), step))
        self.target = np.zeros(len(samples), dtype=np.float32)
        # Context manager so the file handle is closed deterministically.
        with open(span_path) as fh:
            raw_data = json.load(fh)
        for i, sample in enumerate(samples):
            span = np.asarray(raw_data[sample], dtype=np.float64)
            # log10(0) is -inf; it is clamped into bin 0 below, so silence the
            # divide-by-zero warning it would otherwise emit.
            with np.errstate(divide='ignore'):
                scaled = np.log10(span) * step / max_log10
            # Clamp before truncating: a negative index would silently wrap
            # to the end of the row and an index >= step would raise.
            bins = np.clip(scaled, 0, step - 1).astype(np.intp)
            self.data[i] = np.bincount(bins, minlength=step)
            if 'rumor' in sample:
                self.target[i] = 1

    def __len__(self):
        """Return the number of samples."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)``: a float32 tensor of length ``step`` and a float32 scalar."""
        return torch.from_numpy(self.data[idx]).float(), self.target[idx]

In [3]:
class CNN(torch.nn.Module):
    """Two-layer 1-D convolutional binary classifier.

    Expects input of shape ``(batch, 1, input_size)``. Each convolution
    (kernel 3, padding 1) keeps the length and each max-pool halves it, so the
    flattened feature size is ``input_size // 4 * 16``. Returns a
    ``(batch, 1)`` tensor of sigmoid probabilities.

    Args:
        input_size: length of the 1-D input signal.
    """

    def __init__(self, input_size):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 8, 3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, 3, padding=1)
        # 16 channels times the length left after two pool-by-2 stages.
        self.fc_in = input_size // 4 * 16
        self.fc1 = nn.Linear(self.fc_in, 64)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        """Map ``(batch, 1, input_size)`` to ``(batch, 1)`` probabilities."""
        x = F.relu(F.max_pool1d(self.conv1(x), 2))
        x = F.relu(F.max_pool1d(self.conv2(x), 2))
        # Flatten per sample. The previous view(-1, fc_in) silently folded
        # extra features into the batch dimension when the input length did
        # not match input_size; now such a mismatch fails loudly in fc1.
        x = x.view(x.size(0), -1)
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc1(x))
        # torch.sigmoid is numerically identical to F.sigmoid and avoids the
        # deprecation warning some PyTorch releases emit for the latter.
        return torch.sigmoid(self.fc2(x))

In [23]:
class RNN(torch.nn.Module):
    """Single-layer GRU binary classifier.

    Expects input of shape ``(batch, seq_len, input_size)`` (the GRU is built
    with ``batch_first=True``) and returns a ``(batch, 1)`` tensor of sigmoid
    probabilities computed from the GRU output at the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: GRU hidden width per direction.
        bidirectional: whether the GRU runs in both directions.
    """

    def __init__(self, input_size, hidden_size=64, bidirectional=True):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_directions = 2 if bidirectional else 1
        # Honour the `bidirectional` argument: it used to be hard-coded to
        # True here, so bidirectional=False produced mismatched fc/h0 shapes.
        # The former dropout=0.5 is dropped: GRU dropout is applied only
        # between stacked layers, so with one layer it merely raised a warning.
        self.rnn = nn.GRU(input_size, hidden_size, batch_first=True,
                          bidirectional=bidirectional)
        self.fc = nn.Linear(hidden_size * self.n_directions, 1)

    def forward(self, x):
        """Map ``(batch, seq_len, input_size)`` to ``(batch, 1)`` probabilities."""
        h0 = self._init_hidden_state(x.size(0))
        x, hn = self.rnn(x, h0)
        # NOTE(review): for a bidirectional GRU the last time step holds the
        # backward direction's state after a single input; using `hn` may be
        # what was intended -- left unchanged to preserve model behaviour.
        x = self.fc(F.dropout(x[:, -1, :], training=self.training))
        return torch.sigmoid(x)

    def _init_hidden_state(self, batch_size):
        """Return a zero initial state of shape ``(n_directions, batch_size, hidden_size)``.

        The tensor is created with the dtype and device of the module's own
        parameters, so it follows the model wherever it has been moved.
        """
        reference = next(self.parameters())
        return reference.new_zeros(self.n_directions, batch_size, self.hidden_size)

In [19]:
def train(model, step_size, n_epoch=20, loaders=None):
    """Train ``model`` with BCE loss and RMSprop, printing metrics each epoch.

    After every training epoch the model is evaluated on the validation
    loader with autograd disabled. Nothing is returned; the per-epoch loss
    and accuracy are printed.

    Args:
        model: module returning ``(batch, 1)`` probabilities. Inputs are
            reshaped according to the model's class name ('RNN' or 'CNN');
            any other model receives the batch unchanged.
        step_size: features per time step, used when reshaping for an 'RNN'.
        n_epoch: number of epochs to run.
        loaders: optional ``(train_loader, val_loader)`` pair. When omitted,
            the notebook-level ``train_loader`` and ``test_loader`` are used.
    """
    if loaders is None:
        # Fall back to the loaders built in the data-setup cell.
        loaders = (train_loader, test_loader)
    tr_loader, val_loader = loaders

    if use_gpu:
        model.cuda()
    criterion = nn.BCELoss()
    optimizer = optim.RMSprop(model.parameters())

    for epoch in range(n_epoch):
        print('Epoch %03d:' % (epoch + 1))

        model.train()
        tr_loss, tr_acc = _run_epoch(model, tr_loader, criterion, step_size, optimizer)
        print(f'tr_loss {tr_loss:.6f} | tr_acc {tr_acc:.2f}%')

        model.eval()
        # torch.no_grad() replaces Variable(..., volatile=True), which no
        # longer disables graph construction.
        with torch.no_grad():
            val_loss, val_acc = _run_epoch(model, val_loader, criterion, step_size)
        print(f'val_loss {val_loss:.6f} | val_acc {val_acc:.2f}%')


def _run_epoch(model, loader, criterion, step_size, optimizer=None):
    """Run one pass over ``loader``; update weights only if ``optimizer`` is given.

    Returns ``(mean_loss, accuracy_percent)`` averaged over ``len(loader.dataset)``.
    """
    model_type = model.__class__.__name__
    total_loss, n_correct = 0.0, 0
    for data, target in loader:
        if model_type == 'RNN':
            data = data.view(data.size(0), -1, step_size)
        elif model_type == 'CNN':
            data = data.view(data.size(0), 1, -1)
        target = target.view(target.size(0), 1)
        if use_gpu:
            data, target = data.cuda(), target.cuda()

        if optimizer is not None:
            optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        if optimizer is not None:
            loss.backward()
            optimizer.step()

        # .item() yields Python numbers; indexing a 0-dim loss with [0]
        # raises on current PyTorch, and summing tensors into the accuracy
        # accumulator would turn it into a tensor.
        total_loss += loss.item() * data.size(0)
        # Strictly greater than 0.5, matching sign(p - 0.5).clamp(min=0).
        pred = (output.detach() > 0.5).to(target.dtype)
        n_correct += pred.eq(target).sum().item()

    n_samples = len(loader.dataset)
    return total_loss / n_samples, n_correct / n_samples * 100

In [8]:
# glob() returns paths in arbitrary, filesystem-dependent order, so sort them:
# otherwise random_state=42 does not give the same train/test split across
# machines or runs.
samples = sorted(glob('rumor/*.json')) + sorted(glob('truth/*.json'))
train_data, test_data = train_test_split(samples, test_size=0.2, random_state=42)
# Worker process and pinned host memory are only requested when CUDA is in use.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_gpu else {}
train_loader = DataLoader(DSet(train_data), batch_size=128, **kwargs)
test_loader = DataLoader(DSet(test_data), batch_size=128, **kwargs)

In [24]:
# RNN(10): GRU with 10 input features per time step. step_size=10 makes the
# training code view each sample as (batch, -1, 10). n_epoch=10.
train(RNN(10), 10, 10)

Epoch 001:
tr_loss 0.734407 | tr_acc 59.42%%
val_loss 0.588298 | val_acc 66.99%%
Epoch 002:
tr_loss 0.473139 | tr_acc 79.74%%
val_loss 0.505089 | val_acc 76.31%%
Epoch 003:
tr_loss 0.443367 | tr_acc 81.24%%
val_loss 0.507804 | val_acc 78.14%%
Epoch 004:
tr_loss 0.419608 | tr_acc 82.50%%
val_loss 0.494735 | val_acc 78.78%%
Epoch 005:
tr_loss 0.411384 | tr_acc 82.50%%
val_loss 0.453101 | val_acc 80.49%%
Epoch 006:
tr_loss 0.394824 | tr_acc 83.68%%
val_loss 0.464016 | val_acc 80.81%%
Epoch 007:
tr_loss 0.373539 | tr_acc 84.59%%
val_loss 0.473353 | val_acc 81.03%%
Epoch 008:
tr_loss 0.359102 | tr_acc 85.12%%
val_loss 0.479471 | val_acc 80.49%%
Epoch 009:
tr_loss 0.341588 | tr_acc 86.01%%
val_loss 0.487963 | val_acc 79.31%%
Epoch 010:
tr_loss 0.329585 | tr_acc 86.46%%
val_loss 0.489413 | val_acc 81.46%%


In [12]:
# Scratch instance (GRU input_size=10); it is not the model passed to train()
# in the visible cells.
rnn = RNN(10)

In [18]:
# Sanity check of the class-name string that the training code compares
# against 'RNN' / 'CNN' when choosing how to reshape a batch.
rnn.__class__.__name__

'RNN'