In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer
import numpy as np
import pandas as pd

In [2]:
# Read news data from 'data.csv' file
data = pd.read_csv('data.csv')

In [4]:
# Create fake/real news classifier using PyTorch
class ANN(nn.Module):
    def __init__(self):
        super(ANN,self).__init__()

        self.linear1 = nn.Linear(512,2000)
        self.relu1 = nn.ReLU()

        self.linear2 = nn.Linear(2000,500)
        self.relu2 = nn.ReLU()

        self.linear3 = nn.Linear(500,100)
        self.relu3 = nn.ReLU()

        self.linear4 = nn.Linear(100,20)
        self.relu4 = nn.ReLU()

        self.linear5 = nn.Linear(20,2)


    def forward(self,x):
        out = self.linear1(x)
        out = self.relu1(out)

        out = self.linear2(out)
        out = self.relu2(out)

        out = self.linear3(out)
        out = self.relu3(out)

        out = self.linear4(out)
        out = self.relu4(out)

        out = self.linear5(out)

        return out

In [3]:
data.dropna(inplace=True)

In [7]:
# Vectorize the data
# vectorizer_headline = TfidfVectorizer(max_features=100, stop_words='english')
# vectorizer_headline.fit(data.Headline)
#
# vectorizer_body = TfidfVectorizer(max_features=1000, stop_words='english')
# vectorizer_body.fit(data.Body)

In [4]:
data_hdl_bd, y_train = data[['Headline', 'Body']].apply(lambda x: x[0] + '\n\n' + x[1], axis=1).values,\
                       data.Label.values


In [5]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

inputs = tokenizer(list(data_hdl_bd), return_tensors="pt", truncation=True, padding=True)

In [10]:
# old try with vectorizer
# data_cp = data.copy()
#
# data_cp['vector_Headline'] = vectorizer_headline.transform(data_cp.Headline).data
# data_cp['vector_Body'] = vectorizer_body.transform(data_cp.Body).data


In [11]:
data.columns

Index(['URLs', 'Headline', 'Body', 'Label'], dtype='object')

In [6]:
data['tokens'] = [torch.from_numpy(x) for x in inputs['input_ids'].numpy().astype(np.float32)]
training_data = data[['tokens', 'Label']]

In [7]:
# Split the data into training and test sets
train_data, test_data = training_data.iloc[:int(len(training_data) * 0.8)],\
                        training_data.iloc[int(len(training_data) * 0.8):]

# Create PyTorch datasets
train_dataset = torch.utils.data.TensorDataset(torch.stack(tuple(train_data.tokens.values)),
                                               torch.tensor(train_data.Label.values))
test_dataset = torch.utils.data.TensorDataset(torch.stack(tuple(test_data.tokens.values)),
                                              torch.tensor(test_data.Label.values))

In [15]:
# Create PyTorch data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)


In [16]:
# Create a PyTorch model
model = ANN()

# Optimizer
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

# Loss function
error = nn.CrossEntropyLoss()

In [30]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter()

In [18]:
# Train the model
epochs = 1
for epoch in range(epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # Forward pass
        outputs = model(inputs)
        loss = error(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            writer.add_scalar('Loss training:', loss.item())
            writer.add_graph(model, inputs)
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, epochs, i + 1, len(train_loader), loss.item()))

Epoch [1/1], Step [100/100], Loss: 0.6324


In [20]:
# Test the model
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the test dataset: {} %'.format(100 * correct / total))

Accuracy of the network on the test dataset: 61.152882205513784 %


In [96]:
# Create LSTM model
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM,self).__init__()

        self.lstm = nn.LSTM(input_size=512,hidden_size=512,num_layers=2,batch_first=True)
        self.linear1 = nn.Linear(512,2000)
        self.relu1 = nn.ReLU()

        self.linear2 = nn.Linear(2000,500)
        self.relu2 = nn.ReLU()

        self.linear3 = nn.Linear(500,100)
        self.relu3 = nn.ReLU()

        self.linear4 = nn.Linear(100,20)
        self.relu4 = nn.ReLU()

        self.linear5 = nn.Linear(20,1)

    def forward(self,x):
        out,_ = self.lstm(x)
        out = self.linear1(out)
        out = self.relu1(out)
        # flatten the output
        # out = out.view(-1,512)

        out = self.linear2(out)
        out = self.relu2(out)

        out = self.linear3(out)
        out = self.relu3(out)

        out = self.linear4(out)
        out = self.relu4(out)

        out = self.linear5(out)

        out = torch.sigmoid(out)

        return out

In [97]:
# Create a PyTorch model
model = LSTM()

# Optimizer
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)

# Loss function
error = nn.BCELoss()

In [51]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler(feature_range=(0,1))

# Create PyTorch datasets
# train_dataset_lstm = torch.utils.data.TensorDataset(
#     torch.from_numpy(
#         mms.fit_transform(torch.stack(tuple(train_data.tokens.values)).numpy()))
#         .resize(train_data.tokens.values.shape[0], 1, 512),
#                     torch.tensor([[float(0 == x), float(1 == x)] for x in train_data.Label.values]))
#
# test_dataset_lstm = torch.utils.data.TensorDataset(
#                 torch.from_numpy(
#                     mms.fit_transform(torch.stack(tuple(test_data.tokens.values)).numpy()))
#                     .resize(test_data.tokens.values.shape[0], 1, 512),
#                     torch.tensor([[float(0 == x), float(1 == x)] for x in test_data.Label.values]))

train_dataset_lstm = torch.utils.data.TensorDataset(
    torch.from_numpy(
        mms.fit_transform(torch.stack(tuple(train_data.tokens.values)).numpy()))
        .resize(train_data.tokens.values.shape[0], 1, 512),
                    torch.tensor(train_data.Label.values))

test_dataset_lstm = torch.utils.data.TensorDataset(
                torch.from_numpy(
                    mms.fit_transform(torch.stack(tuple(test_data.tokens.values)).numpy()))
                    .resize(test_data.tokens.values.shape[0], 1, 512),
                    torch.tensor(test_data.Label.values))

# Create PyTorch data loaders
train_loader_lstm = torch.utils.data.DataLoader(train_dataset_lstm, batch_size=32, shuffle=True)
test_loader_lstm = torch.utils.data.DataLoader(test_dataset_lstm, batch_size=32, shuffle=True)



In [107]:
# Train the model
epochs = 10
for epoch in range(epochs):
    for i, (inputs, labels) in enumerate(train_loader_lstm):
        # Forward pass
        outputs = model(inputs)
        # outputs = torch.argmax(outputs, dim=-1)

        # Flatten the outputs
        outputs = outputs.view(-1)
        # outputs = outputs.float().clone().detach().requires_grad_(True)
        labels = labels.float()

        loss = error(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            writer.add_scalar('Loss training:', loss.item())
            writer.add_graph(model, inputs)
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch + 1, epochs, i + 1, len(train_loader_lstm), loss.item()))

Epoch [1/10], Step [100/100], Loss: 0.0136
Epoch [2/10], Step [100/100], Loss: 0.0008
Epoch [3/10], Step [100/100], Loss: 0.1430
Epoch [4/10], Step [100/100], Loss: 0.0003
Epoch [5/10], Step [100/100], Loss: 0.0564
Epoch [6/10], Step [100/100], Loss: 0.0071
Epoch [7/10], Step [100/100], Loss: 0.0126
Epoch [8/10], Step [100/100], Loss: 0.0094
Epoch [9/10], Step [100/100], Loss: 0.0012
Epoch [10/10], Step [100/100], Loss: 0.0001


In [95]:
# torch.argmax(outputs, dim=-1)
outputs.view(-1).shape

torch.Size([64])

In [109]:
# Test the model
with torch.no_grad():
    correct = 0
    total = 0
    for inputs, labels in test_loader_lstm:
        outputs = model(inputs)
        outputs = outputs.view(-1)
        labels = labels.float()

        _, predicted = torch.max(outputs.data.reshape(-1,1), 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    print('Accuracy of the model on the test images: {} %'.format(100 * correct / total))



Accuracy of the model on the test images: 53.258145363408524 %


In [None]:
!tensorboard --logdir=runs