In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torchtext.data import Field, BucketIterator, TabularDataset
from torchsummary import summary

from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef, classification_report, confusion_matrix

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
df = pd.read_csv('data_text/news.csv')
df.drop(df[df.text.str.len() < 5].index, inplace=True)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [5]:
def trim_string(x):
    x = x.split(maxsplit=200)
    x = ' '.join(x[:200])
    return x

In [6]:
label = {
    'REAL': 1,
    'FAKE': 0
}

In [7]:
df['text'] = df['text'].apply(lambda x: trim_string(x))
df['label'] = df['label'].apply(lambda x: label[x])

In [8]:
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=2020)
train_df.to_csv('data_text/train.csv', index=False)
valid_df.to_csv('data_text/valid.csv', index=False)

# Load Dataset

In [9]:
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)



In [10]:
fields = [('label', label_field), ('title', text_field), ('text', text_field)]

In [11]:
train, valid = TabularDataset.splits(path='data_text/', train='train.csv', validation='valid.csv', format='CSV', fields=fields, skip_header=True)



In [12]:
train_iter = BucketIterator(
    train, batch_size=32, 
    sort_key=lambda x: len(x.text), device=device,
    sort=True, sort_within_batch=True
)

valid_iter = BucketIterator(
    valid, batch_size=32, 
    sort_key=lambda x: len(x.text), device=device,
    sort=True, sort_within_batch=True
)



In [13]:
text_field.build_vocab(train, min_freq=3)

# Build Model

In [14]:
class TextClassifier(nn.Module):
    def __init__(self, dimension=128):
        super(TextClassifier, self).__init__()
        
        self.dimension = dimension
        self.embedding = nn.Embedding(len(text_field.vocab), 300)
        self.lstm = nn.LSTM(
            input_size=300,
            hidden_size=dimension,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.drop = nn.Dropout(0.5)
        self.fc = nn.Linear(2 * dimension, 1)
        
    def forward(self, text, text_len):
        text_emb = self.embedding(text)
        
        packed_input = pack_padded_sequence(text_emb, text_len.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        
        out_forward = output[range(len(output)), text_len - 1, : self.dimension]
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_fea = self.drop(out_reduced)
        
        text_fea = self.fc(text_fea)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.sigmoid(text_fea)
        
        return text_out

In [15]:
model = TextClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.BCELoss()

In [16]:
for i in tqdm(range(5)):
    running_loss = 0.0
    model.train()
    
    for (labels, (title, title_len), (text, text_len)), _ in train_iter: 
        optimizer.zero_grad()
        
        logits = model(text, text_len)
        loss = criterion(logits, labels)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    print('Epoch: {}, Loss: {}'.format(i, running_loss))

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))



Epoch: 0, Loss: -895897.0891952515
Epoch: 1, Loss: -16721903.965820312
Epoch: 2, Loss: -69785610.953125
Epoch: 3, Loss: -79648320.875
Epoch: 4, Loss: -81232396.6875

