# Fake news classification (RU)


In [1]:
import pandas as pd
import numpy as np

## Data preprocessing


In [2]:
train_data = pd.read_csv('dataset/train.tsv', sep='\t')
test_data = pd.read_csv('dataset/test.tsv', sep='\t')

train_data.head()

Unnamed: 0,title,is_fake
0,Москвичу Владимиру Клутину пришёл счёт за вмеш...,1
1,Агент Кокорина назвал езду по встречке житейск...,0
2,Госдума рассмотрит возможность введения секрет...,1
3,ФАС заблокировала поставку скоростных трамваев...,0
4,Против Навального завели дело о недоносительст...,1


In [3]:
from multiprocessing import Pool
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import re
import nltk
from string import punctuation
from pymystem3 import Mystem

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/danya-
[nltk_data]     sakharov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords

mystem_analyzer = Mystem()

def my_preproc(text):
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(f'|'.join(["»", "«", "—"]),'', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('[{}]'.format(punctuation), '', text)
    text = mystem_analyzer.lemmatize(text)

    return ' '.join([word for word in text if word not in stopwords.words('russian')+[' ', '\n', " "]])

In [5]:
def get_lemmas_from_text(text_series):
    with Pool(8) as pool:
        lemmas = list(tqdm(pool.imap(my_preproc, text_series), total=len(train_data)))
    return lemmas

In [6]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [7]:
train_data['lemmas'] = get_lemmas_from_text(train_data['title'])
test_data['lemmas'] = get_lemmas_from_text(test_data['title'])
train_data.head()

  0%|          | 0/5758 [00:00<?, ?it/s]

  0%|          | 0/5758 [00:00<?, ?it/s]

Unnamed: 0,title,is_fake,lemmas
0,Москвичу Владимиру Клутину пришёл счёт за вмеш...,1,москвич владимир клутин приходить счет вмешате...
1,Агент Кокорина назвал езду по встречке житейск...,0,агент кокорин называть езда встречок житейский...
2,Госдума рассмотрит возможность введения секрет...,1,госдума рассматривать возможность введение сек...
3,ФАС заблокировала поставку скоростных трамваев...,0,фас заблокировать поставка скоростной трамвай ...
4,Против Навального завели дело о недоносительст...,1,против навальный заводить дело недоносительств...


In [8]:
x_train, x_val, y_train , y_val = train_test_split(train_data['lemmas'], train_data['is_fake'], test_size=0.35)

print ("total train examples %s" % len(y_train))
print ("total test examples %s" % len(y_val))

total train examples 3742
total test examples 2016


## Fasttext baseline

In [9]:
import fasttext

In [10]:
with open('fasttext/data.train.txt', 'w+') as outfile:
    for x,y in zip(x_train, y_train):
        outfile.write('__label__' + str(y) + ' ' + x + '\n')


with open('fasttext/test.txt', 'w+') as outfile:
    for x,y in zip(x_val, y_val):
        outfile.write('__label__' + str(y) + ' ' + x + '\n')

In [11]:
train_data.is_fake.value_counts()

1    2879
0    2879
Name: is_fake, dtype: int64

In [12]:
classifier = fasttext.train_supervised('fasttext/data.train.txt', lr=1.0, epoch=25, wordNgrams=3)
num, presicion, recall = classifier.test('fasttext/test.txt')

print(f'Precision: {presicion}')
print(f'Recall: {recall}')
print('Number of examples:', num)

Read 0M words
Number of words:  8387
Number of labels: 2


Precision: 0.8258928571428571
Recall: 0.8258928571428571
Number of examples: 2016


Progress: 100.0% words/sec/thread:  443149 lr:  0.000000 avg.loss:  0.031069 ETA:   0h 0m 0s


## BERT

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

In [14]:
import logging
import numpy as np

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

MAX_SEQ_LENGTH=100

class BertInputItem(object):
    """An item with all the necessary attributes for finetuning BERT."""

    def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
        self.text = text
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        
def convert_examples_to_inputs(example_texts, example_labels, max_seq_length, tokenizer, verbose=0):
    """Loads a data file into a list of `InputBatch`s."""
    
    input_items = []
    examples = zip(example_texts, example_labels)
    for (ex_index, (text, label)) in enumerate(examples):

        # Create a list of token ids
        input_ids = tokenizer.encode(f"[CLS] {text} [SEP]")
        if len(input_ids) > max_seq_length:
            input_ids = input_ids[:max_seq_length]

        # All our tokens are in the first input segment (id 0).
        segment_ids = [0] * len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        input_items.append(
            BertInputItem(text=text,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label))

        
    return input_items

train_features = convert_examples_to_inputs(x_train, y_train, MAX_SEQ_LENGTH, tokenizer, verbose=0)
val_features = convert_examples_to_inputs(x_val, y_val, MAX_SEQ_LENGTH, tokenizer)


In [15]:
import torch
from torch.utils.data import TensorDataset, DataLoader

def get_data_loader(features, batch_size, shuffle=True): 

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    dataloader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
    return dataloader

BATCH_SIZE = 16

train_dataloader = get_data_loader(train_features, BATCH_SIZE, shuffle=True)
val_dataloader = get_data_loader(val_features, BATCH_SIZE, shuffle=False)
#test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [17]:
from transformers.models.bert.modeling_bert import BertForSequenceClassification
from transformers import BertConfig

config = BertConfig.from_pretrained("DeepPavlov/rubert-base-cased", num_labels=1)
model = BertForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased", problem_type = "single_label_classification")
model.to(device)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [18]:
from transformers.optimization import get_linear_schedule_with_warmup
from torch.optim import AdamW
GRADIENT_ACCUMULATION_STEPS = 1
NUM_TRAIN_EPOCHS = 25
LEARNING_RATE = 5e-7
WARMUP_PROPORTION = 0.1
MAX_GRAD_NORM = 5

num_train_steps = int(len(train_dataloader.dataset) / BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)

In [19]:
def evaluate(model, dataloader):
    model.eval()
    
    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for _, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
        loss = outputs[0]
        logits = outputs[1]
        outputs = np.argmax(logits.to('cpu'), axis=1)
        label_ids = label_ids.to('cpu').numpy()
        
        predicted_labels += list(outputs)
        correct_labels += list(label_ids)
        
        eval_loss += loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    
    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)
        
    return eval_loss, correct_labels, predicted_labels

In [20]:
import torch
import os
from tqdm import trange
from sklearn.metrics import classification_report, precision_recall_fscore_support

OUTPUT_DIR = "trained_models/"
MODEL_FILE_NAME = "bert_model.bin"
PATIENCE = 2

loss_history = []
no_improvement = 0
for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
        loss = outputs[0]

        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS
            
        loss.backward()
        tr_loss += loss.item()

        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)  
            
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            
    dev_loss, dev_correct, dev_predicted = evaluate(model, val_dataloader)
    dev_acc = np.mean(dev_predicted == dev_correct)

    print(f"Loss history: {loss_history}")
    print(f"Dev loss: {dev_loss}")
    print(f"Dev accuracy: {dev_acc}")
    
    if len(loss_history) == 0 or dev_loss < min(loss_history):
        no_improvement = 0
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
    else:
        no_improvement += 1
    
    if no_improvement >= PATIENCE: 
        print("No improvement on development set. Finish training.")
        break
        
    
    loss_history.append(dev_loss)

Epoch:   0%|          | 0/25 [00:00<?, ?it/s]

Training iteration:   0%|          | 0/234 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/126 [00:00<?, ?it/s]

Loss history: []
Dev loss: 0.5283551431364484
Dev accuracy: 0.785218253968254


Epoch:   4%|▍         | 1/25 [00:56<22:41, 56.72s/it]

Training iteration:   0%|          | 0/234 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/126 [00:00<?, ?it/s]

Loss history: [0.5283551431364484]
Dev loss: 0.3223799830746083
Dev accuracy: 0.8660714285714286


Epoch:   8%|▊         | 2/25 [01:53<21:43, 56.67s/it]

Training iteration:   0%|          | 0/234 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/126 [00:00<?, ?it/s]

Loss history: [0.5283551431364484, 0.3223799830746083]
Dev loss: 0.29621726573104895
Dev accuracy: 0.8819444444444444


Epoch:  12%|█▏        | 3/25 [02:50<20:46, 56.67s/it]

Training iteration:   0%|          | 0/234 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch:  16%|█▌        | 4/25 [03:38<18:45, 53.59s/it]

Loss history: [0.5283551431364484, 0.3223799830746083, 0.29621726573104895]
Dev loss: 0.3219051977161259
Dev accuracy: 0.8849206349206349


Training iteration:   0%|          | 0/234 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/126 [00:00<?, ?it/s]

Epoch:  16%|█▌        | 4/25 [04:27<23:25, 66.92s/it]

Loss history: [0.5283551431364484, 0.3223799830746083, 0.29621726573104895, 0.3219051977161259]
Dev loss: 0.353393969336702
Dev accuracy: 0.8923611111111112
No improvement on development set. Finish training.





In [21]:
model_state_dict = torch.load(os.path.join(OUTPUT_DIR, MODEL_FILE_NAME), map_location=lambda storage, loc: storage)
model = BertForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased", state_dict=model_state_dict, problem_type='single_label_classification')
model.to(device)

model.eval()

_, train_correct, train_predicted = evaluate(model, train_dataloader)
_, dev_correct, dev_predicted = evaluate(model, val_dataloader)

print("Training performance:", precision_recall_fscore_support(train_correct, train_predicted, average="micro"))
print("Development performance:", precision_recall_fscore_support(dev_correct, dev_predicted, average="micro"))

bert_accuracy = np.mean(dev_predicted == dev_correct)

print(classification_report(dev_correct, dev_predicted))

Evaluation iteration:   0%|          | 0/234 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/126 [00:00<?, ?it/s]

Training performance: (0.9540352752538749, 0.9540352752538749, 0.9540352752538749, None)
Development performance: (0.8819444444444444, 0.8819444444444444, 0.8819444444444444, None)
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       996
           1       0.89      0.87      0.88      1020

    accuracy                           0.88      2016
   macro avg       0.88      0.88      0.88      2016
weighted avg       0.88      0.88      0.88      2016

