# Fake news classification (RU)


In [16]:
import pandas as pd
import numpy as np

np.random.seed(42)

## Data preprocessing


In [61]:
train_data = pd.read_csv('dataset/train.tsv', sep='\t')
test_data = pd.read_csv('dataset/test.tsv', sep='\t')

train_data.head()

Unnamed: 0,title,is_fake
0,Москвичу Владимиру Клутину пришёл счёт за вмеш...,1
1,Агент Кокорина назвал езду по встречке житейск...,0
2,Госдума рассмотрит возможность введения секрет...,1
3,ФАС заблокировала поставку скоростных трамваев...,0
4,Против Навального завели дело о недоносительст...,1


In [62]:
from multiprocessing import Pool
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import re
import nltk
from string import punctuation
from pymystem3 import Mystem

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/danya-
[nltk_data]     sakharov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
from nltk.corpus import stopwords

mystem_analyzer = Mystem()

def base_preprocessing(text):
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(f'|'.join(["»", "«", "—"]), '', text)
    text = re.sub(' +', ' ', text)
    text = re.sub('[{}]'.format(punctuation), '', text)
    text = mystem_analyzer.lemmatize(text)

    return ' '.join([word for word in text if word not in stopwords.words('russian')+[' ', '\n', " "]])

In [64]:
def get_lemmas_from_text(text_series):
    mystem_analyzer = Mystem()
    with Pool(8) as pool:
        lemmas = list(
            tqdm(pool.imap(base_preprocessing, text_series), total=len(text_series)))
    return lemmas

In [65]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [66]:
train_data['lemmas'] = get_lemmas_from_text(train_data['title'])
test_data['lemmas'] = get_lemmas_from_text(test_data['title'])
train_data.head()

TypeError: '<' not supported between instances of 'Mystem' and 'int'

In [23]:
x_train, x_val, y_train, y_val = train_test_split(
    train_data['lemmas'], train_data['is_fake'], test_size=0.35)

print("total train examples %s" % len(y_train))
print("total test examples %s" % len(y_val))

total train examples 3742
total test examples 2016


In [24]:
train_data.is_fake.value_counts()

1    2879
0    2879
Name: is_fake, dtype: int64

In [25]:
train_data.drop('title',axis=1, inplace=True)
test_data.drop('title', axis=1, inplace=True)
train, val = train_test_split(train_data, test_size=0.2, random_state=42)

In [26]:
train.to_csv('dataset/preprocessed/train.csv')
val.to_csv('dataset/preprocessed/val.csv')
test_data.to_csv('dataset/preprocessed/test.csv')

## Fasttext baseline


In [12]:
import fasttext

In [13]:
with open('fasttext/data.train.txt', 'w+') as outfile:
    for x, y in zip(x_train, y_train):
        outfile.write('__label__' + str(y) + ' ' + x + '\n')


with open('fasttext/test.txt', 'w+') as outfile:
    for x, y in zip(x_val, y_val):
        outfile.write('__label__' + str(y) + ' ' + x + '\n')

In [14]:
classifier = fasttext.train_supervised(
    'fasttext/data.train.txt', lr=1.0, epoch=25, wordNgrams=3)
num, presicion, recall = classifier.test('fasttext/test.txt')

print(f'Precision: {presicion}')
print(f'Recall: {recall}')
print('Number of examples:', num)

Read 0M words
Number of words:  8392
Number of labels: 2


Precision: 0.8447420634920635
Recall: 0.8447420634920635
Number of examples: 2016


Progress: 100.0% words/sec/thread:  443181 lr:  0.000000 avg.loss:  0.030531 ETA:   0h 0m 0s


## BERT


In [27]:
from transformers import AutoTokenizer

TOKENIZER_NAME = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"
MODEL_NAME = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"
SAVED_MODEL_NAME = MODEL_NAME.split("/", 1)[1]
MAX_SEQ_LENGTH = 200
BATCH_SIZE = 16

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [28]:
import logging
import numpy as np

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


class BertInputItem(object):
    """An item with all the necessary attributes for finetuning BERT."""

    def __init__(self, text, input_ids, input_mask, segment_ids, label_id):
        self.text = text
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


def convert_examples_to_inputs(example_texts, example_labels, max_seq_length, tokenizer, verbose=0):
    """Loads a data file into a list of `InputBatch`s."""

    input_items = []
    examples = zip(example_texts, example_labels)
    for (ex_index, (text, label)) in enumerate(examples):

        # Create a list of token ids
        input_ids = tokenizer.encode(f"[CLS] {text} [SEP]")
        if len(input_ids) > max_seq_length:
            input_ids = input_ids[:max_seq_length]

        # All our tokens are in the first input segment (id 0).
        segment_ids = [0] * len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        input_items.append(
            BertInputItem(text=text,
                          input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=label))

    return input_items


train_features = convert_examples_to_inputs(
    x_train, y_train, MAX_SEQ_LENGTH, tokenizer, verbose=0)
val_features = convert_examples_to_inputs(
    x_val, y_val, MAX_SEQ_LENGTH, tokenizer)

In [29]:
import torch
from torch.utils.data import TensorDataset, DataLoader


def get_data_loader(features, batch_size, shuffle=True):

    all_input_ids = torch.tensor(
        [f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor(
        [f.label_id for f in features], dtype=torch.long)
    data = TensorDataset(all_input_ids, all_input_mask,
                         all_segment_ids, all_label_ids)

    dataloader = DataLoader(data, shuffle=shuffle, batch_size=batch_size)
    return dataloader


train_dataloader = get_data_loader(train_features, BATCH_SIZE, shuffle=True)
val_dataloader = get_data_loader(val_features, BATCH_SIZE, shuffle=False)
#test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [31]:
from transformers.models.bert.modeling_bert import BertForSequenceClassification

#config = BertConfig.from_pretrained("DeepPavlov/rubert-base-cased", num_labels=1)

model = BertForSequenceClassification.from_pretrained(MODEL_NAME)

out_features = model.bert.encoder.layer[1].output.dense.out_features
model.classifier = torch.nn.Linear(out_features, 2)

model.to(device)

Some weights of the model checkpoint at DeepPavlov/bert-base-bg-cs-pl-ru-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificat

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [32]:
from transformers.optimization import get_linear_schedule_with_warmup
from torch.optim import AdamW

GRADIENT_ACCUMULATION_STEPS = 1
NUM_TRAIN_EPOCHS = 35
LEARNING_RATE = 1e-5
WARMUP_PROPORTION = 0.1
MAX_GRAD_NORM = 5

num_train_steps = int(len(train_dataloader.dataset) /
                      BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(
        nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(
        nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)

In [33]:
def evaluate(model, dataloader, loss_fn):
    model.eval()

    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for _, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=input_mask,
                            token_type_ids=segment_ids, labels=label_ids)
        logits = outputs[1]

        eval_loss = loss_fn(outputs.logits, label_ids)

        label_ids = label_ids.to('cpu').numpy()
        outputs = np.argmax(logits.to('cpu'), axis=1)

        predicted_labels += list(outputs)
        correct_labels += list(label_ids)

        eval_loss += eval_loss.item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps

    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)

    return eval_loss, correct_labels, predicted_labels

In [36]:
import torch
import os
from tqdm import trange
from sklearn.metrics import classification_report, precision_recall_fscore_support

OUTPUT_DIR = "trained_models/"
PATIENCE = 4
loss_fn = torch.nn.CrossEntropyLoss()
loss_history = []
acc_history = []
no_improvement = 0
# for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
#     model.train()
#     tr_loss = 0
#     nb_tr_examples, nb_tr_steps = 0, 0
#     for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
#         batch = tuple(t.to(device) for t in batch)
#         input_ids, input_mask, segment_ids, label_ids = batch

#         outputs = model(input_ids, attention_mask=input_mask,
#                         token_type_ids=segment_ids, labels=label_ids)
#         loss = outputs[0]

#         preds = torch.argmax(outputs.logits, dim=1)
#         loss = loss_fn(outputs.logits, label_ids)

#         if GRADIENT_ACCUMULATION_STEPS > 1:
#             loss = loss / GRADIENT_ACCUMULATION_STEPS

#         loss.backward()
#         tr_loss += loss.item()

#         if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
#             torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

#             optimizer.step()
#             optimizer.zero_grad()
#             scheduler.step()

#     dev_loss, dev_correct, dev_predicted = evaluate(
#         model, val_dataloader, loss_fn)
#     dev_acc = np.mean(dev_predicted == dev_correct)

#     if (len(loss_history) > 3):
#         print(f"Loss history: {loss_history[-3:]}")
#     else:
#         print(f"Loss history: {loss_history}")
#     print(f"Dev loss: {dev_loss}")
#     print(f"Dev accuracy: {dev_acc}")

#     if len(acc_history) == 0 or dev_acc > max(acc_history):
#         print('New record, model saved')
#         no_improvement = 0
#         model_to_save = model.module if hasattr(model, 'module') else model
#         output_model_file = os.path.join(OUTPUT_DIR, SAVED_MODEL_NAME)
#         torch.save(model_to_save.state_dict(), output_model_file)
#     elif dev_acc < acc_history[-1]:
#         no_improvement += 1

#     if no_improvement > PATIENCE:
#         print("No improvement on development set. Finish training.")
#         break

#     loss_history.append(dev_loss.item())
#     acc_history.append(dev_acc)

In [37]:
model_state_dict = torch.load(os.path.join(OUTPUT_DIR, SAVED_MODEL_NAME), map_location=lambda storage, loc: storage)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, state_dict=model_state_dict)

model.to(device)
model.eval()

_, train_correct, train_predicted = evaluate(model, train_dataloader, loss_fn)
_, dev_correct, dev_predicted = evaluate(model, val_dataloader, loss_fn)

print("Training performance:", precision_recall_fscore_support(
    train_correct, train_predicted, average="micro"))
print("Development performance:", precision_recall_fscore_support(
    dev_correct, dev_predicted, average="micro"))

bert_accuracy = np.mean(dev_predicted == dev_correct)

print(classification_report(dev_correct, dev_predicted))

Evaluation iteration:   0%|          | 0/234 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/126 [00:00<?, ?it/s]

Training performance: (1.0, 1.0, 1.0, None)
Development performance: (0.8988095238095238, 0.8988095238095238, 0.8988095238095238, None)
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       989
           1       0.89      0.91      0.90      1027

    accuracy                           0.90      2016
   macro avg       0.90      0.90      0.90      2016
weighted avg       0.90      0.90      0.90      2016



In [56]:
incorrect_samples = []
for feature, predict in zip(val_features, list(dev_correct != dev_predicted)):
    if predict:
        print(feature.text, feature.label_id) 
        incorrect_samples.append(feature.text)

ученый ради марс проживать восемь месяц изоляция гавайи 0
москвич пригрозить останавливать работа старый McDonalds россия 0
основной кормилец процент семья сша оказываться женщина 0
бритни спирс обвинять жестокий обращение ребенок 0
искусствовед картина мона лиза изображать мать иван грозный 1
огненный шоколад заинтересовывать российский орган власть 0
медведев потребовать продавать Twitter российский компания 1
кудрин призывать перевести россия ветроэнергетика прямо 1
камасутра признавать олимпийский вид спорт 1
доллар рухнуть отказ ФНБ американский валюта 1
россия вступать сила мораторий действие довсе 0
борьба безработица правительство отменять социальный пособие 1
экономия кислород мкс катапультировать весь женщина 1
гендиректор белавиа самолет оправдывать 1
лондонский конкурс двойник мистер бин побеждать аутист саратов 1
летний аргентинка забеременеть прививка спутник V 1
пол маккартень приглашать свой концерт виртуальный реальность 0
анатолий чубайс возглавлять сбербанк 1
госдеп 