# Fake news classification (RU)


In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global random generator once, up front, so that code drawing
# from it behaves the same after a kernel restart.
SEED = 42
np.random.seed(SEED)

## Data preprocessing


In [2]:
from sklearn.model_selection import train_test_split
import nltk

# Make sure the NLTK stop-word corpus is available locally
# (presumably consumed by the lemmatisation helper — confirm in preprocess.py).
nltk.download('stopwords')

# Both splits are tab-separated files and are read with identical settings.
train_data, test_data = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('dataset/train.tsv', 'dataset/test.tsv')
)

train_data.head()

[nltk_data] Downloading package stopwords to /home/danya-
[nltk_data]     sakharov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,title,is_fake
0,Москвичу Владимиру Клутину пришёл счёт за вмеш...,1
1,Агент Кокорина назвал езду по встречке житейск...,0
2,Госдума рассмотрит возможность введения секрет...,1
3,ФАС заблокировала поставку скоростных трамваев...,0
4,Против Навального завели дело о недоносительст...,1


In [3]:
from preprocess import get_lemmas_from_text

# Add a `lemmas` column to each split, computed from its `title` column.
# Train is handled before test, matching the order of the original cell.
for split_frame in (train_data, test_data):
    split_frame['lemmas'] = get_lemmas_from_text(split_frame['title'])

train_data.head()

  0%|          | 0/5758 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,title,is_fake,lemmas
0,Москвичу Владимиру Клутину пришёл счёт за вмеш...,1,москвич владимир клутин приходить счет вмешате...
1,Агент Кокорина назвал езду по встречке житейск...,0,агент кокорин называть езда встречок житейский...
2,Госдума рассмотрит возможность введения секрет...,1,госдума рассматривать возможность введение сек...
3,ФАС заблокировала поставку скоростных трамваев...,0,фас заблокировать поставка скоростной трамвай ...
4,Против Навального завели дело о недоносительст...,1,против навальный заводить дело недоносительств...


In [4]:
# Hold out 35% of the labelled headlines for validation.
# `random_state` is pinned so that re-running this cell yields the same
# partition; without it the split is drawn from NumPy's global RNG, whose
# state advances every time something consumes random numbers, so a second
# execution of the cell would silently produce different train/val sets.
x_train, x_val, y_train, y_val = train_test_split(
    train_data['lemmas'],
    train_data['is_fake'],
    test_size=0.35,
    random_state=42,
)

# NOTE: the second message says "test", but `y_val` is the validation split
# carved out of train.tsv, not the separate test.tsv data.
print("total train examples %s" % len(y_train))
print("total test examples %s" % len(y_val))

total train examples 3742
total test examples 2016


In [5]:
# Number of labelled rows per `is_fake` value (class balance check).
train_data['is_fake'].value_counts()

1    2879
0    2879
Name: is_fake, dtype: int64

In [6]:
# The raw headline column is removed from both frames in place, leaving the
# lemmas (and, for the labelled data, the target) behind.
for split_frame in (train_data, test_data):
    split_frame.drop(columns='title', inplace=True)

# 80/20 split of the labelled frame with a fixed seed.
train, val = train_test_split(train_data, test_size=0.2, random_state=42)

In [7]:
# Persist the three preprocessed frames as CSV using pandas' default
# `to_csv` options.
for split_frame, csv_path in (
    (train, 'dataset/preprocessed/train.csv'),
    (val, 'dataset/preprocessed/val.csv'),
    (test_data, 'dataset/preprocessed/test.csv'),
):
    split_frame.to_csv(csv_path)

## Passive-Aggressive Classifier (PAC)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.metrics import accuracy_score

# Learn the TF-IDF vocabulary on the training lemmas only, then apply the
# fitted vectorizer to the validation lemmas.
# NOTE: `train` and `val` are rebound here to TF-IDF matrices; from this
# point on they no longer refer to the DataFrames of the earlier 80/20 split.
tfidf = TfidfVectorizer()
train = tfidf.fit_transform(x_train)
val = tfidf.transform(x_val)

# Fit the Passive-Aggressive classifier (C=0.01) and predict the validation
# labels; `fit` returns the estimator, so the two calls are chained.
pac = PassiveAggressiveClassifier(C=0.01)
ypred = pac.fit(train, y_train).predict(val)

accuracy = accuracy_score(y_val, ypred)
print(f'Accuracy Score of Passive Aggresive Scassifier: {round(accuracy*100,2)}%')

Accuracy Score of Passive Aggresive Scassifier: 83.53%


## Fasttext baseline


In [9]:
import fasttext

In [10]:
def write_fasttext_file(path, texts, labels):
    """Write examples in fastText's supervised format, one example per line.

    Every line has the form ``__label__<label> <text>``.

    Args:
        path: Destination file; it is created or truncated.
        texts: Iterable of strings (here: space-separated lemmas).
        labels: Iterable of labels, paired with ``texts`` by position.
    """
    # The encoding is set explicitly: the lemmas are Cyrillic, open()'s
    # default encoding depends on the platform locale, and fastText reads its
    # input as UTF-8. Plain 'w' is enough because the file is never read back.
    with open(path, 'w', encoding='utf-8') as outfile:
        for text, label in zip(texts, labels):
            outfile.write('__label__' + str(label) + ' ' + text + '\n')


# Training file, and the held-out file used for evaluation (it contains the
# validation split despite the `test.txt` name).
write_fasttext_file('fasttext/data.train.txt', x_train, y_train)
write_fasttext_file('fasttext/test.txt', x_val, y_val)

In [11]:
# Hyper-parameters of the supervised fastText baseline.
fasttext_options = {'lr': 1.0, 'epoch': 25, 'wordNgrams': 3}
classifier = fasttext.train_supervised('fasttext/data.train.txt', **fasttext_options)

# `test` returns a triple that is unpacked as (example count, precision, recall).
num, precision, recall = classifier.test('fasttext/test.txt')

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print('Number of examples:', num)

Read 0M words
Number of words:  8392
Number of labels: 2


Precision: 0.8511904761904762
Recall: 0.8511904761904762
Number of examples: 2016


Progress: 100.0% words/sec/thread:  219680 lr:  0.000000 avg.loss:  0.024354 ETA:   0h 0m 0s


## CatBoost

In [20]:
from catboost import Pool, CatBoostClassifier

def _text_pool(texts, labels):
    """Wrap lemma texts and their labels in a CatBoost Pool with `lemmas` as a text feature."""
    return Pool(pd.DataFrame(texts), labels, text_features=['lemmas'])


train_pool = _text_pool(x_train, y_train)
valid_pool = _text_pool(x_val, y_val)

# Training configuration: F1 is tracked on the eval set, training stops after
# 2000 rounds without improvement, and the best iteration is kept.
catboost_params = dict(
    iterations=15000,
    eval_metric='F1',
    task_type='GPU',
    early_stopping_rounds=2000,
    use_best_model=True,
    verbose=1500,
    learning_rate=0.05,
    l2_leaf_reg=3,
)

model = CatBoostClassifier(**catboost_params)
model.fit(train_pool, eval_set=valid_pool)



0:	learn: 0.7734004	test: 0.8218504	best: 0.8218504 (0)	total: 43.8ms	remaining: 10m 57s
1500:	learn: 0.8891929	test: 0.8259557	best: 0.8268941 (1245)	total: 1m 3s	remaining: 9m 31s
3000:	learn: 0.9119651	test: 0.8232911	best: 0.8273780 (1598)	total: 2m 7s	remaining: 8m 29s
bestTest = 0.8273779567
bestIteration = 1598
Shrink model to first 1599 iterations.


<catboost.core.CatBoostClassifier at 0x7fbcd46ee250>

In [14]:
# Candidate values sampled by the randomized hyper-parameter search.
grid = {'learning_rate': [0.03, 0.1, 0.005],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

# The search runs on its own estimator. Binding it to `model` would replace
# the classifier fitted on the text pools, and the evaluation cell that calls
# `model.predict(pd.DataFrame(x_val))` would then score the wrong model (or
# fail) when the notebook is executed top to bottom.
search_model = CatBoostClassifier()

# NOTE(review): the search is fed `val` (the TF-IDF matrix of the validation
# lemmas) and `y_val` rather than training data — confirm this is intended.
randomized_search_result = search_model.randomized_search(grid,
                                                          X=val,
                                                          y=y_val,
                                                          plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6910157	test: 0.6919282	best: 0.6919282 (0)	total: 15.9ms	remaining: 15.9s
1:	learn: 0.6887259	test: 0.6903167	best: 0.6903167 (1)	total: 27.5ms	remaining: 13.7s
2:	learn: 0.6866157	test: 0.6885608	best: 0.6885608 (2)	total: 37.4ms	remaining: 12.4s
3:	learn: 0.6835047	test: 0.6869312	best: 0.6869312 (3)	total: 45.7ms	remaining: 11.4s
4:	learn: 0.6818605	test: 0.6859212	best: 0.6859212 (4)	total: 55.3ms	remaining: 11s
5:	learn: 0.6800685	test: 0.6842224	best: 0.6842224 (5)	total: 63.4ms	remaining: 10.5s
6:	learn: 0.6790034	test: 0.6835625	best: 0.6835625 (6)	total: 70.4ms	remaining: 9.99s
7:	learn: 0.6771388	test: 0.6827857	best: 0.6827857 (7)	total: 78.5ms	remaining: 9.73s
8:	learn: 0.6751656	test: 0.6807507	best: 0.6807507 (8)	total: 87.6ms	remaining: 9.64s
9:	learn: 0.6726558	test: 0.6789683	best: 0.6789683 (9)	total: 94.1ms	remaining: 9.32s
10:	learn: 0.6713767	test: 0.6778510	best: 0.6778510 (10)	total: 102ms	remaining: 9.13s
11:	learn: 0.6702463	test: 0.6771498	best: 0

In [21]:
from sklearn.metrics import classification_report,accuracy_score

# Predict on the validation lemmas, wrapped in a one-column DataFrame, then
# print the per-class report followed by the plain accuracy.
val_frame = pd.DataFrame(x_val)
pred = model.predict(val_frame)

print(classification_report(y_val, pred))
print(accuracy_score(y_val, pred))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       989
           1       0.86      0.80      0.83      1027

    accuracy                           0.83      2016
   macro avg       0.83      0.83      0.83      2016
weighted avg       0.83      0.83      0.83      2016

0.8298611111111112


## BERT


In [11]:
from transformers import AutoTokenizer

# Tokenizer and model weights are taken from the same checkpoint id.
TOKENIZER_NAME = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"
MODEL_NAME = "DeepPavlov/bert-base-bg-cs-pl-ru-cased"

# Name for the fine-tuned weights: everything after the first "/" of the
# checkpoint id.
SAVED_MODEL_NAME = MODEL_NAME.partition("/")[2]

MAX_SEQ_LENGTH = 200
BATCH_SIZE = 16

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

In [12]:
from preprocess import convert_examples_to_inputs

# Convert each (texts, labels) pair into model inputs with the shared
# tokenizer and sequence-length limit; train is converted before validation.
train_features, val_features = (
    convert_examples_to_inputs(texts, labels, MAX_SEQ_LENGTH, tokenizer)
    for texts, labels in ((x_train, y_train), (x_val, y_val))
)

In [13]:
import torch
from preprocess import get_data_loader

def _make_loader(features, shuffle):
    """Build a data loader over `features` with the notebook-wide BATCH_SIZE."""
    return get_data_loader(features, BATCH_SIZE, shuffle=shuffle)


# Only the training loader is shuffled.
train_dataloader = _make_loader(train_features, shuffle=True)
val_dataloader = _make_loader(val_features, shuffle=False)

# No test loader is built. The disabled call below is kept for reference; note
# that its argument list differs from the two calls above.
#test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=False)

In [14]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

In [15]:
from transformers.models.bert.modeling_bert import BertForSequenceClassification

# Load the pretrained checkpoint into a sequence-classification model.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)

# Read the width of the encoder output from the dense output projection of
# encoder layer 1 and install a fresh two-class linear head of that width.
encoder_output_dense = model.bert.encoder.layer[1].output.dense
out_features = encoder_output_dense.out_features
model.classifier = torch.nn.Linear(out_features, 2)

model.to(device)

Some weights of the model checkpoint at DeepPavlov/bert-base-bg-cs-pl-ru-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassificat

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [16]:
from transformers.optimization import get_linear_schedule_with_warmup
from torch.optim import AdamW

GRADIENT_ACCUMULATION_STEPS = 1
NUM_TRAIN_EPOCHS = 5
LEARNING_RATE = 1e-5
WARMUP_PROPORTION = 0.1
MAX_GRAD_NORM = 5
OUTPUT_DIR = "trained_models/"
PATIENCE = 4

# Total number of optimizer updates over the whole run.
# len(train_dataloader) is the real number of batches per epoch. Dividing the
# dataset size by BATCH_SIZE and truncating the product under-counts the
# schedule whenever the dataset size is not a multiple of BATCH_SIZE, which
# makes the linear decay reach zero before training has finished.
# NOTE(review): assumes train() performs one optimizer/scheduler step per
# GRADIENT_ACCUMULATION_STEPS batches — confirm in model.py.
num_train_steps = (len(train_dataloader) // GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

# The first WARMUP_PROPORTION of the updates ramp the learning rate up.
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

# Two parameter groups: parameters whose name contains one of the `no_decay`
# fragments get no weight decay, everything else uses 0.01.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(
        nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(
        nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)

In [17]:
import torch
import os
from model import train

loss_fn = torch.nn.CrossEntropyLoss()

# Bookkeeping variables; they are not passed to train() below.
loss_history, acc_history = [], []
no_improvement = 0

# Settings handed to train() as one positional tuple:
# (epochs, gradient-accumulation steps, gradient-norm limit, saved-model name).
config = (
    int(NUM_TRAIN_EPOCHS),
    GRADIENT_ACCUMULATION_STEPS,
    MAX_GRAD_NORM,
    SAVED_MODEL_NAME,
)
train(device, model, train_dataloader, val_dataloader, loss_fn, optimizer, scheduler, config)

In [18]:
from model import val_report

# Load the saved weights from OUTPUT_DIR; the map_location callback returns
# each storage as it was deserialized, without remapping it to another device.
checkpoint_path = os.path.join(OUTPUT_DIR, SAVED_MODEL_NAME)
model_state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)

# Rebuild the classifier from the base checkpoint and overlay the saved weights.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, state_dict=model_state_dict)

# NOTE(review): the reloaded model is not moved to `device` in this cell —
# presumably val_report does that; confirm in model.py.
dev_correct, dev_predicted = val_report(device, model, train_dataloader, val_dataloader, loss_fn)

Evaluation iteration:   0%|          | 0/234 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/126 [00:00<?, ?it/s]

Training performance: (0.9826296098343132, 0.9826296098343132, 0.9826296098343132, None)
Development performance: (0.8859126984126984, 0.8859126984126984, 0.8859126984126984, None)
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       989
           1       0.89      0.89      0.89      1027

    accuracy                           0.89      2016
   macro avg       0.89      0.89      0.89      2016
weighted avg       0.89      0.89      0.89      2016



In [19]:
# One flag per validation example: True where the gold label and the
# prediction differ (presumably an element-wise comparison of two arrays —
# verify the return types of val_report).
mismatch_flags = list(dev_correct != dev_predicted)

# Print every misclassified example with its gold label and collect its text.
incorrect_samples = []
for feature, is_wrong in zip(val_features, mismatch_flags):
    if not is_wrong:
        continue
    print(feature.text, feature.label_id)
    incorrect_samples.append(feature.text)

ученый ради марс проживать восемь месяц изоляция гавайи 0
москвич пригрозить останавливать работа старый McDonalds россия 0
основной кормилец процент семья сша оказываться женщина 0
бритни спирс обвинять жестокий обращение ребенок 0
искусствовед картина мона лиза изображать мать иван грозный 1
огненный шоколад заинтересовывать российский орган власть 0
центробанк начинать размещать реклама новый купюра 1
медведев потребовать продавать Twitter российский компания 1
камасутра признавать олимпийский вид спорт 1
доллар рухнуть отказ ФНБ американский валюта 1
россия вступать сила мораторий действие довсе 0
борьба безработица правительство отменять социальный пособие 1
экономия кислород мкс катапультировать весь женщина 1
гендиректор белавиа самолет оправдывать 1
лондонский конкурс двойник мистер бин побеждать аутист саратов 1
летний аргентинка забеременеть прививка спутник V 1
прессслужба президент предсказывать серьезный проблема здоровье профессор соловей 1
пол маккартень приглашать свой 