In [1]:
from google.colab import drive
import os

drive.mount('/content/gdrive')
path = "/content/gdrive/My Drive/DongPN/Fake_news_detection"
os.chdir(path)

Mounted at /content/gdrive


In [59]:
# !pip3 install fairseq
# !pip3 install fastbpe
# !pip3 install vncorenlp
# !pip3 install transformers


# Load data

In [3]:
import pandas as pd
import numpy as np

PATH = "Data"
train_fe = pd.read_csv(PATH+'/train_posts.csv')
train_label = pd.read_csv(PATH+'/train_labels.csv')
test_fe = pd.read_csv(PATH+'/test_posts.csv')
test_label = pd.read_csv(PATH+'/test_labels.csv')
print("Train features shape:",train_fe.shape)
print("Train labels shape:",train_label.shape)
print("Test features shape:",test_fe.shape)
print("Test labels shape:",test_label.shape)

Train features shape: (1284, 5)
Train labels shape: (1284, 2)
Test features shape: (322, 5)
Test labels shape: (322, 2)


In [4]:
train_sents = train_fe['post_message'].values
train_labels = train_label['label'].values
test_text = test_fe['post_message'].values
test_labels = test_label['label'].values

In [5]:
from sklearn.model_selection import train_test_split

train_sents, val_sents, train_labels, val_labels = train_test_split(train_sents, train_labels, test_size=0.1, random_state=42)

In [6]:
print("Train Dataset:", train_sents.shape, train_labels.shape)
print("Validation Dataset:", val_sents.shape)
print("Test Dataset:", test_text.shape, test_labels.shape)

Train Dataset: (1155,) (1155,)
Validation Dataset: (129,)
Test Dataset: (322,) (322,)


In [7]:
# (pd.DataFrame(train_label['label'])).value_counts()

In [8]:
# (pd.DataFrame(test_labels)).value_counts()

# Posts feature prediction

## Convert sentence to vector

In [9]:
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
import argparse
from tqdm import tqdm

parser = argparse.ArgumentParser()
parser.add_argument('--bpe-codes', 
    default="PhoBERT_base_transformers/bpe.codes",
    required=False,
    type=str,
    help='path to fastBPE BPE'
)
args, unknown = parser.parse_known_args()
bpe = fastBPE(args)

# Load the dictionary
vocab = Dictionary()
vocab.add_from_file("PhoBERT_base_transformers/dict.txt")

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
MAX_LEN = 256
def convert_line(sents, Bpe=bpe, Vocab=vocab):
  ids = []
  for sent in sents:
    subwords = '<s> ' + bpe.encode(sent) + ' </s>'
    encoded_sent = vocab.encode_line(subwords, append_eos=True, add_if_not_exist=False).long().tolist()
    ids.append(encoded_sent)
  id = pad_sequences(ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
  return id

In [11]:
train_ids = convert_line(train_sents)
val_ids = convert_line(val_sents)
test_ids = convert_line(test_text)
print(train_ids.shape)
print(val_ids.shape)
print(test_ids.shape)

(1155, 256)
(129, 256)
(322, 256)


## Create Masks

In [12]:
def create_masks(ids):
  masks = []
  for sent in ids:
    mask = [int(token_id > 0) for token_id in sent]
    masks.append(mask)
  return masks

train_masks = create_masks(train_ids)
val_masks = create_masks(val_ids)
test_masks =create_masks(test_ids)

## Load DataLoader

In [13]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch

def Load_dataloader(ids, labels, masks, batch_size=32):
  inputs = torch.tensor(ids)
  labels = torch.tensor(labels)
  masks = torch.tensor(masks)

  data = TensorDataset(inputs, masks, labels)
  sampler = SequentialSampler(data)
  dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

  return dataloader
  
train_dataloader = Load_dataloader(train_ids, train_labels, train_masks)
val_dataloader = Load_dataloader(val_ids, val_labels, val_masks)
test_dataloader = Load_dataloader(test_ids, test_labels, test_masks)

## Create model

In [29]:
# from transformers import RobertaForSequenceClassification, RobertaConfig, AdamW

# config = RobertaConfig.from_pretrained(
#     "PhoBERT_base_transformers/config.json", from_tf=False, num_labels = 2, output_hidden_states=False,
# )
# BERT = RobertaForSequenceClassification.from_pretrained(
#     "PhoBERT_base_transformers/model.bin",
#     config=config
# )

Some weights of the model checkpoint at PhoBERT_base_transformers/model.bin were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PhoBERT_base_transformers/model

In [30]:
# BERT.cuda()
# print('Done')

Done


## Evaluate

In [31]:
# import numpy as np
# from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

# def flat_accuracy(preds, labels):
#     pred_flat = np.argmax(preds, axis=1).flatten()
#     labels_flat = labels.flatten()
    
#     F1_score = f1_score(pred_flat, labels_flat, average='macro')
#     cm = confusion_matrix(pred_flat, labels_flat)
    
#     return accuracy_score(pred_flat, labels_flat), F1_score, cm

## Train model

In [32]:
# import random
# from tqdm import tqdm_notebook
# device = 'cuda'
# epochs = 10

# param_optimizer = list(BERT.named_parameters())
# no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# optimizer_grouped_parameters = [
#     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
#     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
# ]

# optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5, correct_bias=False)


# for epoch_i in range(0, epochs):
#     print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
#     print('Training...')

#     total_loss = 0
#     BERT.train()
#     train_accuracy = 0
#     nb_train_steps = 0
#     train_f1 = 0
    
#     for step, batch in tqdm_notebook(enumerate(train_dataloader)):
#         b_input_ids = batch[0].to(device)
#         b_input_mask = batch[1].to(device)
#         b_labels = batch[2].to(device)

#         BERT.zero_grad()
#         outputs = BERT(b_input_ids, 
#             token_type_ids=None, 
#             attention_mask=b_input_mask, 
#             labels=b_labels)
#         loss = outputs[0]
#         total_loss += loss.item()
        
#         logits = outputs[1].detach().cpu().numpy()
#         label_ids = b_labels.to('cpu').numpy()
#         tmp_train_accuracy, tmp_train_f1, cm = flat_accuracy(logits, label_ids)
#         train_accuracy += tmp_train_accuracy
#         train_f1 += tmp_train_f1
#         nb_train_steps += 1
        
#         loss.backward()
#         torch.nn.utils.clip_grad_norm_(BERT.parameters(), 1.0)
#         optimizer.step()
        
#     avg_train_loss = total_loss / len(train_dataloader)
#     print(" Accuracy: {0:.4f}".format(train_accuracy/nb_train_steps))
#     print(" F1 score: {0:.4f}".format(train_f1/nb_train_steps))
#     print(cm)
#     print(" Average training loss: {0:.4f}".format(avg_train_loss))

#     print("Running Validation...")
#     BERT.eval()
#     eval_loss, eval_accuracy = 0, 0
#     nb_eval_steps, nb_eval_examples = 0, 0
#     eval_f1 = 0
#     for batch in tqdm_notebook(val_dataloader):

#         batch = tuple(t.to(device) for t in batch)

#         b_input_ids, b_input_mask, b_labels = batch

#         with torch.no_grad():
#             outputs = BERT(b_input_ids, 
#             token_type_ids=None, 
#             attention_mask=b_input_mask)
#             logits = outputs[0]
#             logits = logits.detach().cpu().numpy()
#             label_ids = b_labels.to('cpu').numpy()

#             tmp_eval_accuracy, tmp_eval_f1, cm = flat_accuracy(logits, label_ids)

#             eval_accuracy += tmp_eval_accuracy
#             eval_f1 += tmp_eval_f1
#             nb_eval_steps += 1
#     print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
#     print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))
#     print(cm)
# print("Training complete!")

Training...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.7601
 F1 score: 0.6770
[[1 0]
 [0 2]]
 Average training loss: 0.4997
Running Validation...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.8812
 F1 score: 0.8749
[[1]]
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.8885
 F1 score: 0.8759
[[1 0]
 [0 2]]
 Average training loss: 0.2807
Running Validation...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.8562
 F1 score: 0.8366
[[1]]
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9367
 F1 score: 0.9295
[[1 0]
 [0 2]]
 Average training loss: 0.1790
Running Validation...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.8875
 F1 score: 0.8812
[[1]]
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9662
 F1 score: 0.9626
[[1 0]
 [0 2]]
 Average training loss: 0.1143
Running Validation...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.9062
 F1 score: 0.9010
[[1]]
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9662
 F1 score: 0.9616
[[1 0]
 [0 2]]
 Average training loss: 0.1068
Running Validation...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.8875
 F1 score: 0.8824
[[1]]
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9806
 F1 score: 0.9781
[[1 0]
 [0 2]]
 Average training loss: 0.0618
Running Validation...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.9000
 F1 score: 0.8935
[[1]]
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9924
 F1 score: 0.9913
[[1 0]
 [0 2]]
 Average training loss: 0.0273
Running Validation...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.8688
 F1 score: 0.8514
[[1]]
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9958
 F1 score: 0.9953
[[1 0]
 [0 2]]
 Average training loss: 0.0164
Running Validation...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.8875
 F1 score: 0.8761
[[1]]
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9949
 F1 score: 0.9944
[[1 0]
 [0 2]]
 Average training loss: 0.0208
Running Validation...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.8875
 F1 score: 0.8814
[[1]]
Training...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


 Accuracy: 0.9958
 F1 score: 0.9945
[[1 0]
 [0 2]]
 Average training loss: 0.0086
Running Validation...


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))


 Accuracy: 0.8750
 F1 score: 0.8667
[[1]]
Training complete!


In [37]:
predictions , true_labels = [], []
eval_accuracy = 0
eval_f1 = 0
nb_eval_steps = 0
# preds = []
for batch in test_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
    outputs = BERT(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
  
  logits = outputs[0]

  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  # print(np.argmax(logits, axis=1).flatten())
  # preds.append(np.argmax(logits, axis=1).flatten())

  tmp_eval_accuracy, tmp_eval_f1, cm = flat_accuracy(logits, label_ids)

  predictions.append(logits)
  true_labels.append(label_ids)
  eval_accuracy += tmp_eval_accuracy
  eval_f1 += tmp_eval_f1
  nb_eval_steps += 1
print(" Accuracy: {0:.4f}".format(eval_accuracy/nb_eval_steps))
print(" F1 score: {0:.4f}".format(eval_f1/nb_eval_steps))


 Accuracy: 0.8807
 F1 score: 0.8657


## Save model

In [103]:
# # Save model
Path = "models/Bert_classification_balance_data.pt"

In [39]:
# torch.save(BERT, Path)

## Load model

In [104]:
model = torch.load(Path)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=0)
      (position_embeddings): Embedding(258, 768, padding_idx=0)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

## Predict Sentences

In [58]:
def predict_sents(sents, model = model):
  ids = convert_line(sents)
  mask = create_masks(ids)
  loader = Load_dataloader(ids,np.zeros(ids.shape[0]),mask)
  pred_labels = []
  ## Predict
  for batch in loader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
      outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    arr = np.argmax(logits, axis=1).flatten()
    for i in arr:
      pred_labels.append(i)

  return pred_labels

In [99]:
print(classification_report(test_labels, predict_sents(test_text, BERT)))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90       203
           1       0.89      0.74      0.81       119

    accuracy                           0.87       322
   macro avg       0.87      0.84      0.85       322
weighted avg       0.87      0.87      0.87       322



In [145]:
predict_sents(["Đông đấm Nam hai cái"])

[1]

# Numbers Feature Prediction

In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

cols = ['num_like_post', 'num_comment_post', 'num_share_post']

X_train_num = train_fe[cols].values
y_train_num = train_label['label'].values
X_test_num = test_fe[cols].values
y_test_num = test_label['label'].values


In [71]:
RF = RandomForestClassifier()
RF.fit(X_train_num, y_train_num)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Evaluate

In [100]:
y_pred_num = RF.predict(X_test_num)
print(classification_report(y_test_num, y_pred_num))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       203
           1       0.71      0.71      0.71       119

    accuracy                           0.79       322
   macro avg       0.77      0.77      0.77       322
weighted avg       0.79      0.79      0.79       322



# Combine models

In [86]:
X_pred_sent_train = predict_sents(train_fe['post_message'].values) ## Feature sentences
X_pred_sent_test = predict_sents(test_fe['post_message'].values)

In [87]:
X_pred_num_train = RF.predict(X_train_num) ## Feature numbers
X_pred_num_test = RF.predict(X_test_num)

In [88]:
X_train_combine = np.vstack([X_pred_sent_train,X_pred_num_train]).T
X_test_combine = np.vstack([X_pred_sent_test,X_pred_num_test]).T
y_train_combine = train_label['label'].values
y_test_combine = test_label['label'].values
print("X train data:",X_train_combine.shape)
print("y train data:",y_train_combine.shape)
print("X test data:",X_test_combine.shape)
print("y test data:",y_test_combine.shape)

X train data: (1284, 2)
y train data: (1284,)
X test data: (322, 2)
y test data: (322,)


In [94]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(C=200, random_state=42)
LR.fit(X_train_combine, y_train_combine)

LogisticRegression(C=200, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Evaluate Combine Model

In [101]:
y_pred_combine = LR.predict(X_test_combine)

print(classification_report(y_test_combine, y_pred_combine))

              precision    recall  f1-score   support

           0       0.96      0.80      0.87       203
           1       0.73      0.95      0.83       119

    accuracy                           0.85       322
   macro avg       0.85      0.87      0.85       322
weighted avg       0.88      0.85      0.86       322



In [102]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test_combine, y_pred_combine))

[[162  41]
 [  6 113]]
