In [17]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report
import torch
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import string

In [18]:
# Release any unused memory held by PyTorch's CUDA caching allocator before the
# model is loaded in the next cell.
torch.cuda.empty_cache()

In [19]:
# The classifier weights and the tokenizer files are read from the same local
# checkpoint directory, so the path is spelled out only once.
MODEL_DIR = "./fake_news_classifier"
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

In [20]:
# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [21]:
# Move the model onto the selected device. As the last expression of the cell,
# the returned module is also what gets displayed below.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [22]:
# Tokens discarded during preprocessing: NLTK's English stopword list plus each
# individual punctuation character from string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

def preprocess_text(text):
    """Clean a raw article string before it is passed to the tokenizer.

    The steps run in this fixed order:

    1. lowercase the whole string;
    2. remove everything up to and including the first ``(reuters) - `` marker;
    3. strip HTML markup with BeautifulSoup, keeping only the text content;
    4. delete every ``[...]`` span;
    5. delete every run of non-whitespace characters that begins with ``http``;
    6. drop whitespace-separated tokens that appear in the module-level
       ``stop`` set.

    Args:
        text: The raw article text as a ``str``.

    Returns:
        The cleaned text, with the surviving tokens joined by single spaces.
    """

    def remove_reuters_prefix(text):
        # Non-greedy match: only the first "(reuters) - " marker and whatever
        # precedes it is removed. The marker is lowercase because the text has
        # already been lowercased by the time this runs.
        return re.sub(r'^[\s\S]*?\(reuters\) - ', '', text)

    def strip_html(text):
        return BeautifulSoup(text, "html.parser").get_text()

    def remove_square_brackets(text):
        # Raw string: "\[" and "\]" are regex escapes, not Python string
        # escapes, and a non-raw literal triggers an invalid-escape warning.
        return re.sub(r'\[[^]]*\]', '', text)

    def remove_urls(text):
        return re.sub(r'http\S+', '', text)

    def remove_stopwords(text):
        # split() already yields tokens without surrounding whitespace. Only
        # whole tokens are compared against `stop`, so punctuation attached to
        # a word is kept.
        return " ".join(
            token for token in text.split() if token.lower() not in stop
        )

    text = text.lower()
    for step in (
        remove_reuters_prefix,
        strip_html,
        remove_square_brackets,
        remove_urls,
        remove_stopwords,
    ):
        text = step(text)

    return text

In [23]:
def predict(text_list, batch_size=4):
    """Predict a class index for every text using the module-level model.

    Each text is first cleaned with ``preprocess_text``. The cleaned texts are
    then tokenized with the module-level ``tokenizer`` and scored by the
    module-level ``model`` on ``device``, ``batch_size`` texts at a time.

    Args:
        text_list: Iterable of raw text strings.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A list holding one predicted class index (argmax over the logits) per
        input text, in input order. An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value would
            otherwise make the batch loop run zero times and silently return
            no predictions.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    # Preprocess the text
    preprocessed_texts = [preprocess_text(text) for text in text_list]

    all_predictions = []

    # Evaluation mode does not change between batches, so set it once up front
    # instead of on every iteration.
    model.eval()

    # Process texts in batches
    for start in range(0, len(preprocessed_texts), batch_size):
        batch_texts = preprocessed_texts[start:start + batch_size]

        # Tokenize the batch, padding to its longest member, and move the
        # resulting tensors to the same device as the model.
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=-1).cpu().numpy()
        all_predictions.extend(batch_predictions)

        # Clear CUDA cache to manage memory
        torch.cuda.empty_cache()

    return all_predictions

In [24]:
# Load the Syria news validation file; it is decoded with the 'latin' codec.
VALIDATION_CSV = "validation_data/news_syria.csv"
df_new_examples = pd.read_csv(VALIDATION_CSV, encoding='latin')

In [25]:
# Count the missing values in each column.
df_new_examples.isna().sum()

unit_id            0
article_title      0
article_content    0
source             0
date               0
location           0
labels             0
dtype: int64

In [26]:
# Drop, in place, every row that has a missing value in any column. The null
# counts displayed just above are all zero, so nothing is removed for this file.
df_new_examples.dropna(inplace=True)

In [27]:
# Re-count the missing values per column after the dropna step.
df_new_examples.isna().sum()

unit_id            0
article_title      0
article_content    0
source             0
date               0
location           0
labels             0
dtype: int64

In [28]:
# Preview the first five rows.
df_new_examples.head(n=5)

Unnamed: 0,unit_id,article_title,article_content,source,date,location,labels
0,1914947530,Syria attack symptoms consistent with nerve ag...,Wed 05 Apr 2017 Syria attack symptoms consiste...,nna,4/5/2017,idlib,0
1,1914947532,Homs governor says U.S. attack caused deaths b...,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,nna,4/7/2017,homs,0
2,1914947533,Death toll from Aleppo bomb attack at least 112,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,nna,4/16/2017,aleppo,0
3,1914947534,Aleppo bomb blast kills six Syrian state TV,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,nna,4/19/2017,aleppo,0
4,1914947535,29 Syria Rebels Dead in Fighting for Key Alepp...,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,nna,7/10/2016,aleppo,0


In [29]:
# Preview the last five rows.
df_new_examples.tail(n=5)

Unnamed: 0,unit_id,article_title,article_content,source,date,location,labels
799,1965511221,Turkish Bombardment Kills 20 Civilians in Syria,28-08-2016 Turkish Bombardment Kills 20 Civili...,manar,8/28/2016,aleppo,1
800,1965511222,Martyrs as Terrorists Shell Aleppos Salah Eddin,17-08-2016 Martyrs as Terrorists Shell Aleppos...,manar,8/1/2016,aleppo,1
801,1965511224,Chemical Attack Kills Five Syrians in Aleppo SANA,03-08-2016 Chemical Attack Kills Five Syrians ...,manar,8/3/2016,aleppo,0
802,1965511226,5 Killed as Russian Military Chopper Shot down...,01-08-2016 5 Killed as Russian Military Choppe...,manar,8/1/2016,idlib,1
803,1965511231,Syrian Army Kills 48 ISIL Terrorists in Deir E...,April 6 2017 Syrian Army Kills 48 ISIL Terrori...,manar,4/4/2017,deir ezzor,1


In [30]:
# Pull the model inputs and the ground-truth labels out as plain Python lists.
texts, actual_labels = (
    df_new_examples[column].to_list()
    for column in ('article_content', 'labels')
)

In [31]:
# Run the classifier over every article, using predict's default batch size.
predicted_labels = predict(text_list=texts)

In [32]:
# Pin the class order explicitly so that label 0 is always reported as "Fake"
# and label 1 as "True". Without `labels`, the names are matched against
# whichever classes happen to occur in the data, which raises or mislabels
# rows if one class is absent.
print(
    classification_report(
        actual_labels,
        predicted_labels,
        labels=[0, 1],
        target_names=["Fake", "True"],
    )
)

              precision    recall  f1-score   support

        Fake       0.49      0.23      0.31       378
        True       0.53      0.79      0.64       426

    accuracy                           0.52       804
   macro avg       0.51      0.51      0.47       804
weighted avg       0.51      0.52      0.48       804



In [33]:
# Attach the predictions as a new column next to the ground-truth labels.
df_new_examples = df_new_examples.assign(predicted_label=predicted_labels)

In [34]:
# Show the article text side by side with its true and predicted label.
comparison_columns = ['article_content', 'labels', 'predicted_label']
df_new_examples.loc[:, comparison_columns]

Unnamed: 0,article_content,labels,predicted_label
0,Wed 05 Apr 2017 Syria attack symptoms consiste...,0,1
1,Fri 07 Apr 2017 at 0914 Homs governor says U.S...,0,0
2,Sun 16 Apr 2017 Death toll from Aleppo bomb at...,0,1
3,Wed 19 Apr 2017 Aleppo bomb blast kills six Sy...,0,1
4,Sun 10 Jul 2016 29 Syria Rebels Dead in Fighti...,0,1
...,...,...,...
799,28-08-2016 Turkish Bombardment Kills 20 Civili...,1,1
800,17-08-2016 Martyrs as Terrorists Shell Aleppos...,1,1
801,03-08-2016 Chemical Attack Kills Five Syrians ...,0,1
802,01-08-2016 5 Killed as Russian Military Choppe...,1,1
