In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report
import torch
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Ask PyTorch to release any unused cached CUDA memory before the model is loaded below.
torch.cuda.empty_cache()

In [3]:
# Restore the fine-tuned classifier and its matching tokenizer from the same
# local checkpoint directory.
tokenizer = BertTokenizer.from_pretrained("./fake_news_classifier")
model = BertForSequenceClassification.from_pretrained("./fake_news_classifier")

In [4]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [5]:
# Move the model onto the selected device; the call's return value is what the cell displays.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
# Tokens to discard during preprocessing: NLTK's English stop words plus every
# single character in string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')) | set(punctuation)

# Patterns used by preprocess_text. They are raw strings so that sequences such
# as "\[" reach the regex engine verbatim instead of triggering Python's
# invalid-escape-sequence warning, and they are compiled once instead of on
# every call.
_REUTERS_PREFIX_RE = re.compile(r'^[\s\S]*?\(reuters\) - ')
_SQUARE_BRACKETS_RE = re.compile(r'\[[^]]*\]')
_URL_RE = re.compile(r'http\S+')


def preprocess_text(text):
    """Normalize one raw text into the cleaned form fed to the tokenizer.

    Steps, applied in this order:
      1. Lowercase the text.
      2. Drop everything from the start of the text up to and including the
         first "(reuters) - " marker, if present.
      3. Parse the text as HTML with BeautifulSoup and keep only its text.
      4. Remove "[...]" spans (brackets and their contents).
      5. Remove every run of non-whitespace characters that starts at "http".
      6. Split on whitespace, drop tokens whose lowercase form is in the
         module-level ``stop`` set, and re-join with single spaces.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.lower()
    text = _REUTERS_PREFIX_RE.sub('', text)
    text = BeautifulSoup(text, "html.parser").get_text()
    text = _SQUARE_BRACKETS_RE.sub('', text)
    text = _URL_RE.sub('', text)

    # HTML decoding can reintroduce uppercase characters (e.g. numeric
    # entities), so the stop-word comparison is still done on the lowercase form.
    kept_tokens = [token for token in text.split() if token.lower() not in stop]
    return " ".join(kept_tokens)

In [7]:
def predict(text_list, batch_size=4):
    """Predict a class index for each text using the module-level model.

    Every text is first cleaned with ``preprocess_text``. The cleaned texts are
    then tokenized and scored in batches on ``device``, and the argmax over
    the logits is taken as the prediction.

    Args:
        text_list: Iterable of raw text strings. A single string is treated as
            a one-element list rather than being iterated character by
            character.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        A list with one predicted class index per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if isinstance(text_list, str):
        text_list = [text_list]
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Preprocess the text
    preprocessed_texts = [preprocess_text(text) for text in text_list]

    all_predictions = []

    # Switching to eval mode is loop-invariant, so do it once up front.
    model.eval()

    # Process texts in batches
    for start in range(0, len(preprocessed_texts), batch_size):
        batch_texts = preprocessed_texts[start:start + batch_size]

        # Tokenize the batch, padding to its longest sequence and truncating
        # over-long inputs, then move the tensors to the model's device.
        inputs = tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt").to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)

        batch_predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        all_predictions.extend(batch_predictions)

        # Clear CUDA cache to manage memory
        torch.cuda.empty_cache()

    return all_predictions

In [8]:
# Load the two validation CSVs and tag every row with its ground-truth class
# (0 = fake, 1 = real).
df_fake = pd.read_csv("validation_data/football_news_fake.csv")
df_true = pd.read_csv("validation_data/football_news_real.csv")
df_fake["label"] = 0
df_true["label"] = 1

# Stack both sets, shuffle the rows, and renumber them 0..n-1.
df_new_examples = (
    pd.concat([df_fake, df_true], axis=0)
    # Fixed seed so the shuffle (and every row-order-dependent output that
    # follows) is reproducible after a kernel restart.
    .sample(frac=1, random_state=42)
    # drop=True discards the old index instead of materializing it as an
    # "index" column that then has to be dropped.
    .reset_index(drop=True)
)

In [9]:
# Count missing values per column.
df_new_examples.isna().sum()

tweet    17
label     0
dtype: int64

In [10]:
# Discard every row that has a missing value in any column.
df_new_examples = df_new_examples.dropna()

In [11]:
# Re-count missing values per column to confirm none remain.
df_new_examples.isna().sum()

tweet    0
label    0
dtype: int64

In [12]:
# Peek at the first five rows.
df_new_examples.head(n=5)

Unnamed: 0,tweet,label
0,"mahmoud jaber, the coach of the youth team, se...",1
1,the most important information about the al-ah...,1
2,my internet is suspended every time i open the...,0
3,curse or cheating? al-ghanem wins\nnot to the ...,0
4,italy teams played 10 matches in this round of...,0


In [13]:
# Peek at the last five rows.
df_new_examples.tail(n=5)

Unnamed: 0,tweet,label
41863,fandyke is a loser stretching his leg while th...,0
41864,5 thousand spectators in the friendly team aga...,1
41865,"the first half of the enppi-aswan match, which...",1
41866,"at half past two in the afternoon, thursday, 7...",1
41867,congratulations to the tarshi team. as long as...,0


In [14]:
# Extract the ground-truth labels and the raw tweet strings as plain Python lists.
actual_labels = df_new_examples['label'].to_list()
texts = df_new_examples['tweet'].to_list()

In [15]:
# Run the classifier over every tweet, using predict's default batch size.
predicted_labels = predict(text_list=texts)

  soup = BeautifulSoup(text, "html.parser")


In [16]:
# Per-class precision / recall / F1 of the predictions against the ground truth.
report = classification_report(
    actual_labels,
    predicted_labels,
    target_names=["Fake", "True"],
)
print(report)

              precision    recall  f1-score   support

        Fake       0.63      0.95      0.75     19988
        True       0.91      0.48      0.63     21863

    accuracy                           0.71     41851
   macro avg       0.77      0.72      0.69     41851
weighted avg       0.78      0.71      0.69     41851



In [17]:
# Attach the model's predictions as a new column alongside the ground-truth labels.
df_new_examples = df_new_examples.assign(predicted_label=predicted_labels)

In [18]:
# Show each tweet next to its true and predicted label.
df_new_examples.loc[:, ['tweet', 'label', 'predicted_label']]

Unnamed: 0,tweet,label,predicted_label
0,"mahmoud jaber, the coach of the youth team, se...",1,1
1,the most important information about the al-ah...,1,1
2,my internet is suspended every time i open the...,0,0
3,curse or cheating? al-ghanem wins\nnot to the ...,0,0
4,italy teams played 10 matches in this round of...,0,0
...,...,...,...
41863,fandyke is a loser stretching his leg while th...,0,0
41864,5 thousand spectators in the friendly team aga...,1,0
41865,"the first half of the enppi-aswan match, which...",1,1
41866,"at half past two in the afternoon, thursday, 7...",1,1
