In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import classification_report
import torch
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import string

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sequence-classification model and its tokenizer from the same local
# directory, so the vocabulary matches the saved weights.
model = BertForSequenceClassification.from_pretrained("./fake_news_classifier")
tokenizer = BertTokenizer.from_pretrained("./fake_news_classifier")

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [4]:
# Move the model to the selected device. Because this is the cell's last
# expression, the returned module's repr is echoed as the cell output.
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [5]:
# Tokens dropped by preprocess_text: NLTK's English stopword list plus every
# single ASCII punctuation character.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')) | set(punctuation)

def preprocess_text(text):
    """Clean one raw article string before it is tokenized.

    Steps, applied in this order:
      1. lowercase the text,
      2. drop everything up to and including the first "(reuters) - ",
      3. strip HTML markup,
      4. remove "[...]" spans,
      5. remove URLs (any run of non-whitespace starting with "http"),
      6. remove whitespace-separated tokens found in the module-level ``stop`` set.

    Args:
        text: Raw article text (a ``str``).

    Returns:
        The cleaned text, with the surviving tokens joined by single spaces.
    """

    def remove_reuters_prefix(text):
        # Lazy match from the start of the text through the first
        # "(reuters) - "; whatever precedes that marker is discarded.
        pattern = r'^[\s\S]*?\(reuters\) - '
        return re.sub(pattern, '', text)

    def strip_html(text):
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    def remove_square_brackets(text):
        # Raw string: '\[' and '\]' are not valid Python string escapes and
        # trigger a warning in a plain literal. The regex itself is unchanged.
        return re.sub(r'\[[^]]*\]', '', text)

    def remove_urls(text):
        return re.sub(r'http\S+', '', text)

    def remove_stopwords(text):
        # split() already yields whitespace-free tokens, so no strip is needed.
        return " ".join(
            token for token in text.split() if token.lower() not in stop
        )

    # Lowercasing must come first: the Reuters pattern is written in lowercase.
    text = text.lower()
    text = remove_reuters_prefix(text)
    text = strip_html(text)
    text = remove_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)

    return text

In [6]:
def predict(text_list, batch_size=32):
    """Predict a class index for each raw article text.

    Every text is cleaned with ``preprocess_text``, then tokenized and scored
    by the module-level ``tokenizer`` / ``model`` in mini-batches, so memory
    use is bounded by ``batch_size`` rather than by the number of texts.

    Args:
        text_list: Iterable of raw article strings.
        batch_size: Number of texts per forward pass; must be >= 1.
            Padding is applied per batch (to the longest sequence in it).

    Returns:
        1-D NumPy integer array holding the arg-max class index of each input
        text, in input order. Empty when ``text_list`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Preprocess the text
    preprocessed_texts = [preprocess_text(text) for text in text_list]

    # Nothing to score: return an empty int64 array (the dtype argmax yields)
    # without touching the tokenizer or the model.
    if not preprocessed_texts:
        return torch.empty(0, dtype=torch.long).numpy()

    # Make predictions
    model.eval()
    batch_predictions = []
    with torch.no_grad():
        for start in range(0, len(preprocessed_texts), batch_size):
            batch = preprocessed_texts[start:start + batch_size]

            # Tokenize the text
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)

            logits = model(**inputs).logits
            # Move each batch's result to the CPU right away so device memory
            # is not held across batches.
            batch_predictions.append(torch.argmax(logits, dim=-1).cpu())

    return torch.cat(batch_predictions).numpy()

In [7]:
# Evaluation set used for this run. To evaluate on the other file instead, swap in:
# df_new_examples = pd.read_csv("fake_news_generated_by_chatgpt.csv")
df_new_examples = pd.read_csv("manual_testing.csv")

In [8]:
# Peek at the first rows of the evaluation set.
df_new_examples.head()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
0,23431,"BOILER ROOM – EP #46 – Murder, Witchery, Polit...",Tune in to the Alternate Current Radio Network...,Middle-east,"March 3, 2016",0
1,23432,Survey: Top Ten Fears of 2015,Truthstream MediaOut of 88 potential horrors ...,Middle-east,"February 29, 2016",0
2,23433,SHOUT POLL: Should Apple Give FBI Backdoor Acc...,21st Century Wire asks HAVE YOUR SHOUT: Apple...,Middle-east,"February 27, 2016",0
3,23434,"The Final Control: TPP, TTIP, TISA Global Corp...",21st Century Wire says This is a new geopoliti...,Middle-east,"February 27, 2016",0
4,23435,Blood Sport: GOP Presidential Race Takes Anoth...,Patrick Henningsen 21st Century WireAs Baron R...,Middle-east,"February 26, 2016",0


In [9]:
# Peek at the last rows of the evaluation set.
df_new_examples.tail()

Unnamed: 0.1,Unnamed: 0,title,text,subject,date,label
95,21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
96,21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
97,21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
98,21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
99,21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1


In [10]:
# Pull the article bodies and their reference labels out as plain Python lists.
texts = df_new_examples['text'].tolist()
actual_labels = df_new_examples['label'].tolist()

In [11]:
# Run the preprocessing + model pipeline over every article text.
predicted_labels = predict(texts)

In [12]:
# Per-class precision / recall / F1 of the predictions against the CSV labels.
# Variant with readable class names (assumes 0 = fake, 1 = true — TODO confirm against the training labels):
# print(classification_report(actual_labels, predicted_labels, target_names=["Fake", "True"]))
print(classification_report(actual_labels, predicted_labels))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        50

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [13]:
# Attach the predictions as a new column (this mutates df_new_examples in place)
# so they can be viewed next to the reference labels.
df_new_examples['predicted_label'] = predicted_labels

In [14]:
# NOTE(review): this repeats an earlier classification_report call; neither
# actual_labels nor predicted_labels has been reassigned since, so the report is identical.
# Variant with readable class names (assumes 0 = fake, 1 = true — TODO confirm against the training labels):
# print(classification_report(actual_labels, predicted_labels, target_names=["Fake", "True"]))
print(classification_report(actual_labels, predicted_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        50

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [15]:
# Side-by-side view of each article's text, reference label and predicted label.
df_new_examples[['text', 'label', 'predicted_label']]

Unnamed: 0,text,label,predicted_label
0,Tune in to the Alternate Current Radio Network...,0,0
1,Truthstream MediaOut of 88 potential horrors ...,0,0
2,21st Century Wire asks HAVE YOUR SHOUT: Apple...,0,0
3,21st Century Wire says This is a new geopoliti...,0,0
4,Patrick Henningsen 21st Century WireAs Baron R...,0,0
...,...,...,...
95,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1,1
96,"LONDON (Reuters) - LexisNexis, a provider of l...",1,1
97,MINSK (Reuters) - In the shadow of disused Sov...,1,1
98,MOSCOW (Reuters) - Vatican Secretary of State ...,1,1
