In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-and-real-news-dataset/True.csv
/kaggle/input/fake-and-real-news-dataset/Fake.csv


##### SETUP ENV

In [14]:
# !pip install -q transformers datasets accelerate
# !pip install nltk

In [32]:
import os
import re
import string
import numpy as np
import pandas as pd
from tqdm import tqdm

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [16]:
true_df = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/True.csv")
fake_df = pd.read_csv("/kaggle/input/fake-and-real-news-dataset/Fake.csv")

true_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [17]:
true_df["label"] = "True"
fake_df["label"] = "Fake"

# Reset indeks DataFrame
true_df = true_df.reset_index(drop=True)
fake_df = fake_df.reset_index(drop=True)

# Merge DataFrames by setting ignore_index=True
dataframe = pd.concat([true_df, fake_df], ignore_index=True)

In [18]:
dataframe.head(10)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",True
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",True
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",True
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",True
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",True
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017",True
6,"Trump says Russia probe will be fair, but time...","WEST PALM BEACH, Fla (Reuters) - President Don...",politicsNews,"December 29, 2017",True
7,Factbox: Trump on Twitter (Dec 29) - Approval ...,The following statements were posted to the ve...,politicsNews,"December 29, 2017",True
8,Trump on Twitter (Dec 28) - Global Warming,The following statements were posted to the ve...,politicsNews,"December 29, 2017",True
9,Alabama official to certify Senator-elect Jone...,WASHINGTON (Reuters) - Alabama Secretary of St...,politicsNews,"December 28, 2017",True


## Text Cleaning & Processing

In [35]:
def text_cleaning(text: str) -> str:
    # delete number
    text = text.translate(str.maketrans("", "", string.digits))

    # sstring to lower
    text = text.lower()

    # Delete text in square brackets, such as [text]
    text = re.sub('\[.*?\]', '', text)

    # delete web (URL)
    text = re.sub('https?://\S+|www\.\S+', '', text)

    # delete tag HTML
    text = re.sub('<.*?>+', '', text)

    # Removing punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Delete new line characters
    text = re.sub('\n', '', text)

    # Delete words that contain numbers
    text = re.sub('\w*\d\w*', '', text)

    return text

def text_preprocessing(text: str) -> str:
    # Tokenisasi tekt RegexpTokenizer
    tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")

    # Text cleaning
    cleaning_text = text_cleaning(text)

    # Tokenize clean text
    tokenized_text = tokenizer.tokenize(cleaning_text)
    
    # Removing stopwords (common words that are often irrelevant)
    remove_stopwords = [word for word in tokenized_text if word not in stopwords.words("english")]
    
    # Combining the remaining words into clean text
    combined_text = " ".join(remove_stopwords)

    return combined_text


In [20]:
dataframe["text"] = dataframe["text"] + dataframe["title"]

In [21]:
dataframe.loc[0, "text"]

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [36]:
tqdm.pandas()
dataframe["cleaned_text"] = dataframe["text"].progress_apply(lambda text: text_preprocessing(text))

100%|██████████| 44898/44898 [30:10<00:00, 24.79it/s] 


In [38]:
dataframe = dataframe[["cleaned_text", "label"]]

In [42]:
labels = dataframe["label"].unique().tolist()
id2label = {k: v for k, v in enumerate(labels)}
label2id = {v: k for k, v in enumerate(labels)}
dataframe["label"] = dataframe["label"].progress_apply(lambda value: label2id[value])

100%|██████████| 44898/44898 [00:00<00:00, 992818.68it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["label"] = dataframe["label"].progress_apply(lambda value: label2id[value])


## Modeling

In [43]:
import torch
from torch.utils.data import DataLoader

from transformers import AutoTokenizer
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [44]:
# Set Parameters
MODEL_ID = "bert-base-uncased"
EPOCH = 2
BATCH_SIZE = 20
LR = 1e-5

In [45]:
# Split dataset
train, test = train_test_split(
    dataframe, test_size=0.2, random_state=856,
    stratify=dataframe["label"]
)

# Pandas to format HF
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

In [46]:
# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

def processing_data(rows):
    encoded_input = tokenizer.batch_encode_plus(
        rows["cleaned_text"],
        padding = "max_length",
        truncation = True
    )

    encoded_input["labels"] = rows["label"]

    return encoded_input


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [47]:
train_encoded_dataset = train_dataset.map(
    processing_data, batched=True,
    remove_columns=train_dataset.column_names, batch_size=BATCH_SIZE
)

test_encoded_dataset = test_dataset.map(
    processing_data, batched = True,
    remove_columns=test_dataset.column_names
)

  0%|          | 0/1796 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

In [52]:
# Set format to Pytorch "cuda" or "cpu"
train_encoded_dataset.set_format(type="torch", device="cuda")
test_encoded_dataset.set_format(type="torch", device="cuda")

In [73]:
train_dataloader = DataLoader(train_encoded_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_encoded_dataset, batch_size=2)   # default BATCH_SIZE = 2

In [74]:
batch = next(iter(test_dataloader))
for k,v in batch.items():
    print(k, v.shape)

input_ids torch.Size([2, 512])
token_type_ids torch.Size([2, 512])
attention_mask torch.Size([2, 512])
labels torch.Size([2])


In [75]:
tokenizer.decode(batch['input_ids'][0].tolist())

'[CLS] munichpotsdam germany reuters people including hecklers blowing whistles showed munich one german chancellor angela merkel final speeches sunday national election expected sweep fourth term merkel whose conservatives solid doubledigit lead social democrats largely ignored jeers hundreds left rightwing demonstrators deliver stump speech focused stability security promise avoid tax increases get lost merkel must go shouted demonstrators curious foreign tourists munich famous oktoberfest snapped photographs german leader first elected merkel faced similar heckling many rallies especially former communist east admonished peaceful boisterous crowd whistling yelling certainly ensure future country merkel defended decision allow one million migrants humanitarian necessity said would prevent repeat migrant crisis fund programs atrisk countries keep people fleeing happened cannot repeated merkel said saying would protect europe borders berlin gendarmenmarkt merkel main rival spd leader m

## Compute Metrics

In [56]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoModelForSequenceClassification, AutoConfig

In [59]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [60]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID, id2label=id2label, label2id=label2id,
    num_labels=len(labels)
)
model.to(device)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [61]:
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "True",
    "1": "Fake"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "Fake": 1,
    "True": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

## Train

In [76]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

class ClassificationTrainer(Trainer):
    def get_train_dataloader(self):
        return train_dataloader

    def get_test_dataloader(self, test_dataset):
        return test_dataloader

In [77]:
# Setup training arguments
training_args = TrainingArguments(
    output_dir="test",
    max_steps=1000,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=1e-8,
    evaluation_strategy="steps",  # Evaluation
    eval_steps=100,               # Evaluation
#     logging_first_step=True,    # wandb
#     logging_steps=100,          # wandb
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

# Setup trainer
trainer = ClassificationTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)],
    train_dataset = train_encoded_dataset,
    eval_dataset = test_encoded_dataset
)

In [78]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,No log,0.022034,0.99833,0.998934,0.997871,0.998402
200,No log,0.022051,0.99833,0.998934,0.997871,0.998402
300,No log,0.022059,0.99833,0.998934,0.997871,0.998402
400,No log,0.022078,0.99833,0.998934,0.997871,0.998402
500,0.005900,0.0221,0.99833,0.998934,0.997871,0.998402
600,0.005900,0.022044,0.99833,0.998934,0.997871,0.998402
700,0.005900,0.022033,0.99833,0.998934,0.997871,0.998402
800,0.005900,0.022027,0.99833,0.998934,0.997871,0.998402




TrainOutput(global_step=800, training_loss=0.01344081163406372, metrics={'train_runtime': 2694.5438, 'train_samples_per_second': 1.484, 'train_steps_per_second': 0.371, 'total_flos': 4209776885760000.0, 'train_loss': 0.01344081163406372, 'epoch': 0.45})

In [80]:
model.eval()
text = dataframe["cleaned_text"][0]
encoded_input = tokenizer(text, return_tensors="pt", truncation = True, padding = "max_length")
encoded_input.to(device)
output = model(**encoded_input)
logits = output.logits

In [81]:
predicted_class_idx = torch.argmax(logits, dim=1).item()

In [82]:
id2label[predicted_class_idx]

'True'

In [84]:
dataframe["cleaned_text"][0]

'washington reuters head conservative republican faction us congress voted month huge expansion national debt pay tax cuts called fiscal conservative sunday urged budget restraint keeping sharp pivot way among republicans us representative mark meadows speaking cbs face nation drew hard line federal spending lawmakers bracing battle january return holidays wednesday lawmakers begin trying pass federal budget fight likely linked issues immigration policy even november congressional election campaigns approach republicans seek keep control congress president donald trump republicans want big budget increase military spending democrats also want proportional increases nondefense discretionary spending programs support education scientific research infrastructure public health environmental protection trump administration already willing say going increase nondefense discretionary spending percent meadows chairman small influential house freedom caucus said program democrats saying enough 

In [83]:
id2label

{0: 'True', 1: 'Fake'}

In [91]:
from transformers import pipeline

# setup pipeline as a text classification
fake_news_classifier = pipeline(
    task='text-classification',
    model=model,
    tokenizer=tokenizer,
    device=torch.cuda.current_device(),
)

In [89]:
true_df.loc[0, "text"]

'WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educat

In [93]:
text = """
WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S.
Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, 
called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January.
When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. 
"""
fake_news_classifier(text)

[{'label': 'True', 'score': 0.9999998807907104}]

In [94]:
fake_df.loc[0, "text"]

'Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president sends a New Year s greeting like this despicable, petty, infantile gibberish? Only Trump! His lack of decency won t ev

In [95]:
text = """
Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that.
Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.
The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, 
and even the very dishonest Fake News Media, a Happy and Healthy New Year. 
"""

fake_news_classifier(text)

[{'label': 'Fake', 'score': 0.9999998807907104}]