In [None]:
# allow access to folder of data
# Mounts Google Drive at /content/drive; the CSV inputs and the model output
# files used further down all live under "/content/drive/My Drive/...".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Interactive Application Default Credentials login (prompts for a
# verification code and stores the credentials on the runtime).
# NOTE(review): no later cell visible in this notebook uses these
# credentials -- confirm whether this step is still required.
!gcloud auth application-default login

Go to the following link in your browser:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&state=ERitUQQh4J1sRxnxEHwabWD9T25Qo3&prompt=consent&access_type=offline&code_challenge=wttbZ27L5O-DPFcpod1zSTZ0STv5HLJgH1Ak_zc2kIA&code_challenge_method=S256

Enter verification code: 4/1AY0e-g4DWaDpJRDtgR5sby_ooyz4A51uTI0W8N_YGvb5u5yAw9IYbDzxKPY

Credentials saved to file: [/content/.config/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).
Cannot find a quota project to add to ADC. You might receive a "quota exceeded" or "API not enabled" error. Run $ gcloud auth application-defaul

In [None]:
# Fetch the BERT tokenization helper that is imported below as `tokenization`.
# NOTE(review): this URL tracks the `master` branch, so the file can change or
# move between runs -- consider pinning it to a release tag or commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
# %pip (rather than !pip) installs into the environment of the running kernel.
# The version is pinned to the one this notebook was last executed with; bump
# it if the runtime's Python no longer has a wheel for this release.
%pip install -q sentencepiece==0.1.94

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 12.3MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.94


In [None]:
import pandas as pd 
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
import tokenization

In [None]:
# helper function for encoding input texts
def bert_encode(texts, tokenizer, max_len=512):
    """Encode raw strings into the three fixed-width integer inputs of BERT.

    Each text is tokenized, truncated so the two special tokens still fit,
    wrapped as ``[CLS] ... [SEP]`` and right-padded with id 0 up to
    ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` (returns a list of
            tokens) and ``convert_tokens_to_ids(tokens)`` (returns a list of
            integer ids).
        max_len: Width of every encoded row, including ``[CLS]`` and
            ``[SEP]``. Must be at least 2.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of integer numpy arrays,
        each of shape ``(number_of_texts, max_len)``. The mask is 1 over real
        tokens and 0 over padding; segment ids are all 0.

    Raises:
        ValueError: If ``max_len`` is smaller than 2, because the row could
            not hold ``[CLS]`` and ``[SEP]``.
    """
    if max_len < 2:
        raise ValueError(
            "max_len must be at least 2 to hold [CLS] and [SEP], got %r" % (max_len,)
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the special tokens added below.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        # Copy before padding so a list owned by the tokenizer is never mutated.
        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))
        token_ids += [0] * pad_len

        all_tokens.append(token_ids)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # The reshape is a no-op for non-empty input; for empty input it yields
        # shape (0, max_len) instead of a 1-D array of shape (0,).
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

# helper function for making the NN
def build_model(bert_layer, max_len=512, learning_rate=2e-6):
    """Build and compile a binary classifier on top of a BERT Keras layer.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is the per-token sequence output.
        max_len: Sequence length of the three integer inputs; must match the
            ``max_len`` used when encoding the texts.
        learning_rate: Step size passed to the Adam optimizer.

    Returns:
        A compiled ``Model`` with three int32 inputs of shape ``(max_len,)``
        and a single sigmoid output, using binary cross-entropy loss and an
        accuracy metric.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The layer returns a pair; only the per-token sequence output is used.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Keep the vector at position 0, where bert_encode places the [CLS] token.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` is the supported keyword; the old `lr` alias is deprecated
    # and rejected by newer Keras releases.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:

# import the pre-trained BERT model
# (uncased English, L-24 / H-1024 / A-16 per the module URL). trainable=True
# means its weights are updated during model.fit together with the Dense head.
bert_module = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(bert_module, trainable=True)
# The vocabulary file and lower-casing flag are read from the hub module
# itself, so the tokenizer is built from the same assets as the layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Load the labelled statements (with sentiment score columns) from the mounted Drive.
# NOTE(review): the displayed frame carries an "Unnamed: 0" column, which looks
# like an index that was written into the CSV -- confirm and consider dropping it.
df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/data/fake_news/alldata_with_sentiment.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,statement,text,label,positive,negative,neutral,mixed
0,0,covid started because we eat animals,vegan instagram users are pinning the coronavi...,fake,0.010846,0.088485,0.899144,0.001525
1,1,says michelle obama has people on her staff na...,glenn beck rekindled a falsehood about the siz...,fake,0.010427,0.332251,0.498293,0.15903
2,2,says president donald trump has signed more la...,vice president mike pence says that when it co...,real,0.011365,0.204091,0.781942,0.002602
3,3,us representatives promise implement of un gu...,a conservative website falsely claimed that u ...,fake,0.007493,0.347418,0.56576,0.079329
4,4,the federal government borrows billion every ...,hundreds of rhode islanders got phone calls la...,real,0.017309,0.313422,0.630314,0.038955


In [None]:
# Hold out the last 2000 rows of df.
# NOTE(review): df_test is not referenced again in the visible cells; the
# evaluation further down reads fnn_test_trim.csv instead -- confirm intent.
df_test = df[-2000:]
df_test

Unnamed: 0.1,Unnamed: 0,statement,text,label,positive,negative,neutral,mixed
15487,15487,says mitt romney belittled middle class tax cu...,as the president pressured congress to extend ...,fake,0.068606,0.183783,0.330552,0.417059
15488,15488,it not true that since he been the president e...,in an interview on abc this week host christia...,fake,0.003054,0.135043,0.861756,0.000148
15489,15489,the governor does not have any power to veto ...,as the debate over same sex marriage heats up ...,real,0.007676,0.041052,0.950841,0.000431
15490,15490,nearly wisconsinites have lost their job sinc...,in september gov scott walker responded to dis...,fake,0.004812,0.260877,0.733812,0.000499
15491,15491,during the reagan era while productivity incre...,in his new film capitalism a love story michae...,real,0.059160,0.028125,0.907046,0.005669
...,...,...,...,...,...,...,...,...
17482,17482,historically senate ratification of arms cont...,as the house and senate move into a brief lame...,real,0.006171,0.045163,0.934267,0.014399
17483,17483,since the affordable care act passed percent ...,policymakers and pundits are spending a lot of...,real,0.005834,0.292535,0.688702,0.012930
17484,17484,medicare spends billion a year on subsidies to...,in the final presidential debate oct moderator...,real,0.006582,0.557617,0.433883,0.001918
17485,17485,the obama administration is allowing state wai...,former president bill clinton used his elder s...,real,0.008514,0.020729,0.967239,0.003517


In [None]:
# Everything except the last 2000 rows forms the training pool.
df_train = df[:-2000]
df_train

Unnamed: 0.1,Unnamed: 0,statement,text,label,positive,negative,neutral,mixed
0,0,covid started because we eat animals,vegan instagram users are pinning the coronavi...,fake,0.010846,0.088485,0.899144,0.001525
1,1,says michelle obama has people on her staff na...,glenn beck rekindled a falsehood about the siz...,fake,0.010427,0.332251,0.498293,0.159030
2,2,says president donald trump has signed more la...,vice president mike pence says that when it co...,real,0.011365,0.204091,0.781942,0.002602
3,3,us representatives promise implement of un gu...,a conservative website falsely claimed that u ...,fake,0.007493,0.347418,0.565760,0.079329
4,4,the federal government borrows billion every ...,hundreds of rhode islanders got phone calls la...,real,0.017309,0.313422,0.630314,0.038955
...,...,...,...,...,...,...,...,...
15482,15482,eleven states complete their legislative sess...,when it comes to legislative sessions kenneth ...,real,0.016221,0.126472,0.849048,0.008258
15483,15483,says oregon state employees received a catch u...,the public employees retirement system is the ...,fake,0.008677,0.121374,0.868084,0.001865
15484,15484,californias prisons budgetin was about percent...,democratic gov jerry brown reeled off statisti...,real,0.004268,0.281492,0.711625,0.002615
15485,15485,state lawmakers are spending taxpayer money fo...,georgia taxpayers are already on the hook for ...,real,0.013445,0.129025,0.705473,0.152057


In [None]:
# Split the training pool by label and print each subset's shape to compare
# the class sizes before balancing.
df_fake = df_train[df_train['label'] == 'fake']
df_real = df_train[df_train['label'] == 'real']
print(df_fake.shape)
print(df_real.shape)

(7740, 8)
(7747, 8)


In [None]:
# Balance the classes: downsample the "real" rows to the number of "fake" rows
# (fixed seed so the subset is reproducible), then stack the two classes.
# Assumes df_real has at least as many rows as df_fake; sample() raises otherwise.
df_real = df_real.sample(n=len(df_fake), random_state=42)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_balanced = pd.concat([df_real, df_fake])
print(df_balanced.shape)

(15480, 8)


In [None]:
# Peek at the balanced frame; the "real" rows come first here because the two
# classes were stacked and have not been shuffled yet.
df_balanced.head()

Unnamed: 0.1,Unnamed: 0,statement,text,label,positive,negative,neutral,mixed
3422,3422,george allen voted to end pell grants for stud...,democratic senate candidate tim kaine says tha...,real,0.002265,0.007057,0.990371,0.000308
2299,2299,passage of clinton budget bill in led to an en...,on the april edition of abc this week interim ...,real,0.016258,0.027147,0.956127,0.000468
7139,7139,today we have more boots on the ground near t...,president barack obama gave a lengthy speech o...,real,0.013704,0.09227,0.436795,0.457231
10750,10750,a usda study found that percent of chicken ca...,before you finish eating that piece of chicken...,real,0.007784,0.462476,0.526185,0.003555
270,270,saysdonald trump called pregnant employees an ...,democratic presidential candidate hillary clin...,real,0.005912,0.297114,0.696331,0.000644


In [None]:
# shuffle data
# frac=1 draws every row exactly once in random order (seeded for
# reproducibility); reset_index(drop=True) replaces the old row labels with 0..n-1.
df_balanced = df_balanced.sample(frac=1, random_state = 42).reset_index(drop=True)

In [None]:
# make sure data is shuffled
# (the label column should now show both classes interleaved)
df_balanced.head()

Unnamed: 0.1,Unnamed: 0,statement,text,label,positive,negative,neutral,mixed
0,10007,polls show americans are overwhelmingly oppos...,rick santorum in an op ed for u s news and wor...,real,0.002503,0.315966,0.680828,0.000703
1,6205,says a video shows drone footage of the killin...,not long after a u s drone strike killed irani...,fake,0.00506,0.322271,0.63915,0.03352
2,14958,says paul walker was set to come forward to ex...,a recent facebook post pushes a persistent con...,fake,0.031657,0.316563,0.634967,0.016814
3,13950,ed gillespies book promoted an individual heal...,republican senate candidate ed gillespie rarel...,real,0.002494,0.125325,0.865095,0.007086
4,4402,eighty seven percent of private insurance pla...,the use of federal dollars for most abortions ...,real,0.01029,0.164166,0.757481,0.068063


In [None]:
# Encode the string labels as integers for the sigmoid / binary cross-entropy
# head: real -> 0, fake -> 1.
df_balanced.label = df_balanced.label.replace({'real': 0, 'fake': 1})
df_balanced.head()

Unnamed: 0.1,Unnamed: 0,statement,text,label,positive,negative,neutral,mixed
0,10007,polls show americans are overwhelmingly oppos...,rick santorum in an op ed for u s news and wor...,0,0.002503,0.315966,0.680828,0.000703
1,6205,says a video shows drone footage of the killin...,not long after a u s drone strike killed irani...,1,0.00506,0.322271,0.63915,0.03352
2,14958,says paul walker was set to come forward to ex...,a recent facebook post pushes a persistent con...,1,0.031657,0.316563,0.634967,0.016814
3,13950,ed gillespies book promoted an individual heal...,republican senate candidate ed gillespie rarel...,0,0.002494,0.125325,0.865095,0.007086
4,4402,eighty seven percent of private insurance pla...,the use of federal dollars for most abortions ...,0,0.01029,0.164166,0.757481,0.068063


In [None]:
# Encode the statements into the (token ids, masks, segment ids) tuple using
# bert_encode's default max_len of 512.
train_input = bert_encode(df_balanced.statement.values, tokenizer)

In [None]:
# Targets are the 0/1 labels; the model is built with build_model's default
# max_len, matching the default used for train_input.
train_labels = df_balanced.label.values
model = build_model(bert_layer)

In [None]:
# Fine-tune for 2 epochs with 20% of the samples held out for validation.
# Keras takes the validation slice from the end of the arrays, which is why the
# frame was shuffled beforehand.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=2,
    batch_size=5
)

In [None]:
# Persist the trained model to Drive in two parts: the architecture as JSON
# and the weights as a separate HDF5 file.
model_json = model.to_json()
with open("/content/drive/My Drive/Colab Notebooks/data/fake_news/models/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("/content/drive/My Drive/Colab Notebooks/data/fake_news/models/model.h5")

In [None]:
# Load the separate evaluation set; its label column is `label_fnn` (not
# `label` as in the training CSV).
test = pd.read_csv("/content/drive/My Drive/Colab Notebooks/data/fake_news/fnn_test_trim.csv")
print(test.shape)
test.head()

(1054, 4)


Unnamed: 0.1,Unnamed: 0,statement,fullText_based_content,label_fnn
0,0,"""President Obama himself attempted to filibust...",U.S. Supreme Court Justice John Paul Stevens a...,real
1,1,"In Hawaii, ""they don't have a history of throw...","On ABC's This Week, the chairmen of the Republ...",real
2,2,"""Our national debt ... is on track to exceed t...",Ever since Barack Obama became president and b...,real
3,3,"""Health insurance companies' costs are only 4 ...",As the battle over health care reform approach...,real
4,4,"""We can prevent terror suspects from boarding ...",In the wake of a foiled car bomb attempt in Ti...,real


In [None]:
# First 100 rows of the evaluation set, used for a quick trial prediction.
forfun = test[:100]
forfun.head()

Unnamed: 0.1,Unnamed: 0,statement,fullText_based_content,label_fnn
0,0,"""President Obama himself attempted to filibust...",U.S. Supreme Court Justice John Paul Stevens a...,real
1,1,"In Hawaii, ""they don't have a history of throw...","On ABC's This Week, the chairmen of the Republ...",real
2,2,"""Our national debt ... is on track to exceed t...",Ever since Barack Obama became president and b...,real
3,3,"""Health insurance companies' costs are only 4 ...",As the battle over health care reform approach...,real
4,4,"""We can prevent terror suspects from boarding ...",In the wake of a foiled car bomb attempt in Ti...,real


In [None]:
# Trial run on the 100-row subset.
# NOTE(review): `preds` is not referenced by any later visible cell.
test_input = bert_encode(forfun.statement.values, tokenizer)
preds = model.predict(test_input)

In [None]:
# Encode and score every statement in the evaluation set; with the Dense(1)
# sigmoid head, big_test_preds holds one value per row.
big_test = bert_encode(test.statement.values, tokenizer)
big_test_preds = model.predict(big_test)

In [None]:
# Wrap the raw predictions in a DataFrame.
# NOTE(review): `predictions` is not referenced by any later visible cell.
predictions = pd.DataFrame(data=big_test_preds)

In [None]:
# Attach the model output as the last column. Plain assignment (instead of
# test.insert) keeps this cell re-runnable: insert raises once the column
# already exists. ravel() flattens the (n, 1) prediction array to one value
# per row. `test` has 4 columns at this point, so the new column still lands
# at position 4.
test["prediction"] = big_test_preds.ravel()

In [None]:
# Inspect the evaluation frame with the raw sigmoid outputs attached.
test

Unnamed: 0.1,Unnamed: 0,statement,fullText_based_content,label_fnn,prediction
0,0,"""President Obama himself attempted to filibust...",U.S. Supreme Court Justice John Paul Stevens a...,real,0.283671
1,1,"In Hawaii, ""they don't have a history of throw...","On ABC's This Week, the chairmen of the Republ...",real,0.036096
2,2,"""Our national debt ... is on track to exceed t...",Ever since Barack Obama became president and b...,real,0.346252
3,3,"""Health insurance companies' costs are only 4 ...",As the battle over health care reform approach...,real,0.055817
4,4,"""We can prevent terror suspects from boarding ...",In the wake of a foiled car bomb attempt in Ti...,real,0.071138
...,...,...,...,...,...
1049,1049,"""Gun violence is by far the leading cause of d...","In a speech posted on Medium , Democratic pres...",real,0.274552
1050,1050,"The economy is ""creating jobs at the fastest p...",President Barack Obama’s State of the Union ad...,real,0.083449
1051,1051,"Like Marco Rubio, Sen. Barack Obama had ""one o...",A commercial by the super PAC Right to Rise US...,real,0.009752
1052,1052,"""Bernie Sanders passed more roll call amendmen...","Bernie Sanders is often criticized for "" pie -...",real,0.025411


In [None]:
# Threshold the sigmoid output at 0.5: above -> 1 (fake), otherwise 0 (real).
# NOTE: this overwrites the probability column in place, so the raw scores are
# no longer available in `test` afterwards.
test['prediction'] = (test['prediction'] > 0.5).astype(int)

In [None]:
# Inspect the evaluation frame after thresholding to 0/1 predictions.
test

Unnamed: 0.1,Unnamed: 0,statement,fullText_based_content,label_fnn,prediction
0,0,"""President Obama himself attempted to filibust...",U.S. Supreme Court Justice John Paul Stevens a...,real,0
1,1,"In Hawaii, ""they don't have a history of throw...","On ABC's This Week, the chairmen of the Republ...",real,0
2,2,"""Our national debt ... is on track to exceed t...",Ever since Barack Obama became president and b...,real,0
3,3,"""Health insurance companies' costs are only 4 ...",As the battle over health care reform approach...,real,0
4,4,"""We can prevent terror suspects from boarding ...",In the wake of a foiled car bomb attempt in Ti...,real,0
...,...,...,...,...,...
1049,1049,"""Gun violence is by far the leading cause of d...","In a speech posted on Medium , Democratic pres...",real,0
1050,1050,"The economy is ""creating jobs at the fastest p...",President Barack Obama’s State of the Union ad...,real,0
1051,1051,"Like Marco Rubio, Sen. Barack Obama had ""one o...",A commercial by the super PAC Right to Rise US...,real,0
1052,1052,"""Bernie Sanders passed more roll call amendmen...","Bernie Sanders is often criticized for "" pie -...",real,0


In [None]:
# Encode the ground-truth labels with the same mapping used for training
# (real -> 0, fake -> 1) so they are comparable with the prediction column.
test['label_fnn'] = test.label_fnn.replace({'real': 0, 'fake': 1})

In [None]:
# Inspect the evaluation frame with integer labels next to integer predictions.
test

Unnamed: 0.1,Unnamed: 0,statement,fullText_based_content,label_fnn,prediction
0,0,"""President Obama himself attempted to filibust...",U.S. Supreme Court Justice John Paul Stevens a...,0,0
1,1,"In Hawaii, ""they don't have a history of throw...","On ABC's This Week, the chairmen of the Republ...",0,0
2,2,"""Our national debt ... is on track to exceed t...",Ever since Barack Obama became president and b...,0,0
3,3,"""Health insurance companies' costs are only 4 ...",As the battle over health care reform approach...,0,0
4,4,"""We can prevent terror suspects from boarding ...",In the wake of a foiled car bomb attempt in Ti...,0,0
...,...,...,...,...,...
1049,1049,"""Gun violence is by far the leading cause of d...","In a speech posted on Medium , Democratic pres...",0,0
1050,1050,"The economy is ""creating jobs at the fastest p...",President Barack Obama’s State of the Union ad...,0,0
1051,1051,"Like Marco Rubio, Sen. Barack Obama had ""one o...",A commercial by the super PAC Right to Rise US...,0,0
1052,1052,"""Bernie Sanders passed more roll call amendmen...","Bernie Sanders is often criticized for "" pie -...",0,0
