In [1]:
# Install Model maker
!pip install -q tflite-model-maker &> /dev/null

In [2]:
# Imports necessary libraries
import numpy as np
import pandas as pd
import os
import glob
import warnings
from tflite_model_maker import configs
from tflite_model_maker import ExportFormat
from tflite_model_maker import model_spec
from tflite_model_maker import text_classifier
from tflite_model_maker.text_classifier import DataLoader

# Import TensorFlow and fail fast unless the installed version string starts with '2'
import tensorflow as tf
assert tf.__version__.startswith('2')
# Only let TensorFlow's logger emit records at ERROR level or above
tf.get_logger().setLevel('ERROR')
# Kernel settings
# NOTE(review): this silences every Python warning for the rest of the session,
# including deprecation notices — confirm that is intended.
warnings.filterwarnings(action='ignore')
# Raise the pandas row-display limit to 25,000 rows
pd.set_option('display.max_rows', 25000)

#### **Import dataset**
Import the true and fake news datasets and read them from Excel (`.xlsx`) files using the Pandas library.

In [3]:
# Import the Colab Drive helper and mount Google Drive at /content/gdrive,
# so that files stored in Drive can be reached by path from this notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
# Change the working directory to the Drive folder that holds the downloaded data.
# Files written later with relative paths (e.g. the split CSVs) land in this folder.
%cd /content/gdrive/My Drive/run-am data

/content/gdrive/My Drive/run-am data


In [5]:
# Collect every .xlsx workbook in the data folder.
# glob returns paths in arbitrary (filesystem) order, so they are sorted to make
# the row order of the combined frame the same on every run.
path = '/content/gdrive/My Drive/run-am data'
excel_files = sorted(glob.glob(path + "/*.xlsx"))
# Read each workbook into its own DataFrame.
# A real list is built (not a generator): it can be inspected, and the
# concatenation cell can be re-run without failing on an exhausted generator.
df_list = [pd.read_excel(file) for file in excel_files]

In [6]:
# Concatenate all DataFrames read from the data folder into a single frame.
# ignore_index=True discards each file's own row index and renumbers the rows from 0.
big_df   = pd.concat(df_list, ignore_index=True)

In [7]:
# Show the (rows, columns) shape of the combined frame
big_df.shape

(26409, 4)

In [8]:
# Preview the first rows of the combined frame
big_df.head()

Unnamed: 0,News-Headline,News-Source,Date,Publisher
0,Appeal court sets aside judgement that voided ...,Unverified,2022-11-05 00:00:00,Linda Ikeji
1,Nnamdi Kanu to appear in court May 18 —Defence...,Unverified,2022-11-05 00:00:00,Linda Ikeji
2,The outrageous cost of party nomination form w...,Unverified,2022-11-05 00:00:00,Linda Ikeji
3,2023 Presidency should go to South East - Obas...,Unverified,2022-11-05 00:00:00,LindaIkeji
4,President Buhari rejects call for tenure exten...,Unverified,2022-10-05 00:00:00,Linda-Ikeji


In [9]:
# Summarise the columns: names, non-null counts and dtypes
big_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26409 entries, 0 to 26408
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   News-Headline  26409 non-null  object
 1   News-Source    26408 non-null  object
 2   Date           26409 non-null  object
 3   Publisher      26409 non-null  object
dtypes: object(4)
memory usage: 825.4+ KB


In [10]:
# Count every label value, including missing ones (dropna=False),
# to spot inconsistent spellings and unlabeled rows.
big_df["News-Source"].value_counts(dropna = False)

Verified      13424
Unverified    12980
unverified        4
NaN               1
Name: News-Source, dtype: int64

In [11]:
# Treating NA: pull out the rows that have no News-Source label
missing_label_mask = big_df['News-Source'].isna()
nan_value = big_df.loc[missing_label_mask]
nan_value

Unnamed: 0,News-Headline,News-Source,Date,Publisher
493,"2023: Buhari’s Minister, Pauline Tallen Declar...",,2022-08-05 00:00:00,franktalknow


In [12]:
# Normalise the label column: fold the lowercase spelling into "Unverified",
# then label rows with a missing value as "Unverified" as well.
source_labels = big_df["News-Source"]
source_labels = source_labels.replace("unverified", "Unverified")
source_labels = source_labels.replace(np.nan, "Unverified")
big_df["News-Source"] = source_labels

In [13]:
# Build the model input text as "<headline> -- <publisher>"
separator = " " + "--" + " "
big_df["Text"] = big_df["News-Headline"] + separator + big_df["Publisher"]

In [14]:
# Collect the rows whose Text value already appeared in an earlier row
is_repeat = big_df.duplicated(subset=['Text'])
duplicateRows = big_df.loc[is_repeat]

In [15]:
# Drop repeated Text values, keeping the first occurrence of each, so exactly
# one copy of every headline/publisher pair survives (not "all" copies removed).
# Reassigning instead of using inplace=True avoids mutating a frame that other
# variables may still reference.
big_df = big_df.drop_duplicates(subset="Text", keep="first")

In [16]:
from sklearn.utils import shuffle

# Purify: keep only the model input and the label, selected by name.
# The earlier positional form, iloc[:, [-1, 1]], depended on column order and
# returned the label column twice if this cell was run a second time.
big_df = big_df[["Text", "News-Source"]]

# Shuffle the rows with a fixed seed so the order is reproducible across runs,
# then renumber the rows from 0.
big_df = shuffle(big_df, random_state=42).reset_index(drop=True)

display(big_df)

Unnamed: 0,Text,News-Source
0,"Support Me To Bring Back Good Old Days, Anyim ...",Unverified
1,2023: Fayose allegedly sacks aide for voting A...,Unverified
2,We Are In Mood Of Winning 2023 Polls – APC -- ...,Unverified
3,2023 presidency: Labour Party reveals region P...,Verified
4,Why I Deliberately Wore Sneakers To NBA Confer...,Unverified
...,...,...
25814,Osun 2022: PDP will recover from defeat by APC...,Verified
25815,"A Stingy Man Can’t Be Nigeria’s President, Mba...",Unverified
25816,The police order an investigation into the att...,Unverified
25817,2023 Presidency: Chinese Govt Clears Air On Ba...,Unverified


In [17]:
# Split the data into train / validation / test (64% / 16% / 20% of the rows).
# A fixed seed makes the split, and the CSV files written from it, reproducible.
SPLIT_SEED = 42

# First hold out 20% of the rows as the test set ...
train_val_df = big_df.sample(frac = 0.8, random_state=SPLIT_SEED)
test_df = big_df.drop(train_val_df.index)

# ... then split the remaining 80% again 80/20 into train and validation.
train_df = train_val_df.sample(frac = 0.8, random_state=SPLIT_SEED)
val_df = train_val_df.drop(train_df.index)

# The three parts must together account for every row exactly once;
# this fails loudly if the index was not unique when rows were dropped by label.
assert len(train_df) + len(val_df) + len(test_df) == len(big_df)

# Reset Index
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print('trainset size:', train_df.shape)
print('valset size:', val_df.shape)
print('testset size:', test_df.shape)

trainset size: (16524, 2)
valset size: (4131, 2)
testset size: (5164, 2)


In [18]:
# Write each split to its own CSV file in the current working directory,
# leaving the row index out of the file.
for csv_name, split_df in (
    ('traindata.csv', train_df),
    ('valdata.csv', val_df),
    ('testdata.csv', test_df),
):
    split_df.to_csv(csv_name, index=False)

#### **Choose a model architecture**
Choose one of the model architectures and comment out the rest. Each architecture is different and will yield different results. The MobileBERT model takes more time to train because its architecture is quite complex. However, feel free to experiment with different architectures until you find the best result.

In [20]:
# Use a model spec from Model Maker. Options are 'mobilebert_classifier', 'bert_classifier' and 'average_word_vec'.
# The first two are the BERT models, which are accurate, but larger and slower to train.
# Average Word Vec is the lightweight option and is the one used here.
# NOTE(review): an earlier version of this comment described it as "kinda like
# transfer learning" with pre-trained word weights; the model summary further
# down lists every parameter as trainable, so confirm whether anything is
# actually pre-trained.
spec = model_spec.get('average_word_vec')
# Vocabulary size setting — presumably the number of distinct words kept; TODO confirm
spec.num_words = 2000
# Sequence length and embedding dimension; these match the (None, 20, 7)
# output shape of the embedding layer in the model summary.
spec.seq_len = 20
spec.wordvec_dim = 7

In [21]:
def _load_split(csv_name, for_training):
    """Build a Model Maker DataLoader from one of the split CSV files.

    The 'Text' column is used as the input and 'News-Source' as the label,
    both preprocessed according to the notebook-level `spec`.
    `for_training` is forwarded unchanged as `is_training`.
    """
    return DataLoader.from_csv(
          filename=csv_name,
          text_column='Text',
          label_column='News-Source',
          model_spec=spec,
          is_training=for_training)


# Load the three CSV files written earlier; the training split is loaded first
# and is the only one flagged as training data.
train_data = _load_split('traindata.csv', True)
test_data = _load_split('testdata.csv', False)
val_data = _load_split('valdata.csv', False)

In [22]:
# Build and train the model.
# Validation during training uses the validation split; the test split stays
# unseen until the evaluation cell, so the test score is not influenced by
# anything monitored while training.
model = text_classifier.create(train_data, model_spec=spec, epochs=50, validation_data=val_data)

Epoch 2/2
Epoch 3/3
Epoch 4/4
Epoch 5/5
Epoch 6/6
Epoch 7/7
Epoch 8/8
Epoch 9/9
Epoch 10/10
Epoch 11/11
Epoch 12/12
Epoch 13/13
Epoch 14/14
Epoch 15/15
Epoch 16/16
Epoch 17/17
Epoch 18/18
Epoch 19/19
Epoch 20/20
Epoch 21/21
Epoch 22/22
Epoch 23/23
Epoch 24/24
Epoch 25/25
Epoch 26/26
Epoch 27/27
Epoch 28/28
Epoch 29/29
Epoch 30/30
Epoch 31/31
Epoch 32/32
Epoch 33/33
Epoch 34/34
Epoch 35/35
Epoch 36/36
Epoch 37/37
Epoch 38/38
Epoch 39/39
Epoch 40/40
Epoch 41/41
Epoch 42/42
Epoch 43/43
Epoch 44/44
Epoch 45/45
Epoch 46/46
Epoch 47/47
Epoch 48/48
Epoch 49/49
Epoch 50/50


In [23]:
# Print the layer-by-layer architecture and parameter counts of the trained model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 7)             14021     
                                                                 
 global_average_pooling1d (G  (None, 7)                0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 7)                 56        
                                                                 
 dropout (Dropout)           (None, 7)                 0         
                                                                 
 dense_1 (Dense)             (None, 2)                 16        
                                                                 
Total params: 14,093
Trainable params: 14,093
Non-trainable params: 0
____________________________________________________

In [24]:
# Evaluate the trained model on the test split; the result is unpacked as (loss, acc)
loss, acc = model.evaluate(test_data)



In [25]:
# Evaluate the trained model on the validation split.
# NOTE(review): this rebinds `loss` and `acc`, replacing whatever an earlier
# evaluate cell stored in them — copy the values first if both are needed.
loss, acc = model.evaluate(val_data)



In [26]:
# Export the label list, the vocabulary and the model itself (SavedModel
# format) into /mm_runam_savedmodel/.
export_formats = [ExportFormat.LABEL, ExportFormat.VOCAB, ExportFormat.SAVED_MODEL]
model.export(export_dir='/mm_runam_savedmodel/', export_format=export_formats)

# To find the exported files in Colab, open the 'folder' tab to the left of
# this code window and navigate 'up' a directory to the root directory
# listing -- /mm_runam_savedmodel/ should appear there.

In [27]:
# Rename the SavedModel subfolder to a version number
# NOTE(review): if /mm_runam_savedmodel/123 already exists (e.g. this cell is
# run twice after a re-export), mv will move saved_model *inside* 123 instead
# of renaming it — clear the folder first when re-running.
!mv /mm_runam_savedmodel/saved_model /mm_runam_savedmodel/123
# Zip the whole export folder; the archive is written to the current working directory
!zip -r mm_runam_savedmodel.zip /mm_runam_savedmodel/ 

  adding: mm_runam_savedmodel/ (stored 0%)
  adding: mm_runam_savedmodel/labels.txt (deflated 21%)
  adding: mm_runam_savedmodel/123/ (stored 0%)
  adding: mm_runam_savedmodel/123/keras_metadata.pb (deflated 86%)
  adding: mm_runam_savedmodel/123/variables/ (stored 0%)
  adding: mm_runam_savedmodel/123/variables/variables.data-00000-of-00001 (deflated 10%)
  adding: mm_runam_savedmodel/123/variables/variables.index (deflated 59%)
  adding: mm_runam_savedmodel/123/saved_model.pb (deflated 87%)
  adding: mm_runam_savedmodel/123/assets/ (stored 0%)
  adding: mm_runam_savedmodel/vocab.txt (deflated 48%)


In [None]:
# Optional extra
# Export the learned word embeddings for projector.tensorflow.org,
# where you can explore the embeddings that were learned for this dataset.
import io

# The first layer of the underlying Keras model is the embedding layer (see the
# model summary); its first weight array is indexed below by vocabulary index.
embeddings = model.model.layers[0]
weights = embeddings.get_weights()[0]
# Mapping of word -> vocabulary index, used to look up each word's row in `weights`
tokenizer = model.model_spec.vocab

# vecs.tsv holds one tab-separated embedding vector per line;
# meta.tsv holds the matching word on the same line number.
# `with` closes both files even if a lookup or write raises part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word in tokenizer:
    value = tokenizer[word]
    # Separate name for the row vector, so the `embeddings` layer is not shadowed
    word_vector = weights[value]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in word_vector]) + "\n")


# Offer the two files for download when running inside Colab; outside Colab the
# import fails and the files simply stay on disk.
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>